From 055cad14ad40b5039d800abe7bd7282b784e6fce Mon Sep 17 00:00:00 2001 From: kerstink-GC Date: Thu, 31 Aug 2023 15:56:08 +0000 Subject: [PATCH 01/58] fix pyproject.toml to graphium.cli.train_finetune_test --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f24f61c82..78a5869da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,7 +64,7 @@ dependencies = [ [project.scripts] graphium = "graphium.cli.main:app" -graphium-train = "graphium.cli.train_finetune:cli" +graphium-train = "graphium.cli.train_finetune_test:cli" [project.urls] Website = "https://graphium.datamol.io/" From aa7e79395db076274b01059e747d5b45cbc5005e Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Mon, 4 Sep 2023 14:48:08 +0000 Subject: [PATCH 02/58] Passing pipleine / accelerator args to model --- .../accelerator/ipu_pipeline.yaml | 22 +++++++++++++++++++ graphium/config/_loader.py | 3 +++ 2 files changed, 25 insertions(+) create mode 100644 expts/hydra-configs/accelerator/ipu_pipeline.yaml diff --git a/expts/hydra-configs/accelerator/ipu_pipeline.yaml b/expts/hydra-configs/accelerator/ipu_pipeline.yaml new file mode 100644 index 000000000..996218646 --- /dev/null +++ b/expts/hydra-configs/accelerator/ipu_pipeline.yaml @@ -0,0 +1,22 @@ +type: ipu +ipu_config: + - deviceIterations(60) # IPU would require large batches to be ready for the model. + # 60 for PCQM4mv2 + # 30 for largemix + - replicationFactor(4) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + +ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(60) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + +accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [4, 4, 4, 4] \ No newline at end of file diff --git a/graphium/config/_loader.py b/graphium/config/_loader.py index 3235c9b68..85e94b197 100644 --- a/graphium/config/_loader.py +++ b/graphium/config/_loader.py @@ -251,6 +251,8 @@ def load_architecture( # Set the parameters for the full network task_heads_kwargs = omegaconf.OmegaConf.to_object(task_heads_kwargs) + # Get accelerator_kwargs if they exist + accelerator_kwargs = config["accelerator"].get("accelerator_kwargs", None) # Set all the input arguments for the model model_kwargs = dict( gnn_kwargs=gnn_kwargs, @@ -259,6 +261,7 @@ def load_architecture( pe_encoders_kwargs=pe_encoders_kwargs, graph_output_nn_kwargs=graph_output_nn_kwargs, task_heads_kwargs=task_heads_kwargs, + accelerator_kwargs=accelerator_kwargs ) if model_class is FullGraphFinetuningNetwork: From 341a6563ca061e5866960e93858e26c1774677a5 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Tue, 5 Sep 2023 10:00:35 +0000 Subject: [PATCH 03/58] Reworking the model kwargs --- graphium/config/_loader.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/graphium/config/_loader.py b/graphium/config/_loader.py index 85e94b197..c9d3d30dd 100644 --- a/graphium/config/_loader.py +++ b/graphium/config/_loader.py @@ -251,8 +251,6 @@ def load_architecture( # Set the parameters for the full network task_heads_kwargs = omegaconf.OmegaConf.to_object(task_heads_kwargs) - # Get accelerator_kwargs if they exist - accelerator_kwargs = 
config["accelerator"].get("accelerator_kwargs", None) # Set all the input arguments for the model model_kwargs = dict( gnn_kwargs=gnn_kwargs, @@ -260,9 +258,12 @@ def load_architecture( pre_nn_edges_kwargs=pre_nn_edges_kwargs, pe_encoders_kwargs=pe_encoders_kwargs, graph_output_nn_kwargs=graph_output_nn_kwargs, - task_heads_kwargs=task_heads_kwargs, - accelerator_kwargs=accelerator_kwargs + task_heads_kwargs=task_heads_kwargs ) + # Get accelerator_kwargs if they exist + accelerator_kwargs = config["accelerator"].get("accelerator_kwargs", None) + if accelerator_kwargs is not None: + model_kwargs["accelerator_kwargs"] = accelerator_kwargs if model_class is FullGraphFinetuningNetwork: finetuning_head_kwargs = config["finetuning"].pop("finetuning_head", None) From bf5379db1c8360825e71fd392b1649d56af3c733 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Tue, 5 Sep 2023 10:01:56 +0000 Subject: [PATCH 04/58] linting loader --- graphium/config/_loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphium/config/_loader.py b/graphium/config/_loader.py index c9d3d30dd..da55a9266 100644 --- a/graphium/config/_loader.py +++ b/graphium/config/_loader.py @@ -258,7 +258,7 @@ def load_architecture( pre_nn_edges_kwargs=pre_nn_edges_kwargs, pe_encoders_kwargs=pe_encoders_kwargs, graph_output_nn_kwargs=graph_output_nn_kwargs, - task_heads_kwargs=task_heads_kwargs + task_heads_kwargs=task_heads_kwargs, ) # Get accelerator_kwargs if they exist accelerator_kwargs = config["accelerator"].get("accelerator_kwargs", None) From 4db06f0fb6819deafd9af227bd06b1925a7af7b5 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Tue, 5 Sep 2023 10:22:21 +0000 Subject: [PATCH 05/58] Zinc config update for test including accelerator --- graphium/config/zinc_default_multitask_pyg.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/graphium/config/zinc_default_multitask_pyg.yaml b/graphium/config/zinc_default_multitask_pyg.yaml index 07ae4bf9b..b9435ec7e 100644 --- a/graphium/config/zinc_default_multitask_pyg.yaml +++ b/graphium/config/zinc_default_multitask_pyg.yaml @@ -181,3 +181,5 @@ architecture: # The parameters for the full graph network are taken from `co dropout: 0.2 normalization: none residual_type: none +accelerator: + type: cpu \ No newline at end of file From 20434b62669313745b02af7db18f61e07aa875b2 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Tue, 5 Sep 2023 15:46:56 +0000 Subject: [PATCH 06/58] Fix to allow use edges for MPNN layer --- expts/hydra-configs/model/mpnn.yaml | 2 ++ .../nn/architectures/global_architectures.py | 27 ++++++++++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/expts/hydra-configs/model/mpnn.yaml b/expts/hydra-configs/model/mpnn.yaml index dce40c932..d37eecd36 100644 --- a/expts/hydra-configs/model/mpnn.yaml +++ b/expts/hydra-configs/model/mpnn.yaml @@ -22,3 +22,5 @@ architecture: attn_type: "none" # "full-attention", "none" # biased_attention: false attn_kwargs: null + virtual_node: 'sum' + use_virtual_edges: true diff --git a/graphium/nn/architectures/global_architectures.py b/graphium/nn/architectures/global_architectures.py index 903803642..69b235c1d 100644 --- a/graphium/nn/architectures/global_architectures.py +++ b/graphium/nn/architectures/global_architectures.py @@ -12,6 +12,7 @@ from torch import Tensor, nn import torch from torch_geometric.data import Data +from omegaconf import DictConfig, OmegaConf # graphium imports from graphium.data.utils import get_keys @@ -592,6 +593,26 @@ def 
_check_bad_arguments(self): (self.in_dim_edges > 0) or (self.full_dims_edges is not None) ) and not self.layer_class.layer_supports_edges: raise ValueError(f"Cannot use edge features with class `{self.layer_class}`") + + def get_nested_key(self, d, target_key): + """ + Get the value associated with a key in a nested dictionary. + + Parameters: + - d: The dictionary to search in + - target_key: The key to search for + + Returns: + - The value associated with the key if found, None otherwise + """ + if target_key in d: + return d[target_key] + for key, value in d.items(): + if isinstance(value, (dict, DictConfig)): + nested_result = self.get_nested_key(value, target_key) + if nested_result is not None: + return nested_result + return None def _create_layers(self): r""" @@ -632,6 +653,7 @@ def _create_layers(self): # Find the edge key-word arguments depending on the layer type and residual connection this_edge_kwargs = {} + # import ipdb; ipdb.set_trace() if self.layer_class.layer_supports_edges and self.in_dim_edges > 0: this_edge_kwargs["in_dim_edges"] = this_in_dim_edges if "out_dim_edges" in inspect.signature(self.layer_class.__init__).parameters.keys(): @@ -639,8 +661,10 @@ def _create_layers(self): this_out_dim_edges = self.full_dims_edges[ii + 1] this_edge_kwargs["out_dim_edges"] = this_out_dim_edges else: - this_out_dim_edges = self.layer_kwargs.get("out_dim_edges") + this_out_dim_edges = self.get_nested_key(self.layer_kwargs, "out_dim_edges") + this_edge_kwargs["out_dim_edges"] = this_out_dim_edges layer_out_dims_edges.append(this_out_dim_edges) + # import ipdb; ipdb.set_trace() # Create the GNN layer self.layers.append( @@ -659,6 +683,7 @@ def _create_layers(self): # Create the Virtual Node layer, except at the last layer if ii < len(residual_out_dims): + # import ipdb; ipdb.set_trace() self.virtual_node_layers.append( self.virtual_node_class( in_dim=this_out_dim * self.layers[-1].out_dim_factor, From daf011c480dc8e4b7165cc5018b8d876cb780144 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Tue, 5 Sep 2023 15:52:51 +0000 Subject: [PATCH 07/58] linting + remove debug statements --- graphium/nn/architectures/global_architectures.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/graphium/nn/architectures/global_architectures.py b/graphium/nn/architectures/global_architectures.py index 69b235c1d..a90a2cba9 100644 --- a/graphium/nn/architectures/global_architectures.py +++ b/graphium/nn/architectures/global_architectures.py @@ -593,15 +593,15 @@ def _check_bad_arguments(self): (self.in_dim_edges > 0) or (self.full_dims_edges is not None) ) and not self.layer_class.layer_supports_edges: raise ValueError(f"Cannot use edge features with class `{self.layer_class}`") - + def get_nested_key(self, d, target_key): """ Get the value associated with a key in a nested dictionary. 
- + Parameters: - d: The dictionary to search in - target_key: The key to search for - + Returns: - The value associated with the key if found, None otherwise """ @@ -653,7 +653,6 @@ def _create_layers(self): # Find the edge key-word arguments depending on the layer type and residual connection this_edge_kwargs = {} - # import ipdb; ipdb.set_trace() if self.layer_class.layer_supports_edges and self.in_dim_edges > 0: this_edge_kwargs["in_dim_edges"] = this_in_dim_edges if "out_dim_edges" in inspect.signature(self.layer_class.__init__).parameters.keys(): @@ -664,7 +663,6 @@ def _create_layers(self): this_out_dim_edges = self.get_nested_key(self.layer_kwargs, "out_dim_edges") this_edge_kwargs["out_dim_edges"] = this_out_dim_edges layer_out_dims_edges.append(this_out_dim_edges) - # import ipdb; ipdb.set_trace() # Create the GNN layer self.layers.append( @@ -683,7 +681,6 @@ def _create_layers(self): # Create the Virtual Node layer, except at the last layer if ii < len(residual_out_dims): - # import ipdb; ipdb.set_trace() self.virtual_node_layers.append( self.virtual_node_class( in_dim=this_out_dim * self.layers[-1].out_dim_factor, From 4a9893fed93b5b785a200dd2421098c1f6e32816 Mon Sep 17 00:00:00 2001 From: DomInvivo Date: Thu, 7 Sep 2023 17:04:50 -0400 Subject: [PATCH 08/58] Remove the gpu logging, it's handled automatically by lightning, and it was not right --- graphium/trainer/predictor_summaries.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/graphium/trainer/predictor_summaries.py b/graphium/trainer/predictor_summaries.py index d62e50a42..8ce863e74 100644 --- a/graphium/trainer/predictor_summaries.py +++ b/graphium/trainer/predictor_summaries.py @@ -248,8 +248,6 @@ def get_metrics_logs(self) -> Dict[str, Any]: metric_logs[self.metric_log_name(self.task_name, "median_target", self.step_name)] = nan_median( targets ) - if torch.cuda.is_available(): - metric_logs[f"gpu_allocated_GB"] = torch.tensor(torch.cuda.memory_allocated() / (2**30)) # Specify which metrics to use metrics_to_use = self.metrics From c1dea0de60b1049a4a56649518372490fad0a6e8 Mon Sep 17 00:00:00 2001 From: kerstink-GC Date: Fri, 8 Sep 2023 12:23:14 +0000 Subject: [PATCH 09/58] added iclr configs --- .../.config_ogbpcq_mpnn_GPS++.yaml.swp | Bin 0 -> 12288 bytes expts/iclr2023_configs/__init__.py | 0 .../base_config/ogbpcqm4mv2.yaml | 288 ++++++++++++++++++ .../base_config/ogbpcqm4mv2_GPS++.yaml | 288 ++++++++++++++++++ .../iclr2023_configs/config_ogbpcq_mpnn.yaml | 57 ++++ .../config_ogbpcq_mpnn_GPS++.yaml | 47 +++ .../config_ogbpcq_mpnn_JosefOG.yaml | 57 ++++ .../config_ogbpcq_mpnn_hydradims.yaml | 101 ++++++ .../config_ogbpcq_mpnn_largerffn.yaml | 58 ++++ 9 files changed, 896 insertions(+) create mode 100644 expts/iclr2023_configs/.config_ogbpcq_mpnn_GPS++.yaml.swp create mode 100644 expts/iclr2023_configs/__init__.py create mode 100644 expts/iclr2023_configs/base_config/ogbpcqm4mv2.yaml create mode 100644 expts/iclr2023_configs/base_config/ogbpcqm4mv2_GPS++.yaml create mode 100644 expts/iclr2023_configs/config_ogbpcq_mpnn.yaml create mode 100644 expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml create mode 100644 expts/iclr2023_configs/config_ogbpcq_mpnn_JosefOG.yaml create mode 100644 expts/iclr2023_configs/config_ogbpcq_mpnn_hydradims.yaml create mode 100644 expts/iclr2023_configs/config_ogbpcq_mpnn_largerffn.yaml diff --git a/expts/iclr2023_configs/.config_ogbpcq_mpnn_GPS++.yaml.swp b/expts/iclr2023_configs/.config_ogbpcq_mpnn_GPS++.yaml.swp new file mode 100644 index 
0000000000000000000000000000000000000000..78ab30c0ce46b96b10e0a72414b1d02a037cb2fe GIT binary patch literal 12288 zcmeHN&x;&I6s`!0CT3mYB^TjkMqD+{uiiB*EeOd)f(Ri?Jc!WLOxN@j+g;suRnKM= z6};$0uX<4M;{PCuH**&8FAxGAym`&Vud2Jpo?T4#AWEbPK6?69y;rZk_v#J9yy^JO z>u=EhaKbRQ8T)PPUVL|T@zJ{un;2HS8Fkp%^>_Z&p zk>)_7=6UAWvw8Io&wyvZGvFEU40r}S1D*lTfM>un;2H1?cm^Iq26)WaoyQsb>IpcH z|Nq_^}`;0xe0;8WluUH0JsTU117*Tz&3Dii?LsUpMcMSw}5NFRbU(V z`8;Dk0IvhD0Da&w;EyMv1AGH~0w~~Z;3{wl*aE&jhrNJr&ocHZa1Qt#zW)Mz57={j z3Had|@C6DbBVH!ZQ0UHV8ls)3 z&WfF4m4x=aNGsiDn{&JrTE@uJZbG?mwRz}_)sl{_q;O{XgwdLa>Uk}*BDI<6CR46W zr1CQ3seBi0#x`|d+<0$SZU~Xc^`2JRYnQ}jULb}cjnVtu-GI-~z}#537oeT&$`g@R ztvnWmS%kFLzAt1P3zwC+c+VwBb=(ZP@{i@ne6Iew1df+=Dh4lKX}zfL#qoTZiodyz zZGu7_9*35X17V13S31b##Fx2@iBpklGr-+c!W`u#dcz6hSlAlc2!5W7*t75Pa<-60 z%uH2^VOKxFbZ#oWNGp Date: Fri, 8 Sep 2023 14:46:13 +0000 Subject: [PATCH 10/58] make one big sweep config --- .../.config_ogbpcq_mpnn_GPS++.yaml.swp | Bin 12288 -> 0 bytes .../config_ogbpcq_mpnn_GPS++.yaml | 285 ++++++++++++++++-- 2 files changed, 266 insertions(+), 19 deletions(-) delete mode 100644 expts/iclr2023_configs/.config_ogbpcq_mpnn_GPS++.yaml.swp diff --git a/expts/iclr2023_configs/.config_ogbpcq_mpnn_GPS++.yaml.swp b/expts/iclr2023_configs/.config_ogbpcq_mpnn_GPS++.yaml.swp deleted file mode 100644 index 78ab30c0ce46b96b10e0a72414b1d02a037cb2fe..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12288 zcmeHN&x;&I6s`!0CT3mYB^TjkMqD+{uiiB*EeOd)f(Ri?Jc!WLOxN@j+g;suRnKM= z6};$0uX<4M;{PCuH**&8FAxGAym`&Vud2Jpo?T4#AWEbPK6?69y;rZk_v#J9yy^JO z>u=EhaKbRQ8T)PPUVL|T@zJ{un;2HS8Fkp%^>_Z&p zk>)_7=6UAWvw8Io&wyvZGvFEU40r}S1D*lTfM>un;2H1?cm^Iq26)WaoyQsb>IpcH z|Nq_^}`;0xe0;8WluUH0JsTU117*Tz&3Dii?LsUpMcMSw}5NFRbU(V z`8;Dk0IvhD0Da&w;EyMv1AGH~0w~~Z;3{wl*aE&jhrNJr&ocHZa1Qt#zW)Mz57={j z3Had|@C6DbBVH!ZQ0UHV8ls)3 z&WfF4m4x=aNGsiDn{&JrTE@uJZbG?mwRz}_)sl{_q;O{XgwdLa>Uk}*BDI<6CR46W zr1CQ3seBi0#x`|d+<0$SZU~Xc^`2JRYnQ}jULb}cjnVtu-GI-~z}#537oeT&$`g@R ztvnWmS%kFLzAt1P3zwC+c+VwBb=(ZP@{i@ne6Iew1df+=Dh4lKX}zfL#qoTZiodyz zZGu7_9*35X17V13S31b##Fx2@iBpklGr-+c!W`u#dcz6hSlAlc2!5W7*t75Pa<-60 z%uH2^VOKxFbZ#oWNGp Date: Fri, 8 Sep 2023 15:10:50 +0000 Subject: [PATCH 11/58] added stochastic depth to config --- expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml b/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml index bca1428b5..62ca5a614 100644 --- a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml +++ b/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml @@ -238,6 +238,7 @@ architecture: activation: gelu last_activation: none dropout: 0.1 + droppath_rate_ffn: 0.3 normalization: "layer_norm" last_normalization: *normalization residual_type: simple From dbc574fcea2a41d5611fd3b62ab04626042df9e5 Mon Sep 17 00:00:00 2001 From: kerstink-GC Date: Fri, 8 Sep 2023 15:32:09 +0000 Subject: [PATCH 12/58] fixed config --- expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml b/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml index 62ca5a614..3141747d4 100644 --- a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml +++ b/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml @@ -77,7 +77,7 @@ datamodule: split_names: ["train", "valid", "test-dev"] seed: ${constants.seed} label_normalization: - normalize_val_test: true + normalize_val_test: false method: "normal" # Featurization @@ -238,7 +238,6 @@ 
architecture: activation: gelu last_activation: none dropout: 0.1 - droppath_rate_ffn: 0.3 normalization: "layer_norm" last_normalization: *normalization residual_type: simple @@ -256,6 +255,7 @@ architecture: attn_type: "none" # "full-attention", "none" # biased_attention: false attn_kwargs: null + droppath_rate_ffn: 0.3 graph_output_nn: graph: From ef5db7f94ab25c88ffdc5ac3749156312d386535 Mon Sep 17 00:00:00 2001 From: DomInvivo Date: Sat, 9 Sep 2023 21:20:28 -0400 Subject: [PATCH 13/58] Track learning rate with callback. `n_epochs` redundant with the `epochs` tracked by lightning. --- graphium/config/_loader.py | 7 ++++++- graphium/trainer/predictor.py | 5 ----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/graphium/config/_loader.py b/graphium/config/_loader.py index da55a9266..5a6754c54 100644 --- a/graphium/config/_loader.py +++ b/graphium/config/_loader.py @@ -13,7 +13,7 @@ # Lightning from lightning import Trainer -from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint +from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor from lightning.pytorch.loggers import Logger, WandbLogger from loguru import logger @@ -415,6 +415,11 @@ def load_trainer( if "model_checkpoint" in cfg_trainer.keys(): callbacks.append(ModelCheckpoint(**cfg_trainer["model_checkpoint"])) + if "learning_rate_monitor" in cfg_trainer.keys(): + callbacks.append(LearningRateMonitor(**cfg_trainer["learning_rate_monitor"])) + else: + callbacks.append(LearningRateMonitor()) + # Define the logger parameters wandb_cfg = config["constants"].get("wandb") if wandb_cfg is not None: diff --git a/graphium/trainer/predictor.py b/graphium/trainer/predictor.py index c4e700895..6824a40df 100644 --- a/graphium/trainer/predictor.py +++ b/graphium/trainer/predictor.py @@ -618,11 +618,6 @@ def on_validation_epoch_end(self) -> None: concatenated_metrics_logs = self.task_epoch_summary.concatenate_metrics_logs(metrics_logs) concatenated_metrics_logs["val/mean_time"] = torch.tensor(self.mean_val_time_tracker.mean_value) concatenated_metrics_logs["val/mean_tput"] = self.mean_val_tput_tracker.mean_value - - if hasattr(self.optimizers(), "param_groups"): - lr = self.optimizers().param_groups[0]["lr"] - concatenated_metrics_logs["lr"] = torch.tensor(lr) - concatenated_metrics_logs["n_epochs"] = torch.tensor(self.current_epoch, dtype=torch.float32) self.log_dict(concatenated_metrics_logs) # Save yaml file with the per-task metrics summaries From 5cfd705d22808a7c082499da744616bf2dbe3632 Mon Sep 17 00:00:00 2001 From: kerstink-GC Date: Mon, 11 Sep 2023 16:09:34 +0000 Subject: [PATCH 14/58] added PCQM4Mv2 GCN and GINE configs --- .../config_ogbpcq_GCN_16layers.yaml | 263 +++++++++++++++++ .../config_ogbpcq_GCN_4layers.yaml | 263 +++++++++++++++++ .../config_ogbpcq_GINE_16layers.yaml | 273 ++++++++++++++++++ .../config_ogbpcq_GINE_4layers.yaml | 273 ++++++++++++++++++ .../config_ogbpcq_mpnn_GPS++.yaml | 2 +- 5 files changed, 1073 insertions(+), 1 deletion(-) create mode 100644 expts/iclr2023_configs/config_ogbpcq_GCN_16layers.yaml create mode 100644 expts/iclr2023_configs/config_ogbpcq_GCN_4layers.yaml create mode 100644 expts/iclr2023_configs/config_ogbpcq_GINE_16layers.yaml create mode 100644 expts/iclr2023_configs/config_ogbpcq_GINE_4layers.yaml diff --git a/expts/iclr2023_configs/config_ogbpcq_GCN_16layers.yaml b/expts/iclr2023_configs/config_ogbpcq_GCN_16layers.yaml new file mode 100644 index 000000000..bffa2ee04 --- /dev/null +++ 
b/expts/iclr2023_configs/config_ogbpcq_GCN_16layers.yaml @@ -0,0 +1,263 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/PCQM4Mv2/" + epochs: 100 + name: ogb_pcqm4mv2_gcn + wandb: + entity: multitask-gnn + name: neurips2023_scaling_gcn + project: neurips2023_graphcore_scaling_mpnn + + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 100 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 100 + # Data handling-related + batch_size_training: 30 + batch_size_inference: 30 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 8 + + ipu_config: + - deviceIterations(60) # IPU would require large batches to be ready for the model. + - replicationFactor(4) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [4, 4, 4, 4] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
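+    # Each key under `task_specific_args` below names one training task; this config
+    # defines a single task, `homolumo`, and the same task name is reused further down
+    # in `predictor.loss_fun`, `metrics` and `architecture.task_heads`.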
+ task_specific_args: # To be replaced by a new class "DatasetParams" + homolumo: + df: null + task_level: graph + df_path: graphium/data/PCQM4M/pcqm4mv2.csv + # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` + smiles_col: "cxsmiles" + label_cols: ["homo_lumo_gap"] # graph_* # graph_* means all columns starting with "graph_" + # sample_size: 100000 # use sample_size for test + splits_path: graphium/data/PCQM4M/split_dict_v2.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` + split_names: ["train", "valid", "test-dev"] + seed: ${constants.seed} + label_normalization: + normalize_val_test: true + method: "normal" + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: "disk" + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + # cache_data_path: . + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. 
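+
+    # Rough effective batch size for this pipelined 16-layer run, assuming the usual
+    # poptorch semantics (micro-batch x deviceIterations x replicationFactor x
+    # accumulate_grad_batches): 30 x 60 x 4 x 8 = 57,600 graphs per weight update.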
+ +#Task-specific +predictor: + metrics_on_progress_bar: + homolumo: [] + metrics_on_training_set: + homolumo: [] + loss_fun: + homolumo: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: ${constants.epochs} + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + homolumo: + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/ogb_pcqm4mv2/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${constants.epochs} + min_epochs: 1 + check_val_every_n_epoch: 5 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: # Set as null to avoid a pre-nn network + out_dim: 64 + hidden_dims: 256 + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.18 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: null + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + in_dim: 64 # or otherwise the correct value + out_dim: &gnn_dim 768 + hidden_dims: *gnn_dim + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'none' + layer_type: 'pyg:gcn' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps + + + graph_output_nn: + graph: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + homolumo: + task_level: graph + out_dim: 1 + hidden_dims: 256 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none diff --git a/expts/iclr2023_configs/config_ogbpcq_GCN_4layers.yaml b/expts/iclr2023_configs/config_ogbpcq_GCN_4layers.yaml new file mode 100644 index 000000000..ef46cda2a --- /dev/null +++ b/expts/iclr2023_configs/config_ogbpcq_GCN_4layers.yaml @@ -0,0 +1,263 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/PCQM4Mv2/" + epochs: 100 + name: ogb_pcqm4mv2_gcn + wandb: + entity: multitask-gnn + name: neurips2023_scaling_gcn + project: neurips2023_graphcore_scaling_mpnn + + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 100 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 100 + # Data handling-related + batch_size_training: 30 + batch_size_inference: 30 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 2 + + ipu_config: + - deviceIterations(60) # IPU would require large batches to be ready for the model. + - replicationFactor(16) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + # accelerator_kwargs: + # _accelerator: "ipu" + # gnn_layers_per_ipu: [4, 4, 4, 4] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
+ task_specific_args: # To be replaced by a new class "DatasetParams" + homolumo: + df: null + task_level: graph + df_path: graphium/data/PCQM4M/pcqm4mv2.csv + # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` + smiles_col: "cxsmiles" + label_cols: ["homo_lumo_gap"] # graph_* # graph_* means all columns starting with "graph_" + # sample_size: 100000 # use sample_size for test + splits_path: graphium/data/PCQM4M/split_dict_v2.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` + split_names: ["train", "valid", "test-dev"] + seed: ${constants.seed} + label_normalization: + normalize_val_test: true + method: "normal" + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: "disk" + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + # cache_data_path: . + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. 
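+
+    # Unlike the 16-layer variant, this 4-layer model is not pipelined across IPUs
+    # (accelerator_kwargs is commented out above); scaling comes from replicationFactor(16)
+    # instead. With accumulate_grad_batches: 2, the effective batch works out to roughly
+    # 30 x 60 x 16 x 2 = 57,600 graphs per update, matching the 16-layer run (assuming
+    # the usual poptorch batching semantics).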
+ +#Task-specific +predictor: + metrics_on_progress_bar: + homolumo: [] + metrics_on_training_set: + homolumo: [] + loss_fun: + homolumo: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: ${constants.epochs} + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + homolumo: + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/ogb_pcqm4mv2/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${constants.epochs} + min_epochs: 1 + check_val_every_n_epoch: 5 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: # Set as null to avoid a pre-nn network + out_dim: 64 + hidden_dims: 256 + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.18 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: null + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + in_dim: 64 # or otherwise the correct value + out_dim: &gnn_dim 768 + hidden_dims: *gnn_dim + depth: 4 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'none' + layer_type: 'pyg:gcn' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps + + + graph_output_nn: + graph: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + homolumo: + task_level: graph + out_dim: 1 + hidden_dims: 256 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none diff --git a/expts/iclr2023_configs/config_ogbpcq_GINE_16layers.yaml b/expts/iclr2023_configs/config_ogbpcq_GINE_16layers.yaml new file mode 100644 index 000000000..d53dd60b8 --- /dev/null +++ b/expts/iclr2023_configs/config_ogbpcq_GINE_16layers.yaml @@ -0,0 +1,273 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/PCQM4Mv2/" + epochs: 100 + name: ogb_pcqm4mv2_gine + wandb: + entity: multitask-gnn + name: neurips2023_scaling_gine + project: neurips2023_graphcore_scaling_mpnn + + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 100 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 100 + # Data handling-related + batch_size_training: 30 + batch_size_inference: 30 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 8 + + ipu_config: + - deviceIterations(60) # IPU would require large batches to be ready for the model. + - replicationFactor(4) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [4, 4, 4, 4] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
+ task_specific_args: # To be replaced by a new class "DatasetParams" + homolumo: + df: null + task_level: graph + df_path: graphium/data/PCQM4M/pcqm4mv2.csv + # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` + smiles_col: "cxsmiles" + label_cols: ["homo_lumo_gap"] # graph_* # graph_* means all columns starting with "graph_" + # sample_size: 100000 # use sample_size for test + splits_path: graphium/data/PCQM4M/split_dict_v2.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` + split_names: ["train", "valid", "test-dev"] + seed: ${constants.seed} + label_normalization: + normalize_val_test: true + method: "normal" + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: "disk" + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + # cache_data_path: . + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. 
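+
+    # The GINE layer consumes edge features, so this config keeps the bond features from
+    # edge_property_list above and, unlike the GCN configs, defines a pre_nn_edges MLP in
+    # the architecture section below to embed them before the GNN stack.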
+ +#Task-specific +predictor: + metrics_on_progress_bar: + homolumo: [] + metrics_on_training_set: + homolumo: [] + loss_fun: + homolumo: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: ${constants.epochs} + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + homolumo: + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/ogb_pcqm4mv2/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${constants.epochs} + min_epochs: 1 + check_val_every_n_epoch: 5 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: # Set as null to avoid a pre-nn network + out_dim: 64 + hidden_dims: 256 + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.18 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: # Set as null to avoid a pre-nn network + out_dim: 32 + hidden_dims: 128 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: *normalization + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + + gnn: # Set as null to avoid a post-nn network + out_dim: &gnn_dim 704 + hidden_dims: *gnn_dim + depth: 4 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'none' + layer_type: 'pyg:gine' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps + + + graph_output_nn: + graph: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + + task_heads: + homolumo: + task_level: graph + out_dim: 1 + hidden_dims: 256 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none diff --git a/expts/iclr2023_configs/config_ogbpcq_GINE_4layers.yaml b/expts/iclr2023_configs/config_ogbpcq_GINE_4layers.yaml new file mode 100644 index 000000000..ff173adde --- /dev/null +++ b/expts/iclr2023_configs/config_ogbpcq_GINE_4layers.yaml @@ -0,0 +1,273 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/PCQM4Mv2/" + epochs: 100 + name: ogb_pcqm4mv2_gine + wandb: + entity: multitask-gnn + name: neurips2023_scaling_gine + project: neurips2023_graphcore_scaling_mpnn + + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 100 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 100 + # Data handling-related + batch_size_training: 30 + batch_size_inference: 30 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 2 + + ipu_config: + - deviceIterations(60) # IPU would require large batches to be ready for the model. + - replicationFactor(16) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + # accelerator_kwargs: + # _accelerator: "ipu" + # gnn_layers_per_ipu: [4, 4, 4, 4] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
+ task_specific_args: # To be replaced by a new class "DatasetParams" + homolumo: + df: null + task_level: graph + df_path: graphium/data/PCQM4M/pcqm4mv2.csv + # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` + smiles_col: "cxsmiles" + label_cols: ["homo_lumo_gap"] # graph_* # graph_* means all columns starting with "graph_" + # sample_size: 100000 # use sample_size for test + splits_path: graphium/data/PCQM4M/split_dict_v2.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` + split_names: ["train", "valid", "test-dev"] + seed: ${constants.seed} + label_normalization: + normalize_val_test: true + method: "normal" + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: "disk" + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + # cache_data_path: . + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. 
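+
+    # With label_normalization.normalize_val_test: true (set above), validation and test
+    # targets are standardised with the training-set statistics as well, so the homolumo
+    # MAE reported here is in normalized units rather than the raw label units
+    # (assumption based on the option names; see graphium's label_normalization handling).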
+ +#Task-specific +predictor: + metrics_on_progress_bar: + homolumo: [] + metrics_on_training_set: + homolumo: [] + loss_fun: + homolumo: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: ${constants.epochs} + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + homolumo: + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/ogb_pcqm4mv2/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${constants.epochs} + min_epochs: 1 + check_val_every_n_epoch: 5 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: # Set as null to avoid a pre-nn network + out_dim: 64 + hidden_dims: 256 + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.18 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: # Set as null to avoid a pre-nn network + out_dim: 32 + hidden_dims: 128 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: *normalization + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + + gnn: # Set as null to avoid a post-nn network + out_dim: &gnn_dim 704 + hidden_dims: *gnn_dim + depth: 4 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'none' + layer_type: 'pyg:gine' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps + + + graph_output_nn: + graph: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + + task_heads: + homolumo: + task_level: graph + out_dim: 1 + hidden_dims: 256 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none diff --git a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml b/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml index 3141747d4..b9fcfc780 100644 --- a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml +++ b/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml @@ -77,7 +77,7 @@ datamodule: split_names: ["train", "valid", "test-dev"] seed: ${constants.seed} label_normalization: - normalize_val_test: false + normalize_val_test: true method: "normal" # Featurization From 1cc9c0c981ba59cf256936547e8e29accf68e0cc Mon Sep 17 00:00:00 2001 From: kerstink-GC Date: Thu, 14 Sep 2023 10:43:33 +0000 Subject: [PATCH 15/58] PCQM4Mv2 configs with NEW DATA --- .../config_ogbpcq_GINE_16layers.yaml | 2 +- .../config_ogbpcq_mpnn_GPS++.yaml | 8 +- .../config_ogbpcq_mpnn_GPS++_newDATA.yaml | 295 ++++++++++++++++++ ...fig_ogbpcq_mpnn_GPS++_newDATA_4layers.yaml | 295 ++++++++++++++++++ 4 files changed, 595 insertions(+), 5 deletions(-) create mode 100644 expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA.yaml create mode 100644 expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA_4layers.yaml diff --git a/expts/iclr2023_configs/config_ogbpcq_GINE_16layers.yaml b/expts/iclr2023_configs/config_ogbpcq_GINE_16layers.yaml index d53dd60b8..bde17fe0d 100644 --- a/expts/iclr2023_configs/config_ogbpcq_GINE_16layers.yaml +++ b/expts/iclr2023_configs/config_ogbpcq_GINE_16layers.yaml @@ -234,7 +234,7 @@ architecture: gnn: # Set as null to avoid a post-nn network out_dim: &gnn_dim 704 hidden_dims: *gnn_dim - depth: 4 + depth: 16 activation: gelu last_activation: none dropout: 0.1 diff --git a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml b/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml index b9fcfc780..61f645c44 100644 --- a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml +++ b/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml @@ -7,10 +7,10 @@ constants: raise_train_error: true # Whether the code should raise an error if it crashes during training datacache_path: "/localdata/PCQM4Mv2/" epochs: 100 - name: ogb_pcqm4mv2_mpnn + name: ogb_pcqm4mv2_mpnn_no1hot wandb: entity: multitask-gnn - name: neurips2023_scaling_mpnn + name: neurips2023_scaling_mpnn_no1hot project: neurips2023_graphcore_scaling_mpnn @@ -91,8 +91,8 @@ datamodule: # OGB: ['atomic_num', 'degree', 
'possible_formal_charge', 'possible_numH' (total-valence), # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [atomic-number, group, period, total-valence, degree, formal-charge, radical-electron, aromatic, in-ring] # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] edge_property_list: [bond-type-onehot, stereo, in-ring] add_self_loop: False diff --git a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA.yaml b/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA.yaml new file mode 100644 index 000000000..b97402680 --- /dev/null +++ b/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA.yaml @@ -0,0 +1,295 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/PCQM4Mv2/" + epochs: 100 + name: ogb_pcqm4mv2_mpnn_NewData + wandb: + entity: multitask-gnn + name: neurips2023_scaling_mpnn_NewData + project: neurips2023_graphcore_scaling_mpnn + + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 100 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 100 + # Data handling-related + batch_size_training: 30 + batch_size_inference: 30 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 8 + + ipu_config: + - deviceIterations(60) # IPU would require large batches to be ready for the model. + - replicationFactor(4) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [4, 4, 4, 4] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
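+    # This "newDATA" variant points at a local pcqm4mv2_conformers_28features copy of the
+    # dataset (df_path / splits_path under /nethome/... below) instead of the public
+    # PCQM4M csv used by the other configs, and the column names change accordingly
+    # (smiles / homolumogap rather than cxsmiles / homo_lumo_gap).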
+ task_specific_args: # To be replaced by a new class "DatasetParams" + homolumo: + df: null + task_level: graph + df_path: /nethome/kerstink/QM9_finetuning/ogb-lsc/pcqm4mv2_conformers_28features/raw/data.csv + # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` + smiles_col: "smiles" + label_cols: ["homolumogap"] # graph_* # graph_* means all columns starting with "graph_" + # sample_size: 100000 # use sample_size for test + splits_path: /nethome/kerstink/QM9_finetuning/ogb-lsc/pcqm4mv2_conformers_28features/split_dict.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` + split_names: ["train", "valid", "test-dev"] + seed: ${constants.seed} + label_normalization: + normalize_val_test: true + method: "normal" + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: "disk" + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + # cache_data_path: . + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. 
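The label_normalization block above (method: "normal" with normalize_val_test: true) standardizes the homolumogap regression target before training. A minimal sketch of the usual z-score scheme, assuming the statistics are fit on the training labels and, because normalize_val_test is enabled, also applied to the validation/test labels; the exact Graphium implementation may differ in details such as NaN handling:

    import numpy as np

    def fit_label_stats(train_labels: np.ndarray):
        # Fit mean/std on the training split only, ignoring missing labels.
        return np.nanmean(train_labels, axis=0), np.nanstd(train_labels, axis=0)

    def normalize_labels(labels: np.ndarray, mean: np.ndarray, std: np.ndarray):
        # z-score ("normal") normalization: (y - mean) / std
        return (labels - mean) / std

    # Hypothetical homolumogap targets, for illustration only.
    train_y = np.array([[5.2], [4.8], [6.1]])
    valid_y = np.array([[5.5], [4.9]])
    mean, std = fit_label_stats(train_y)
    train_y_norm = normalize_labels(train_y, mean, std)
    valid_y_norm = normalize_labels(valid_y, mean, std)  # applied since normalize_val_test: true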
+ +#Task-specific +predictor: + metrics_on_progress_bar: + homolumo: [] + metrics_on_training_set: + homolumo: [] + loss_fun: + homolumo: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: ${constants.epochs} + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + homolumo: + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/ogb_pcqm4mv2/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${constants.epochs} + min_epochs: 1 + check_val_every_n_epoch: 5 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: 256 + hidden_dims: 1024 + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: 128 + hidden_dims: 512 + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: 256 + hidden_dims: &gnn_dim 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + homolumo: + task_level: graph + out_dim: 1 + hidden_dims: 256 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none diff --git a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA_4layers.yaml b/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA_4layers.yaml new file mode 100644 index 000000000..94535b7cc --- /dev/null +++ b/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA_4layers.yaml @@ -0,0 +1,295 @@ +# Running the mpnn model with the largemix dataset on IPU. 
+ +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/PCQM4Mv2/" + epochs: 100 + name: ogb_pcqm4mv2_mpnn_NewData_4layers + wandb: + entity: multitask-gnn + name: neurips2023_scaling_mpnn_NewData_4layers + project: neurips2023_graphcore_scaling_mpnn + + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 100 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 100 + # Data handling-related + batch_size_training: 30 + batch_size_inference: 30 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 2 + + ipu_config: + - deviceIterations(60) # IPU would require large batches to be ready for the model. + - replicationFactor(16) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(1) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + # accelerator_kwargs: + # _accelerator: "ipu" + # gnn_layers_per_ipu: [4, 4, 4, 4] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
+ task_specific_args: # To be replaced by a new class "DatasetParams" + homolumo: + df: null + task_level: graph + df_path: /nethome/kerstink/QM9_finetuning/ogb-lsc/pcqm4mv2_conformers_28features/raw/data.csv + # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` + smiles_col: "smiles" + label_cols: ["homolumogap"] # graph_* # graph_* means all columns starting with "graph_" + # sample_size: 100000 # use sample_size for test + splits_path: /nethome/kerstink/QM9_finetuning/ogb-lsc/pcqm4mv2_conformers_28features/split_dict.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` + split_names: ["train", "valid", "test-dev"] + seed: ${constants.seed} + label_normalization: + normalize_val_test: true + method: "normal" + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: "disk" + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + # cache_data_path: . + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. 
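For a rough sense of scale, the accelerator options earlier in this config (batch_size_training: 30, accumulate_grad_batches: 2, replicationFactor(16), deviceIterations(60)) multiply into the number of graphs consumed per optimizer step and per host call. A back-of-the-envelope sketch, assuming the usual PopTorch convention that the global batch is micro-batch x gradient accumulation x replication, while device iterations only increase how many such batches run per host step; the exact accounting inside Graphium/PopTorch may differ:

    micro_batch = 30        # batch_size_training
    grad_accum = 2          # trainer.accumulate_grad_batches
    replication = 16        # replicationFactor(16)
    device_iters = 60       # deviceIterations(60)

    graphs_per_weight_update = micro_batch * grad_accum * replication  # 30 * 2 * 16 = 960
    graphs_per_host_step = graphs_per_weight_update * device_iters     # 960 * 60 = 57_600
    print(graphs_per_weight_update, graphs_per_host_step)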
+ +#Task-specific +predictor: + metrics_on_progress_bar: + homolumo: [] + metrics_on_training_set: + homolumo: [] + loss_fun: + homolumo: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: ${constants.epochs} + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + homolumo: + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/ogb_pcqm4mv2/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${constants.epochs} + min_epochs: 1 + check_val_every_n_epoch: 5 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: 256 + hidden_dims: 1024 + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: 128 + hidden_dims: 512 + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: 256 + hidden_dims: &gnn_dim 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 4 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + homolumo: + task_level: graph + out_dim: 1 + hidden_dims: 256 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none From 6a632d50036a02783fb007473b98b84b111835b1 Mon Sep 17 00:00:00 2001 From: kerstink-GC Date: Fri, 15 Sep 2023 11:10:07 +0000 Subject: [PATCH 16/58] added configs for LargeMix MPNN foundation model --- expts/foundation_model/__init__.py | 0 .../config_LargeMix_mpnn_GPS++.yaml | 462 ++++++++++++++++++ .../config_ogbpcq_mpnn_GPS++_newDATA.yaml | 295 +++++++++++ 3 files changed, 757 insertions(+) create mode 100644 expts/foundation_model/__init__.py create mode 100644 expts/foundation_model/config_LargeMix_mpnn_GPS++.yaml create mode 100644 expts/foundation_model/config_ogbpcq_mpnn_GPS++_newDATA.yaml diff --git a/expts/foundation_model/__init__.py b/expts/foundation_model/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/expts/foundation_model/config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/config_LargeMix_mpnn_GPS++.yaml new file mode 100644 index 000000000..dcacb5371 --- /dev/null +++ b/expts/foundation_model/config_LargeMix_mpnn_GPS++.yaml @@ -0,0 +1,462 @@ +# Running the mpnn model with the largemix dataset on IPU. 
+ +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/PCQM4Mv2/" + epochs: 100 + name: ogb_pcqm4mv2_mpnn_NewData + wandb: + entity: multitask-gnn + name: neurips2023_scaling_mpnn_NewData + project: neurips2023_graphcore_scaling_mpnn + + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 100 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 100 + # Data handling-related + batch_size_training: 30 + batch_size_inference: 30 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 8 + + ipu_config: + - deviceIterations(30) # IPU would require large batches to be ready for the model. + - replicationFactor(4) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [4, 4, 4, 4] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
+ task_specific_args: # To be replaced by a new class "DatasetParams" + l1000_vcap: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` + epoch_sampling_fraction: 1.0 + + l1000_mcf7: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcba_1328: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: assayID-* # assayID-* means all columns starting with "assayID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcqm4m_g25: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: graph_* # graph_* means all columns starting with "graph_" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + pcqm4m_n4: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: node_* # node_* means all columns starting with "node_" + # sample_size: 2000 # use sample_size for test + task_level: node + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + seed: 42 + label_normalization: + normalize_val_test: True + 
method: "normal" + epoch_sampling_fraction: 1.0 + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: disk + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + num_workers: 32 # -1 to use all + persistent_workers: True # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. + +#Task-specific +predictor: + metrics_on_progress_bar: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + metrics_on_training_set: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + loss_fun: + l1000_vcap: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: 0.5 + l1000_mcf7: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: ${predictor.loss_fun.l1000_vcap.alpha} + pcba_1328: bce_logits_ipu + pcqm4m_g25: mae_ipu + pcqm4m_n4: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: &max_epochs 100 + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + l1000_vcap: &classif_metrics + - name: auroc + metric: auroc + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + - name: avpr + metric: averageprecision + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + l1000_mcf7: *classif_metrics + pcba_1328: + # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly + - name: auroc + metric: auroc + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + - name: avpr + metric: averageprecision + task: binary + multitask_handling: 
mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + pcqm4m_g25: &pcqm_metrics + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2 + metric: r2_score_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + pcqm4m_n4: *pcqm_metrics + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/neurips2023-large/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} + min_epochs: 1 + check_val_every_n_epoch: 20 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: 256 + hidden_dims: 1024 + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: 128 + hidden_dims: 512 + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: 256 + hidden_dims: &gnn_dim 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + l1000_vcap: + task_level: graph + out_dim: 2934 + hidden_dims: 128 + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + l1000_mcf7: + task_level: graph + out_dim: 2934 + hidden_dims: 128 + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcba_1328: + task_level: graph + out_dim: 1328 + hidden_dims: 64 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_g25: + task_level: graph + out_dim: 25 + hidden_dims: 32 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_n4: + task_level: node + out_dim: 4 + hidden_dims: 32 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + + # task_heads: + # homolumo: + # task_level: graph + # out_dim: 1 + # hidden_dims: 256 + # depth: 2 + # activation: relu + # last_activation: none + # dropout: *dropout + # normalization: *normalization + # last_normalization: "none" + # residual_type: none diff --git a/expts/foundation_model/config_ogbpcq_mpnn_GPS++_newDATA.yaml b/expts/foundation_model/config_ogbpcq_mpnn_GPS++_newDATA.yaml new file mode 100644 index 000000000..b97402680 --- 
/dev/null +++ b/expts/foundation_model/config_ogbpcq_mpnn_GPS++_newDATA.yaml @@ -0,0 +1,295 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/PCQM4Mv2/" + epochs: 100 + name: ogb_pcqm4mv2_mpnn_NewData + wandb: + entity: multitask-gnn + name: neurips2023_scaling_mpnn_NewData + project: neurips2023_graphcore_scaling_mpnn + + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 100 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 100 + # Data handling-related + batch_size_training: 30 + batch_size_inference: 30 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 8 + + ipu_config: + - deviceIterations(60) # IPU would require large batches to be ready for the model. + - replicationFactor(4) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [4, 4, 4, 4] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
+ task_specific_args: # To be replaced by a new class "DatasetParams" + homolumo: + df: null + task_level: graph + df_path: /nethome/kerstink/QM9_finetuning/ogb-lsc/pcqm4mv2_conformers_28features/raw/data.csv + # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` + smiles_col: "smiles" + label_cols: ["homolumogap"] # graph_* # graph_* means all columns starting with "graph_" + # sample_size: 100000 # use sample_size for test + splits_path: /nethome/kerstink/QM9_finetuning/ogb-lsc/pcqm4mv2_conformers_28features/split_dict.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` + split_names: ["train", "valid", "test-dev"] + seed: ${constants.seed} + label_normalization: + normalize_val_test: true + method: "normal" + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: "disk" + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + # cache_data_path: . + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. 
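The rw_pos entry above (pos_type: rw_return_probs, ksteps: 16) attaches a 16-dimensional random-walk positional encoding to every node: the probability that a random walk started at a node returns to it after 1 to 16 steps, i.e. the diagonals of successive powers of the transition matrix. A minimal dense sketch, assuming a simple row-normalized adjacency; Graphium's featurizer may use a sparse or otherwise normalized variant:

    import numpy as np

    def rw_return_probs(adj: np.ndarray, ksteps: int = 16) -> np.ndarray:
        # adj: (N, N) dense adjacency matrix of a single molecular graph.
        deg = adj.sum(axis=1, keepdims=True)
        trans = adj / np.clip(deg, 1, None)      # row-normalized transition matrix P
        probs, power = [], np.eye(adj.shape[0])
        for _ in range(ksteps):
            power = power @ trans                # P^k
            probs.append(np.diag(power).copy())  # return probability after k steps
        return np.stack(probs, axis=1)           # (N, ksteps) node-level features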
+ +#Task-specific +predictor: + metrics_on_progress_bar: + homolumo: [] + metrics_on_training_set: + homolumo: [] + loss_fun: + homolumo: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: ${constants.epochs} + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + homolumo: + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/ogb_pcqm4mv2/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${constants.epochs} + min_epochs: 1 + check_val_every_n_epoch: 5 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: 256 + hidden_dims: 1024 + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: 128 + hidden_dims: 512 + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: 256 + hidden_dims: &gnn_dim 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + homolumo: + task_level: graph + out_dim: 1 + hidden_dims: 256 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none From de949f96f243dc9daba779784f2a1ff01305d6e3 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Fri, 15 Sep 2023 13:57:41 +0000 Subject: [PATCH 17/58] small config --- expts/foundation_model/small.yaml | 346 ++++++++++++++++++++++++++++++ 1 file changed, 346 insertions(+) create mode 100644 expts/foundation_model/small.yaml diff --git a/expts/foundation_model/small.yaml b/expts/foundation_model/small.yaml new file mode 100644 index 000000000..739ecb24e --- /dev/null +++ b/expts/foundation_model/small.yaml @@ -0,0 +1,346 @@ +# @package _global_ + +constants: + seed: &seed 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + entity: multitask-gnn + name: small_test + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 44 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 80 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 44 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 80 + # Data handling-related + batch_size_training: 50 + batch_size_inference: 50 + predictor: + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16 + accumulate_grad_batches: 16 + + ipu_config: + - deviceIterations(5) # IPU would 
require large batches to be ready for the model. + - replicationFactor(4) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 128) + - Precision.enableStochasticRounding(True) + + accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [4, 4, 4, 4] +# accelerator: +# type: cpu # cpu or ipu or gpu +# config_override: +# datamodule: +# batch_size_training: 64 +# batch_size_inference: 256 +# trainer: +# trainer: +# precision: 32 +# accumulate_grad_batches: 1 + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. + task_specific_args: # To be replaced by a new class "DatasetParams" + qm9: + df: null + df_path: data/neurips2023/small-dataset/qm9.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/qm9.csv.gz + # or set path as the URL directly + smiles_col: "smiles" + label_cols: ["A", "B", "C", "mu", "alpha", "homo", "lumo", "gap", "r2", "zpve", "u0", "u298", "h298", "g298", "cv", "u0_atom", "u298_atom", "h298_atom", "g298_atom"] + # sample_size: 2000 # use sample_size for test + splits_path: data/neurips2023/small-dataset/qm9_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/qm9_random_splits.pt` + seed: *seed + task_level: graph + label_normalization: + normalize_val_test: True + method: "normal" + + tox21: + df: null + df_path: data/neurips2023/small-dataset/Tox21-7k-12-labels.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/Tox21-7k-12-labels.csv.gz + # or set path as the URL directly + smiles_col: "smiles" + label_cols: ["NR-AR", "NR-AR-LBD", "NR-AhR", "NR-Aromatase", "NR-ER", "NR-ER-LBD", "NR-PPAR-gamma", "SR-ARE", "SR-ATAD5", "SR-HSE", "SR-MMP", "SR-p53"] + # sample_size: 2000 # use sample_size for test + splits_path: data/neurips2023/small-dataset/Tox21_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/Tox21_random_splits.pt` + seed: *seed + task_level: graph + + zinc: + df: null + df_path: data/neurips2023/small-dataset/ZINC12k.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/ZINC12k.csv.gz + # or set path as the URL directly + smiles_col: "smiles" + label_cols: ["SA", "logp", "score"] + # sample_size: 2000 # use sample_size for test + splits_path: data/neurips2023/small-dataset/ZINC12k_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/ZINC12k_random_splits.pt` + seed: *seed + task_level: graph + label_normalization: + normalize_val_test: True + method: "normal" + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 
'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + num_workers: 30 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. + + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: # Set as null to avoid a pre-nn network + out_dim: 64 + hidden_dims: 256 + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.18 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: null # Set as null to avoid a pre-nn network + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + + + gnn: # Set as null to avoid a post-nn network + layer_type: 'pyg:gcn' + in_dim: 64 # or otherwise the correct value + out_dim: &gnn_dim 96 + hidden_dims: *gnn_dim + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'none' + layer_kwargs: null # Parameters for the model itself. 
You could define dropout_attn: 0.1 + + + graph_output_nn: + graph: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + qm9: + task_level: graph + out_dim: 19 + hidden_dims: 128 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + tox21: + task_level: graph + out_dim: 12 + hidden_dims: 64 + depth: 2 + activation: relu + last_activation: sigmoid + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + zinc: + task_level: graph + out_dim: 3 + hidden_dims: 32 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + +#Task-specific +predictor: + metrics_on_progress_bar: + qm9: ["mae"] + tox21: ["auroc"] + zinc: ["mae"] + loss_fun: + qm9: mae_ipu + tox21: bce_ipu + zinc: mae_ipu + random_seed: *seed + optim_kwargs: + lr: 4.e-5 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: &max_epochs 100 + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + qm9: &qm9_metrics + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: flatten + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2_score + metric: r2_score_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + tox21: + - name: auroc + metric: auroc_ipu + task: binary + multitask_handling: mean-per-label + threshold_kwargs: null + - name: avpr + metric: average_precision_ipu + task: binary + multitask_handling: mean-per-label + threshold_kwargs: null + - name: f1 > 0.5 + metric: f1 + multitask_handling: mean-per-label + target_to_int: True + num_classes: 2 + average: micro + threshold_kwargs: &threshold_05 + operator: greater + threshold: 0.5 + th_on_preds: True + th_on_target: True + - name: precision > 0.5 + metric: precision + multitask_handling: mean-per-label + average: micro + threshold_kwargs: *threshold_05 + zinc: *qm9_metrics + +trainer: + seed: *seed + logger: + save_dir: logs/neurips2023-small/ + name: ${constants.name} + project: ${constants.name} + #early_stopping: + # monitor: *monitor + # min_delta: 0 + # patience: 10 + # mode: &mode min + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: *max_epochs + min_epochs: 1 + check_val_every_n_epoch: 20 From 4a83e1288739aae8a94c5392e6fe6df6286c0a7c Mon Sep 17 00:00:00 2001 From: kerstink-GC Date: Fri, 15 Sep 2023 16:01:58 +0000 Subject: [PATCH 18/58] cleaned up configs foundation model --- .../config_LargeMix_mpnn_GPS++.yaml | 16 +- expts/foundation_model/small.yaml | 346 ------------------ 2 files changed, 8 insertions(+), 354 deletions(-) delete mode 100644 expts/foundation_model/small.yaml diff --git 
a/expts/foundation_model/config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/config_LargeMix_mpnn_GPS++.yaml index dcacb5371..ce6914154 100644 --- a/expts/foundation_model/config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/config_LargeMix_mpnn_GPS++.yaml @@ -5,12 +5,12 @@ constants: seed: 42 raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/PCQM4Mv2/" + datacache_path: "/localdata/neurips2023-large/" epochs: 100 - name: ogb_pcqm4mv2_mpnn_NewData + name: LargeMix_mpnn_40M wandb: entity: multitask-gnn - name: neurips2023_scaling_mpnn_NewData + name: LargeMix_mpnn_40M project: neurips2023_graphcore_scaling_mpnn @@ -28,8 +28,8 @@ accelerator: max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 max_num_edges_per_graph: 100 # Data handling-related - batch_size_training: 30 - batch_size_inference: 30 + batch_size_training: 16 + batch_size_inference: 16 predictor: metrics_every_n_train_steps: 1000 optim_kwargs: @@ -37,7 +37,7 @@ accelerator: trainer: trainer: precision: 16-true - accumulate_grad_batches: 8 + accumulate_grad_batches: 16 ipu_config: - deviceIterations(30) # IPU would require large batches to be ready for the model. @@ -58,7 +58,7 @@ accelerator: accelerator_kwargs: _accelerator: "ipu" - gnn_layers_per_ipu: [4, 4, 4, 4] + gnn_layers_per_ipu: [4,4,4,4] datamodule: module_type: "MultitaskFromSmilesDataModule" @@ -170,7 +170,7 @@ datamodule: ksteps: 16 num_workers: 32 # -1 to use all - persistent_workers: True # if use persistent worker at the start of each epoch. + persistent_workers: False # if use persistent worker at the start of each epoch. # Using persistent_workers false might make the start of each epoch very long. #Task-specific diff --git a/expts/foundation_model/small.yaml b/expts/foundation_model/small.yaml deleted file mode 100644 index 739ecb24e..000000000 --- a/expts/foundation_model/small.yaml +++ /dev/null @@ -1,346 +0,0 @@ -# @package _global_ - -constants: - seed: &seed 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - entity: multitask-gnn - name: small_test - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 44 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 80 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 44 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 80 - # Data handling-related - batch_size_training: 50 - batch_size_inference: 50 - predictor: - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16 - accumulate_grad_batches: 16 - - ipu_config: - - deviceIterations(5) # IPU would require large batches to be ready for the model. 
- - replicationFactor(4) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 128) - - Precision.enableStochasticRounding(True) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [4, 4, 4, 4] -# accelerator: -# type: cpu # cpu or ipu or gpu -# config_override: -# datamodule: -# batch_size_training: 64 -# batch_size_inference: 256 -# trainer: -# trainer: -# precision: 32 -# accumulate_grad_batches: 1 - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. - task_specific_args: # To be replaced by a new class "DatasetParams" - qm9: - df: null - df_path: data/neurips2023/small-dataset/qm9.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/qm9.csv.gz - # or set path as the URL directly - smiles_col: "smiles" - label_cols: ["A", "B", "C", "mu", "alpha", "homo", "lumo", "gap", "r2", "zpve", "u0", "u298", "h298", "g298", "cv", "u0_atom", "u298_atom", "h298_atom", "g298_atom"] - # sample_size: 2000 # use sample_size for test - splits_path: data/neurips2023/small-dataset/qm9_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/qm9_random_splits.pt` - seed: *seed - task_level: graph - label_normalization: - normalize_val_test: True - method: "normal" - - tox21: - df: null - df_path: data/neurips2023/small-dataset/Tox21-7k-12-labels.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/Tox21-7k-12-labels.csv.gz - # or set path as the URL directly - smiles_col: "smiles" - label_cols: ["NR-AR", "NR-AR-LBD", "NR-AhR", "NR-Aromatase", "NR-ER", "NR-ER-LBD", "NR-PPAR-gamma", "SR-ARE", "SR-ATAD5", "SR-HSE", "SR-MMP", "SR-p53"] - # sample_size: 2000 # use sample_size for test - splits_path: data/neurips2023/small-dataset/Tox21_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/Tox21_random_splits.pt` - seed: *seed - task_level: graph - - zinc: - df: null - df_path: data/neurips2023/small-dataset/ZINC12k.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/ZINC12k.csv.gz - # or set path as the URL directly - smiles_col: "smiles" - label_cols: ["SA", "logp", "score"] - # sample_size: 2000 # use sample_size for test - splits_path: data/neurips2023/small-dataset/ZINC12k_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/ZINC12k_random_splits.pt` - seed: *seed - task_level: graph - label_normalization: - normalize_val_test: True - method: "normal" - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - 
edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - num_workers: 30 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. - - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: # Set as null to avoid a pre-nn network - out_dim: 64 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.18 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: null # Set as null to avoid a pre-nn network - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - - - gnn: # Set as null to avoid a post-nn network - layer_type: 'pyg:gcn' - in_dim: 64 # or otherwise the correct value - out_dim: &gnn_dim 96 - hidden_dims: *gnn_dim - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'none' - layer_kwargs: null # Parameters for the model itself. 
You could define dropout_attn: 0.1 - - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - qm9: - task_level: graph - out_dim: 19 - hidden_dims: 128 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - tox21: - task_level: graph - out_dim: 12 - hidden_dims: 64 - depth: 2 - activation: relu - last_activation: sigmoid - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - zinc: - task_level: graph - out_dim: 3 - hidden_dims: 32 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - -#Task-specific -predictor: - metrics_on_progress_bar: - qm9: ["mae"] - tox21: ["auroc"] - zinc: ["mae"] - loss_fun: - qm9: mae_ipu - tox21: bce_ipu - zinc: mae_ipu - random_seed: *seed - optim_kwargs: - lr: 4.e-5 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - qm9: &qm9_metrics - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: flatten - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2_score - metric: r2_score_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - tox21: - - name: auroc - metric: auroc_ipu - task: binary - multitask_handling: mean-per-label - threshold_kwargs: null - - name: avpr - metric: average_precision_ipu - task: binary - multitask_handling: mean-per-label - threshold_kwargs: null - - name: f1 > 0.5 - metric: f1 - multitask_handling: mean-per-label - target_to_int: True - num_classes: 2 - average: micro - threshold_kwargs: &threshold_05 - operator: greater - threshold: 0.5 - th_on_preds: True - th_on_target: True - - name: precision > 0.5 - metric: precision - multitask_handling: mean-per-label - average: micro - threshold_kwargs: *threshold_05 - zinc: *qm9_metrics - -trainer: - seed: *seed - logger: - save_dir: logs/neurips2023-small/ - name: ${constants.name} - project: ${constants.name} - #early_stopping: - # monitor: *monitor - # min_delta: 0 - # patience: 10 - # mode: &mode min - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: *max_epochs - min_epochs: 1 - check_val_every_n_epoch: 20 From 98d3d5be016bc6d86dbe8d971f6e685f209b0e2d Mon Sep 17 00:00:00 2001 From: kerstink-GC Date: Mon, 18 Sep 2023 09:54:43 +0000 Subject: [PATCH 19/58] added 50/100/200/400M configs for foundation model --- .../100M_config_LargeMix_mpnn_GPS++.yaml | 462 +++++++++++++++++ .../200M_config_LargeMix_mpnn_GPS++.yaml | 462 +++++++++++++++++ .../400M_config_LargeMix_mpnn_GPS++.yaml | 463 ++++++++++++++++++ 
.../50M_config_LargeMix_mpnn_GPS++.yaml | 462 +++++++++++++++++ 4 files changed, 1849 insertions(+) create mode 100644 expts/foundation_model/100M_config_LargeMix_mpnn_GPS++.yaml create mode 100644 expts/foundation_model/200M_config_LargeMix_mpnn_GPS++.yaml create mode 100644 expts/foundation_model/400M_config_LargeMix_mpnn_GPS++.yaml create mode 100644 expts/foundation_model/50M_config_LargeMix_mpnn_GPS++.yaml diff --git a/expts/foundation_model/100M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/100M_config_LargeMix_mpnn_GPS++.yaml new file mode 100644 index 000000000..c8eefd4d2 --- /dev/null +++ b/expts/foundation_model/100M_config_LargeMix_mpnn_GPS++.yaml @@ -0,0 +1,462 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/neurips2023-large/" + epochs: 100 + name: LargeMix_mpnn_100M + wandb: + entity: multitask-gnn + name: LargeMix_mpnn_100M + project: neurips2023_graphcore_scaling_mpnn + + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 50 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 100 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 55 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 100 + # Data handling-related + batch_size_training: 12 + batch_size_inference: 12 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 12 + + ipu_config: + - deviceIterations(30) # IPU would require large batches to be ready for the model. + - replicationFactor(4) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [4,4,4,4] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
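As a rough aside on the accelerator block above: the numbers it sets for the 100M run (micro-batch size 12, deviceIterations(30), replicationFactor(4), accumulate_grad_batches 12, and gnn_layers_per_ipu: [4, 4, 4, 4]) compose multiplicatively. The sketch below is an illustrative, self-contained Python snippet; the helper names are hypothetical and not part of Graphium or PopTorch. It only shows the resulting samples consumed per optimizer step and how a 16-layer GNN would be split into contiguous pipeline stages.

    # Illustrative sketch only; these helpers are hypothetical, not Graphium/PopTorch APIs.

    def effective_batch_size(micro_batch, device_iterations, replication, grad_accum):
        # Samples consumed per optimizer step = product of all four multipliers.
        return micro_batch * device_iterations * replication * grad_accum

    def partition_layers(num_layers, layers_per_ipu):
        # Split layer indices 0..num_layers-1 into contiguous chunks, one chunk per IPU.
        assert sum(layers_per_ipu) == num_layers, "pipeline split must cover all layers"
        chunks, start = [], 0
        for n in layers_per_ipu:
            chunks.append(list(range(start, start + n)))
            start += n
        return chunks

    if __name__ == "__main__":
        # Numbers taken from the 100M config above.
        print(effective_batch_size(12, 30, 4, 12))   # 17280 graphs per weight update
        print(partition_layers(16, [4, 4, 4, 4]))    # four contiguous blocks of 4 layers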
+ task_specific_args: # To be replaced by a new class "DatasetParams" + l1000_vcap: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` + epoch_sampling_fraction: 1.0 + + l1000_mcf7: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcba_1328: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: assayID-* # assayID-* means all columns starting with "assayID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcqm4m_g25: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: graph_* # graph_* means all columns starting with "graph_" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + pcqm4m_n4: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: node_* # node_* means all columns starting with "node_" + # sample_size: 2000 # use sample_size for test + task_level: node + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + seed: 42 + label_normalization: + normalize_val_test: True + 
method: "normal" + epoch_sampling_fraction: 1.0 + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: disk + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. + +#Task-specific +predictor: + metrics_on_progress_bar: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + metrics_on_training_set: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + loss_fun: + l1000_vcap: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: 0.5 + l1000_mcf7: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: ${predictor.loss_fun.l1000_vcap.alpha} + pcba_1328: bce_logits_ipu + pcqm4m_g25: mae_ipu + pcqm4m_n4: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: &max_epochs 100 + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + l1000_vcap: &classif_metrics + - name: auroc + metric: auroc + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + - name: avpr + metric: averageprecision + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + l1000_mcf7: *classif_metrics + pcba_1328: + # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly + - name: auroc + metric: auroc + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + - name: avpr + metric: averageprecision + task: binary + multitask_handling: 
mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + pcqm4m_g25: &pcqm_metrics + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2 + metric: r2_score_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + pcqm4m_n4: *pcqm_metrics + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/neurips2023-large/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} + min_epochs: 1 + check_val_every_n_epoch: 20 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: &gnn_dim 428 + hidden_dims: 1024 + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: 128 + hidden_dims: 512 + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: *gnn_dim + hidden_dims: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + l1000_vcap: + task_level: graph + out_dim: 2934 + hidden_dims: 128 + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + l1000_mcf7: + task_level: graph + out_dim: 2934 + hidden_dims: 128 + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcba_1328: + task_level: graph + out_dim: 1328 + hidden_dims: 64 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_g25: + task_level: graph + out_dim: 25 + hidden_dims: 32 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_n4: + task_level: node + out_dim: 4 + hidden_dims: 32 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + + # task_heads: + # homolumo: + # task_level: graph + # out_dim: 1 + # hidden_dims: 256 + # depth: 2 + # activation: relu + # last_activation: none + # dropout: *dropout + # normalization: *normalization + # last_normalization: "none" + # residual_type: none diff --git a/expts/foundation_model/200M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/200M_config_LargeMix_mpnn_GPS++.yaml new file mode 100644 index 
000000000..298d68109 --- /dev/null +++ b/expts/foundation_model/200M_config_LargeMix_mpnn_GPS++.yaml @@ -0,0 +1,462 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/neurips2023-large/" + epochs: 100 + name: LargeMix_mpnn_200M + wandb: + entity: multitask-gnn + name: LargeMix_mpnn_200M + project: neurips2023_graphcore_scaling_mpnn + + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 116 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 116 + # Data handling-related + batch_size_training: 8 + batch_size_inference: 8 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 16 + + ipu_config: + - deviceIterations(30) # IPU would require large batches to be ready for the model. + - replicationFactor(2) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
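The label_cols entries used throughout these configs (geneID-*, assayID-*, graph_*, node_*) are glob-style patterns expanded against the dataset's column names. The following is a hedged sketch of that expansion; expand_label_cols is a hypothetical helper written here for illustration, not the actual MultitaskFromSmilesDataModule logic, and it assumes pandas-style columns.

    # Hedged sketch: resolve a glob-style label_cols spec against a dataframe's columns.
    from fnmatch import fnmatch
    import pandas as pd

    def expand_label_cols(df: pd.DataFrame, spec) -> list:
        # A plain list is kept as-is; a string is treated as a glob pattern.
        if isinstance(spec, (list, tuple)):
            return list(spec)
        return [c for c in df.columns if fnmatch(c, spec)]

    df = pd.DataFrame(columns=["SMILES", "geneID-1", "geneID-2", "assayID-7"])
    print(expand_label_cols(df, "geneID-*"))      # ['geneID-1', 'geneID-2']
    print(expand_label_cols(df, ["assayID-7"]))   # ['assayID-7']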
+ task_specific_args: # To be replaced by a new class "DatasetParams" + l1000_vcap: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` + epoch_sampling_fraction: 1.0 + + l1000_mcf7: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcba_1328: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: assayID-* # assayID-* means all columns starting with "assayID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcqm4m_g25: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: graph_* # graph_* means all columns starting with "graph_" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + pcqm4m_n4: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: node_* # node_* means all columns starting with "node_" + # sample_size: 2000 # use sample_size for test + task_level: node + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + seed: 42 + label_normalization: + normalize_val_test: True + 
method: "normal" + epoch_sampling_fraction: 1.0 + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: disk + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. + +#Task-specific +predictor: + metrics_on_progress_bar: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + metrics_on_training_set: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + loss_fun: + l1000_vcap: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: 0.5 + l1000_mcf7: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: ${predictor.loss_fun.l1000_vcap.alpha} + pcba_1328: bce_logits_ipu + pcqm4m_g25: mae_ipu + pcqm4m_n4: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: &max_epochs 100 + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + l1000_vcap: &classif_metrics + - name: auroc + metric: auroc + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + - name: avpr + metric: averageprecision + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + l1000_mcf7: *classif_metrics + pcba_1328: + # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly + - name: auroc + metric: auroc + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + - name: avpr + metric: averageprecision + task: binary + multitask_handling: 
mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + pcqm4m_g25: &pcqm_metrics + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2 + metric: r2_score_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + pcqm4m_n4: *pcqm_metrics + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/neurips2023-large/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} + min_epochs: 1 + check_val_every_n_epoch: 20 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: &gnn_dim 628 + hidden_dims: 1024 + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: 128 + hidden_dims: 512 + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: *gnn_dim + hidden_dims: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + l1000_vcap: + task_level: graph + out_dim: 2934 + hidden_dims: 128 + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + l1000_mcf7: + task_level: graph + out_dim: 2934 + hidden_dims: 128 + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcba_1328: + task_level: graph + out_dim: 1328 + hidden_dims: 64 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_g25: + task_level: graph + out_dim: 25 + hidden_dims: 32 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_n4: + task_level: node + out_dim: 4 + hidden_dims: 32 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + + # task_heads: + # homolumo: + # task_level: graph + # out_dim: 1 + # hidden_dims: 256 + # depth: 2 + # activation: relu + # last_activation: none + # dropout: *dropout + # normalization: *normalization + # last_normalization: "none" + # residual_type: none diff --git a/expts/foundation_model/400M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/400M_config_LargeMix_mpnn_GPS++.yaml new file mode 100644 index 
000000000..08820d330 --- /dev/null +++ b/expts/foundation_model/400M_config_LargeMix_mpnn_GPS++.yaml @@ -0,0 +1,463 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/neurips2023-large/" + epochs: 100 + name: LargeMix_mpnn_400M + wandb: + entity: multitask-gnn + name: LargeMix_mpnn_400M + project: neurips2023_graphcore_scaling_mpnn + + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 70 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 150 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 70 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 150 + # Data handling-related + batch_size_training: 4 + batch_size_inference: 4 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 32 + + ipu_config: + - deviceIterations(30) # IPU would require large batches to be ready for the model. + - replicationFactor(1) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] + #gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
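A note on how the 50M/100M/200M/400M variants differ: the GNN depth stays at 16 while the hidden width (the &gnn_dim anchor) is raised from 282 to 428, 628 and 910. Since most weights sit in width-by-width linear layers, each step of roughly sqrt(2) in width gives roughly 2x the parameters. The check below is only a back-of-envelope approximation and ignores edge dimensions, PE encoders and task heads.

    # Back-of-envelope sketch, not an exact parameter count.
    widths = {"50M": 282, "100M": 428, "200M": 628, "400M": 910}

    pairs = list(widths.items())
    for (a, da), (b, db) in zip(pairs[:-1], pairs[1:]):
        ratio = (db / da) ** 2  # quadratic growth of d x d weight matrices
        print(f"{a} -> {b}: width x{db / da:.2f}, ~{ratio:.1f}x parameters")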
+ task_specific_args: # To be replaced by a new class "DatasetParams" + l1000_vcap: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` + epoch_sampling_fraction: 1.0 + + l1000_mcf7: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcba_1328: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: assayID-* # assayID-* means all columns starting with "assayID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcqm4m_g25: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: graph_* # graph_* means all columns starting with "graph_" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + pcqm4m_n4: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: node_* # node_* means all columns starting with "node_" + # sample_size: 2000 # use sample_size for test + task_level: node + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + seed: 42 + label_normalization: + normalize_val_test: True + 
method: "normal" + epoch_sampling_fraction: 1.0 + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: disk + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. + +#Task-specific +predictor: + metrics_on_progress_bar: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + metrics_on_training_set: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + loss_fun: + l1000_vcap: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: 0.5 + l1000_mcf7: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: ${predictor.loss_fun.l1000_vcap.alpha} + pcba_1328: bce_logits_ipu + pcqm4m_g25: mae_ipu + pcqm4m_n4: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: &max_epochs 100 + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + l1000_vcap: &classif_metrics + - name: auroc + metric: auroc + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + - name: avpr + metric: averageprecision + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + l1000_mcf7: *classif_metrics + pcba_1328: + # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly + - name: auroc + metric: auroc + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + - name: avpr + metric: averageprecision + task: binary + multitask_handling: 
mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + pcqm4m_g25: &pcqm_metrics + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2 + metric: r2_score_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + pcqm4m_n4: *pcqm_metrics + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/neurips2023-large/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} + min_epochs: 1 + check_val_every_n_epoch: 20 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: &gnn_dim 910 + hidden_dims: 1024 + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: 128 + hidden_dims: 512 + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: *gnn_dim + hidden_dims: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + l1000_vcap: + task_level: graph + out_dim: 2934 + hidden_dims: 128 + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + l1000_mcf7: + task_level: graph + out_dim: 2934 + hidden_dims: 128 + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcba_1328: + task_level: graph + out_dim: 1328 + hidden_dims: 64 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_g25: + task_level: graph + out_dim: 25 + hidden_dims: 32 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_n4: + task_level: node + out_dim: 4 + hidden_dims: 32 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + + # task_heads: + # homolumo: + # task_level: graph + # out_dim: 1 + # hidden_dims: 256 + # depth: 2 + # activation: relu + # last_activation: none + # dropout: *dropout + # normalization: *normalization + # last_normalization: "none" + # residual_type: none diff --git a/expts/foundation_model/50M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/50M_config_LargeMix_mpnn_GPS++.yaml new file mode 100644 index 000000000..b75d7d2e0 
--- /dev/null +++ b/expts/foundation_model/50M_config_LargeMix_mpnn_GPS++.yaml @@ -0,0 +1,462 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/neurips2023-large/" + epochs: 100 + name: LargeMix_mpnn_50M + wandb: + entity: multitask-gnn + name: LargeMix_mpnn_50M + project: neurips2023_graphcore_scaling_mpnn + + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 40 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 100 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 45 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 100 + # Data handling-related + batch_size_training: 16 + batch_size_inference: 16 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 16 + + ipu_config: + - deviceIterations(30) # IPU would require large batches to be ready for the model. + - replicationFactor(4) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [4,4,4,4] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
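The label_normalization blocks with method: "normal" and normalize_val_test: True standardise the regression labels. A minimal sketch of the intended behaviour, assuming per-label z-scoring with training-split statistics reused on validation/test (the details may differ from Graphium's implementation), is shown below.

    # Hedged sketch of "normal" label normalization; not the exact Graphium code.
    import numpy as np

    def fit_normal_stats(train_labels: np.ndarray):
        # Per-label mean/std computed on the training split only (NaN-safe).
        return np.nanmean(train_labels, axis=0), np.nanstd(train_labels, axis=0)

    def apply_normal(labels: np.ndarray, mean, std):
        return (labels - mean) / np.where(std > 0, std, 1.0)

    train = np.array([[1.0, 10.0], [3.0, 30.0], [5.0, 50.0]])
    val = np.array([[2.0, 20.0]])
    mean, std = fit_normal_stats(train)
    print(apply_normal(val, mean, std))  # validation labels scaled with *train* statistics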
+ task_specific_args: # To be replaced by a new class "DatasetParams" + l1000_vcap: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` + epoch_sampling_fraction: 1.0 + + l1000_mcf7: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcba_1328: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: assayID-* # assayID-* means all columns starting with "assayID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcqm4m_g25: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: graph_* # graph_* means all columns starting with "graph_" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + pcqm4m_n4: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: node_* # node_* means all columns starting with "node_" + # sample_size: 2000 # use sample_size for test + task_level: node + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + seed: 42 + label_normalization: + normalize_val_test: True + 
method: "normal" + epoch_sampling_fraction: 1.0 + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: disk + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. + +#Task-specific +predictor: + metrics_on_progress_bar: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + metrics_on_training_set: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + loss_fun: + l1000_vcap: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: 0.5 + l1000_mcf7: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: ${predictor.loss_fun.l1000_vcap.alpha} + pcba_1328: bce_logits_ipu + pcqm4m_g25: mae_ipu + pcqm4m_n4: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: &max_epochs 100 + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + l1000_vcap: &classif_metrics + - name: auroc + metric: auroc + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + - name: avpr + metric: averageprecision + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + l1000_mcf7: *classif_metrics + pcba_1328: + # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly + - name: auroc + metric: auroc + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + - name: avpr + metric: averageprecision + task: binary + multitask_handling: 
mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + pcqm4m_g25: &pcqm_metrics + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2 + metric: r2_score_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + pcqm4m_n4: *pcqm_metrics + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/neurips2023-large/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} + min_epochs: 1 + check_val_every_n_epoch: 20 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: &gnn_dim 282 + hidden_dims: 1024 + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: 128 + hidden_dims: 512 + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: *gnn_dim + hidden_dims: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: *gnn_dim + hidden_dims: *gnn_dim + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + l1000_vcap: + task_level: graph + out_dim: 2934 + hidden_dims: 128 + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + l1000_mcf7: + task_level: graph + out_dim: 2934 + hidden_dims: 128 + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcba_1328: + task_level: graph + out_dim: 1328 + hidden_dims: 64 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_g25: + task_level: graph + out_dim: 25 + hidden_dims: 32 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_n4: + task_level: node + out_dim: 4 + hidden_dims: 32 + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + + # task_heads: + # homolumo: + # task_level: graph + # out_dim: 1 + # hidden_dims: 256 + # depth: 2 + # activation: relu + # last_activation: none + # dropout: *dropout + # normalization: *normalization + # last_normalization: "none" + # residual_type: none From 6d7f18b26f0031d3f78e86c187c05890397126ec Mon Sep 17 00:00:00 2001 From: dominique Date: Mon, 18 Sep 2023 21:11:34 -0400 Subject: [PATCH 20/58] Added toymix baselines of 
GatedGCN and MPNN++ --- docs/baseline.md | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/docs/baseline.md b/docs/baseline.md index 029996554..8c86ca067 100644 --- a/docs/baseline.md +++ b/docs/baseline.md @@ -11,18 +11,24 @@ One can observe that the smaller datasets (`Zinc12k` and `Tox21`) beneficiate fr | **QM9** | GCN | 0.102 ± 0.0003 | 0.958 ± 0.0007 | 0.920 ± 0.002 | 0.119 ± 0.01 | 0.955 ± 0.001 | 0.915 ± 0.001 | | | GIN | 0.0976 ± 0.0006 | **0.959 ± 0.0002** | **0.922 ± 0.0004** | 0.117 ± 0.01 | 0.950 ± 0.002 | 0.908 ± 0.003 | | | GINE | **0.0959 ± 0.0002** | 0.955 ± 0.002 | 0.918 ± 0.004 | 0.102 ± 0.01 | 0.956 ± 0.0009 | 0.918 ± 0.002 | -| -| **Zinc12k** | GCN | 0.348 ± 0.02 | 0.941 ± 0.002 | 0.863 ± 0.01 | 0.226 ± 0.004 | 0.973 ± 0.0005 | 0.940 ± 0.003 | +| | GatedGCN | | | | 0.1212 ± 0.0009 | 0.9457 ± 0.0002 | 0.8964 ± 0.0006 | +| | MPNN++ (sum) | | | | 0.1174 ± 0.0012 | 0.9460 ± 0.0005 | 0.8989 ± 0.0008 | + **Zinc12k** | GCN | 0.348 ± 0.02 | 0.941 ± 0.002 | 0.863 ± 0.01 | 0.226 ± 0.004 | 0.973 ± 0.0005 | 0.940 ± 0.003 | | | GIN | 0.303 ± 0.007 | 0.950 ± 0.003 | 0.889 ± 0.003 | 0.189 ± 0.004 | 0.978 ± 0.006 | 0.953 ± 0.002 | -| | GINE | 0.266 ± 0.02 | 0.961 ± 0.003 | 0.915 ± 0.01 | **0.147 ± 0.009** | **0.987 ± 0.001** | **0.971 ± 0.003** | +| | GINE | 0.266 ± 0.02 | 0.961 ± 0.003 | 0.915 ± 0.01 | 0.147 ± 0.009 | 0.987 ± 0.001 | 0.971 ± 0.003 | +| | GatedGCN | | | | 0.1282 ± 0.0045 | 0.9850 ± 0.0006 | 0.9639 ± 0.0024 | +| | MPNN++ (sum) | | | | **0.1002 ± 0.0025** | **0.9909 ± 0.0004** | **0.9777 ± 0.0014** | | | | BCE ↓ | AUROC ↑ | AP ↑ | BCE ↓ | AUROC ↑ | AP ↑ | |-----------|-------|-----------|-----------|-----------|---------|-----------|---------| | | Single-Task Model Multi-Task Model | | -| **Tox21** | GCN | 0.202 ± 0.005 | 0.773 ± 0.006 | 0.334 ± 0.03 | **0.176 ± 0.001** | **0.850 ± 0.006** | 0.446 ± 0.01 | +| **Tox21** | GCN | 0.202 ± 0.005 | 0.773 ± 0.006 | 0.334 ± 0.03 | 0.176 ± 0.001 | 0.850 ± 0.006 | 0.446 ± 0.01 | | | GIN | 0.200 ± 0.002 | 0.789 ± 0.009 | 0.350 ± 0.01 | 0.176 ± 0.001 | 0.841 ± 0.005 | 0.454 ± 0.009 | -| | GINE | 0.201 ± 0.007 | 0.783 ± 0.007 | 0.345 ± 0.02 | 0.177 ± 0.0008 | 0.836 ± 0.004 | **0.455 ± 0.008** | +| | GINE | 0.201 ± 0.007 | 0.783 ± 0.007 | 0.345 ± 0.02 | 0.177 ± 0.0008 | 0.836 ± 0.004 | 0.455 ± 0.008 | +| | GatedGCN | | | | 0.1733 ± 0.0015 | 0.8522 ± 0.0022 | **0.4620 ± 0.0118** | +| | MPNN++ (sum) | | | | **0.1725 ± 0.0012** | **0.8569 ± 0.0005** | 0.4598 ± 0.0044 | + # LargeMix Baseline ## LargeMix test set metrics From 01c40fc15aee049e5b6d4add8a1056a087cc85df Mon Sep 17 00:00:00 2001 From: dominique Date: Mon, 18 Sep 2023 21:23:40 -0400 Subject: [PATCH 21/58] Adapted text for new baselines --- docs/baseline.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/baseline.md b/docs/baseline.md index 8c86ca067..6ddb0b86c 100644 --- a/docs/baseline.md +++ b/docs/baseline.md @@ -4,6 +4,8 @@ From the paper to be released soon. Below, you can see the baselines for the `To One can observe that the smaller datasets (`Zinc12k` and `Tox21`) beneficiate from adding another unrelated task (`QM9`), where the labels are computed from DFT simulations. +**NEW baselines added 2023/09/18**: Multitask baselines have been added for GatedGCN and MPNN++ (sum aggretator) using 3 random seeds. They achieve the best performance by a significant margin on Zinc12k and Tox21, while sacrificing a little on QM9. 
+
+
 | Dataset | Model | MAE ↓ | Pearson ↑ | R² ↑ | MAE ↓ | Pearson ↑ | R² ↑ |
 |-----------|-------|-----------|-----------|-----------|---------|-----------|---------|
 | | Single-Task Model Multi-Task Model |

From e2551f26126d91f8d4a6f74f375d64b6a32f7d42 Mon Sep 17 00:00:00 2001
From: dominique
Date: Mon, 18 Sep 2023 22:03:44 -0400
Subject: [PATCH 22/58] Added new largemix baselines

---
 docs/baseline.md | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/docs/baseline.md b/docs/baseline.md
index 6ddb0b86c..cac1ee282 100644
--- a/docs/baseline.md
+++ b/docs/baseline.md
@@ -96,6 +96,40 @@ This is not surprising as they contain two orders of magnitude more datapoints a
 | | GIN | 0.1873 ± 0.0033 | **0.1701 ± 0.0142** |
 | | GINE | 0.1883 ± 0.0039 | **0.1771 ± 0.0010** |
+## NEW: Largemix improved sweep - 2023/08-18
+
+Unsatisfied with the prior results, we ran a Bayesian search over a broader set of parameters, including only the more expressive models, namely GINE, GatedGCN and MPNN++. We further increased the number of parameters to 10M due to evidence of underfitting. We evaluate only the multitask setting.
+
+We observe a significant improvement over all tasks, with a very notable R² increase of +0.53 (0.27 -> 0.80) over the best previous node-level property prediction result on PCQM4M_N4.
+
+The results reported below are over a single seed. We are currently running more seeds of the same models.
+
+| Dataset | Model | MAE ↓ | Pearson ↑ | R² ↑ |
+|---------------|----------------|--------|---------|--------|
+| **PCQM4M_G25** | GINE | 0.2250 | 0.8840 | 0.7911 |
+| | GatedGCN | 0.2457 | 0.8698 | 0.7688 |
+| | MPNN++ (sum) | 0.2269 | 0.8802 | 0.7855 |
+|
+| **PCQM4M_N4** | GINE | 0.2699 | 0.8475 | 0.7182 |
+| | GatedGCN | 0.3337 | 0.8102 | 0.6566 |
+| | MPNN++ (sum) | 0.2114 | 0.8942 | 0.8000 |
+
+| Dataset | Model | BCE ↓ | AUROC ↑ | AP ↑ |
+|---------------|----------------|--------|---------|--------|
+| **PCBA_1328** | GINE | 0.0334 | 0.7879 | 0.2808 |
+| | GatedGCN | 0.0351 | 0.7788 | 0.2611 |
+| | MPNN++ (sum) | 0.0344 | 0.7815 | 0.2666 |
+|
+| **L1000_VCAP** | GINE | 0.1907 | 0.6416 | 0.4042 |
+| | GatedGCN | 0.1866 | 0.6395 | 0.4092 |
+| | MPNN++ (sum) | 0.1867 | 0.6478 | 0.4131 |
+|
+| **L1000_MCF7** | GINE | 0.1931 | 0.6352 | 0.4235 |
+| | GatedGCN | 0.1859 | 0.6547 | 0.4224 |
+| | MPNN++ (sum) | 0.1870 | 0.6593 | 0.4254 |
+
+
+
 # UltraLarge Baseline
 ## UltraLarge test set metrics

From 1188f97651bfcb4c2a46ecaab4e288c726f8427c Mon Sep 17 00:00:00 2001
From: Maciej Sypetkowski
Date: Fri, 22 Sep 2023 08:40:52 -0600
Subject: [PATCH 23/58] Minor fixes

---
 graphium/cli/train_finetune_test.py | 3 +++
 graphium/nn/architectures/encoder_manager.py | 2 ++
 graphium/nn/architectures/global_architectures.py | 13 +++++++++++--
 graphium/nn/encoders/laplace_pos_encoder.py | 5 +++--
 graphium/nn/pyg_layers/gps_pyg.py | 6 ++++--
 graphium/nn/pyg_layers/mpnn_pyg.py | 7 ++++---
 graphium/utils/spaces.py | 5 ++++-
 7 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/graphium/cli/train_finetune_test.py b/graphium/cli/train_finetune_test.py
index ffb5a7512..885839ec0 100644
--- a/graphium/cli/train_finetune_test.py
+++ b/graphium/cli/train_finetune_test.py
@@ -5,6 +5,7 @@
 import fsspec
 import hydra
+import numpy as np
 import torch
 import wandb
 import yaml
@@ -40,6 +41,8 @@
 TESTING_ONLY_CONFIG_KEY = "testing_only"
+OmegaConf.register_new_resolver("eval", lambda x: eval(x, {"np": np}))
+
 @hydra.main(version_base=None, config_path="../../expts/hydra-configs", config_name="main")
def cli(cfg: DictConfig) -> None: diff --git a/graphium/nn/architectures/encoder_manager.py b/graphium/nn/architectures/encoder_manager.py index e3e48aeba..464d9e9cc 100644 --- a/graphium/nn/architectures/encoder_manager.py +++ b/graphium/nn/architectures/encoder_manager.py @@ -135,6 +135,8 @@ def _initialize_positional_encoders(self, pe_encoders_kwargs: Dict[str, Any]) -> if pe_out_dim2 is not None: assert edge_pe_out_dim == pe_out_dim2, f"values mismatch {pe_out_dim}!={pe_out_dim2}" pe_encoders[encoder_name] = encoder(out_dim=edge_pe_out_dim, **this_in_dims, **encoder_kwargs) + else: + pe_encoders[encoder_name] = encoder(**this_in_dims, **encoder_kwargs) return pe_encoders diff --git a/graphium/nn/architectures/global_architectures.py b/graphium/nn/architectures/global_architectures.py index a90a2cba9..75dfa7b38 100644 --- a/graphium/nn/architectures/global_architectures.py +++ b/graphium/nn/architectures/global_architectures.py @@ -422,6 +422,7 @@ def __init__( residual_skip_steps: int = 1, in_dim_edges: int = 0, hidden_dims_edges: List[int] = [], + out_dim_edges: int = 0, name: str = "GNN", layer_kwargs: Optional[Dict] = None, virtual_node: str = "none", @@ -509,6 +510,10 @@ def __init__( Hidden dimensions for the edges. Most models don't support it, so it should only be used for those that do, i.e. `GatedGCNLayer` + out_dim_edges: + Output edge-feature dimensions of the network. Keep at 0 if not using + edge features, or if the layer doesn't support edges. + name: Name attributed to the current network, for display and printing purposes. @@ -545,6 +550,7 @@ def __init__( # Initialize the additional attributes self.in_dim_edges = in_dim_edges + self.out_dim_edges = out_dim_edges if isinstance(hidden_dims_edges, int): self.hidden_dims_edges = [hidden_dims_edges] * (depth - 1) elif len(hidden_dims_edges) == 0: @@ -553,8 +559,9 @@ def __init__( self.hidden_dims_edges = list(hidden_dims_edges) assert depth is None self.full_dims_edges = None - if len(self.hidden_dims_edges) > 0: - self.full_dims_edges = [self.in_dim_edges] + self.hidden_dims_edges + [self.hidden_dims_edges[-1]] + if len(self.hidden_dims_edges) or out_dim_edges > 0: + assert out_dim_edges > 0, out_dim_edges + self.full_dims_edges = [self.in_dim_edges] + self.hidden_dims_edges + [out_dim_edges] self.virtual_node = virtual_node.lower() if virtual_node is not None else "none" @@ -922,6 +929,7 @@ def get_init_kwargs(self) -> Dict[str, Any]: new_kwargs = dict( in_dim_edges=self.in_dim_edges, hidden_dims_edges=self.hidden_dims_edges, + out_dim_edges=self.out_dim_edges, virtual_node=self.virtual_node, use_virtual_edges=self.use_virtual_edges, ) @@ -953,6 +961,7 @@ def make_mup_base_kwargs( kwargs["in_dim_edges"] = round(kwargs["in_dim_edges"] / divide_factor) if not self.last_layer_is_readout: kwargs["out_dim"] = round(kwargs["out_dim"] / divide_factor) + kwargs["out_dim_edges"] = round(kwargs["out_dim_edges"] / divide_factor) def _recursive_divide_dim(x: collections.abc.Mapping): for k, v in x.items(): diff --git a/graphium/nn/encoders/laplace_pos_encoder.py b/graphium/nn/encoders/laplace_pos_encoder.py index ccf642e9d..7cc69919b 100644 --- a/graphium/nn/encoders/laplace_pos_encoder.py +++ b/graphium/nn/encoders/laplace_pos_encoder.py @@ -3,7 +3,7 @@ import torch.nn as nn from torch_geometric.data import Batch -from graphium.nn.base_layers import MLP, get_norm, FCLayer +from graphium.nn.base_layers import MLP, get_norm, FCLayer, TransformerEncoderLayerMup from graphium.nn.encoders.base_encoder import BaseEncoder @@ -70,7 +70,8 @@ 
def __init__( if self.model_type == "Transformer": # Transformer model for LapPE model_kwargs.setdefault("nhead", 1) - encoder_layer = nn.TransformerEncoderLayer( + encoder_layer = TransformerEncoderLayerMup( + None, d_model=hidden_dim, batch_first=True, dropout=dropout, diff --git a/graphium/nn/pyg_layers/gps_pyg.py b/graphium/nn/pyg_layers/gps_pyg.py index f3da56979..b82fad782 100644 --- a/graphium/nn/pyg_layers/gps_pyg.py +++ b/graphium/nn/pyg_layers/gps_pyg.py @@ -240,7 +240,7 @@ def forward(self, batch: Batch) -> Batch: def _parse_mpnn_layer(self, mpnn_type, mpnn_kwargs: Dict[str, Any]) -> Optional[Module]: """Parse the MPNN layer.""" - if mpnn_type is None: + if mpnn_type is None or mpnn_type == "none": return mpnn_kwargs = deepcopy(mpnn_kwargs) @@ -375,7 +375,7 @@ def _self_attention_block(self, feat: Tensor, feat_in: Tensor, batch: Batch) -> ) attn_bias = None - if self.biased_attention_key is not None: + if self.biased_attention_key is not None and self.biased_attention_key != 'none': attn_bias = batch[self.biased_attention_key] # h_dense[num_graphs, max_num_nodes, hidden_dim] -> feat_attn[num_graphs, max_num_nodes, hidden_dim] @@ -463,6 +463,8 @@ def layer_outputs_edges(self) -> bool: bool: Always ``False`` for the current class """ + if self.mpnn is None: + return False return self.mpnn.layer_outputs_edges @property diff --git a/graphium/nn/pyg_layers/mpnn_pyg.py b/graphium/nn/pyg_layers/mpnn_pyg.py index e11e0c9f3..73118c8f7 100644 --- a/graphium/nn/pyg_layers/mpnn_pyg.py +++ b/graphium/nn/pyg_layers/mpnn_pyg.py @@ -130,14 +130,15 @@ def __init__( self.num_edge_mlp = num_edge_mlp self.edge_dropout_rate = edge_dropout_rate - self.aggregator = MultiAggregation(aggregation_method) + self.aggregator = MultiAggregation(list(aggregation_method)) + n_agg = len(aggregation_method) # node_model: edge_dim = self.out_dim_edges if use_edges else self.in_dim_edges if self.node_combine_method == "concat": - node_model_in_dim = 3 * self.in_dim + 2 * edge_dim + node_model_in_dim = (1 + 2 * n_agg) * self.in_dim + 2 * n_agg * edge_dim elif self.node_combine_method == "sum": - node_model_in_dim = 2 * self.in_dim + edge_dim + node_model_in_dim = (1 + n_agg) * self.in_dim + n_agg * edge_dim else: raise ValueError(f"node_combine_method {self.node_combine_method} not recognised.") node_model_hidden_dim = self.mlp_expansion_ratio * self.in_dim diff --git a/graphium/utils/spaces.py b/graphium/utils/spaces.py index 8ba7c4505..d821223a4 100644 --- a/graphium/utils/spaces.py +++ b/graphium/utils/spaces.py @@ -35,6 +35,7 @@ "pyg:pna-msgpass": PygLayers.PNAMessagePassingPyg, "pyg:gps": PygLayers.GPSLayerPyg, "pyg:dimenet": PygLayers.DimeNetPyg, + "pyg:mpnnplus": PygLayers.MPNNPlusPyg, } LAYERS_DICT = deepcopy(FC_LAYERS_DICT) @@ -51,6 +52,8 @@ } LOSS_DICT = { + "bce": torch.nn.BCELoss, + "bce_logits": torch.nn.BCEWithLogitsLoss, "mse": torch.nn.MSELoss, "bce": torch.nn.BCELoss, "l1": torch.nn.L1Loss, @@ -105,7 +108,7 @@ "msle": TorchMetrics.mean_squared_log_error, "pearsonr": TorchMetrics.pearson_corrcoef, "spearmanr": TorchMetrics.spearman_corrcoef, - "r2": TorchMetrics.r2_score, + "r2_score": TorchMetrics.r2_score, "cosine": TorchMetrics.cosine_similarity, "pearsonr_ipu": Metrics.pearson_ipu, "spearmanr_ipu": Metrics.spearman_ipu, From b3b4b87ddf92a5f43a966445579544ec3362dea7 Mon Sep 17 00:00:00 2001 From: shenyangHuang Date: Sat, 23 Sep 2023 11:55:36 -0400 Subject: [PATCH 24/58] update dataset links --- docs/datasets.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/datasets.md 
b/docs/datasets.md
index fc4e0f292..6733736f4 100644
--- a/docs/datasets.md
+++ b/docs/datasets.md
@@ -1,6 +1,8 @@
 # Graphium Datasets
-Graphium datasets are hosted at on Zenodo on [this link](https://zenodo.org/record/8206704).
+Graphium datasets are hosted on Zenodo:
+- ***ToyMix*** and ***LargeMix*** datasets are hosted on [this link](https://doi.org/10.5281/zenodo.7998401)
+- ***UltraLarge*** dataset is hosted on [this link](https://doi.org/10.5281/zenodo.8370547)
 Instead of provinding datasets as a single entity, our aim is to provide dataset mixes containing a variety of datasets that are meant to be predicted simultaneously using multi-tasking.

From b34cb9ee3289ae162565dea0a4ccc5c32690dcd5 Mon Sep 17 00:00:00 2001
From: kerstink-GC
Date: Mon, 25 Sep 2023 09:44:11 +0000
Subject: [PATCH 25/58] added configs with scaling factor sections

---
 .../SF_11M_config_LargeMix_mpnn_GPS++.yaml | 476 ++++++++++++++++++
 .../SF_169M_config_LargeMix_mpnn_GPS++.yaml | 476 ++++++++++++++++++
 .../SF_378M_config_LargeMix_mpnn_GPS++.yaml | 476 ++++++++++++++++++
 .../SF_42M_config_LargeMix_mpnn_GPS++.yaml | 476 ++++++++++++++++++
 .../SF_671M_config_LargeMix_mpnn_GPS++.yaml | 476 ++++++++++++++++++
 5 files changed, 2380 insertions(+)
 create mode 100644 expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml
 create mode 100644 expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml
 create mode 100644 expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml
 create mode 100644 expts/foundation_model/SF_42M_config_LargeMix_mpnn_GPS++.yaml
 create mode 100644 expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml

diff --git a/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml
new file mode 100644
index 000000000..a1778670e
--- /dev/null
+++ b/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml
@@ -0,0 +1,476 @@
+# Running the mpnn model with the largemix dataset on IPU.
+
+# @package _global_
+
+constants:
+  seed: 42
+  raise_train_error: true   # Whether the code should raise an error if it crashes during training
+  datacache_path: "/localdata/neurips2023-large/"
+  epochs: 100
+  name: SF_11M_sweep_LargeMix_mpnn
+  wandb:
+    entity: multitask-gnn
+    name: SF_11M_sweep_LargeMix_mpnn
+    project: neurips2023_graphcore_scaling_mpnn
+
+  # This whole section is for minimizing mistakes for the scaling experiments.
+  # This is the ONLY place where dimensions have to change.
+  # No other dimensions have to be changed in the architecture part.
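
[Editor's note: the comment block above describes the intent of the `dimensions` section that follows: every width in the architecture refers back to it through OmegaConf interpolation (e.g. `out_dim: ${dimensions.pre_nn_out_dim}` further down in this file), and patch 23 registers an `eval` resolver in `graphium/cli/train_finetune_test.py`. Below is a minimal, self-contained Python sketch of both mechanisms; the `lr` expression is a hypothetical usage added for illustration only, it is not taken from these configs.]

import numpy as np
from omegaconf import OmegaConf

# Same resolver as the one registered in graphium/cli/train_finetune_test.py (patch 23).
OmegaConf.register_new_resolver("eval", lambda x: eval(x, {"np": np}))

cfg = OmegaConf.create(
    {
        # A width defined once in the `dimensions` block...
        "dimensions": {"pre_nn_out_dim": 256},
        # ...and re-used elsewhere via interpolation, as the SF_* configs do.
        "architecture": {"pre_nn": {"out_dim": "${dimensions.pre_nn_out_dim}"}},
        # Hypothetical use of the eval resolver (not present in these configs).
        "lr": "${eval:'1e-4 * np.sqrt(4)'}",
    }
)

assert cfg.architecture.pre_nn.out_dim == 256  # interpolation resolved on access
assert abs(float(cfg.lr) - 2e-4) < 1e-12       # eval resolver result
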
+ +dimensions: + pre_nn_out_dim: &gnn_dim 256 # original 256 + pre_nn_hidden_dims: 1024 # original 1024 + pre_nn_edges_out_dim: &gnn_dim_edges 128 # original 128 + pre_nn_edges_hidden_dims: 512 # original 512 + gnn_out_dim: *gnn_dim # original 256 + gnn_hidden_dims: *gnn_dim # original 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + mpnn_in_dim: *gnn_dim # original 256 + mpnn_out_dim: *gnn_dim # original 256 + mpnn_in_dim_edges: *gnn_dim_edges # original 128 + mpnn_out_dim_edges: *gnn_dim_edges # original 128 + graph_output_nn_out_dims: *gnn_dim # original 256 + graph_output_nn_hidden_dims: *gnn_dim # original 256 + node_output_nn_out_dims: *gnn_dim # original 256 + node_output_nn_hidden_dims: *gnn_dim # original 256 + l1000_vcap_hidden_dims: 128 # original 128 + l1000_mcf7_hidden_dims: 128 # original 128 + pcba_1328_hidden_dims: 64 # original 64 + pcqm4m_g25_hidden_dims: 32 # original 32 + pcqm4m_n4_hidden_dims: 32 # original 32 + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 100 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 100 + # Data handling-related + batch_size_training: 30 + batch_size_inference: 30 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 2 + + ipu_config: + - deviceIterations(30) # IPU would require large batches to be ready for the model. + - replicationFactor(16) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + # accelerator_kwargs: + #_accelerator: "ipu" + #gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] + #gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] + #gnn_layers_per_ipu: [4,4,4,4] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
+ task_specific_args: # To be replaced by a new class "DatasetParams" + l1000_vcap: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` + epoch_sampling_fraction: 1.0 + + l1000_mcf7: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcba_1328: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: assayID-* # assayID-* means all columns starting with "assayID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcqm4m_g25: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: graph_* # graph_* means all columns starting with "graph_" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + pcqm4m_n4: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: node_* # node_* means all columns starting with "node_" + # sample_size: 2000 # use sample_size for test + task_level: node + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + seed: 42 + label_normalization: + normalize_val_test: True + 
method: "normal" + epoch_sampling_fraction: 1.0 + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: disk + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. + +#Task-specific +predictor: + metrics_on_progress_bar: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + metrics_on_training_set: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + loss_fun: + l1000_vcap: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: 0.5 + l1000_mcf7: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: ${predictor.loss_fun.l1000_vcap.alpha} + pcba_1328: bce_logits_ipu + pcqm4m_g25: mae_ipu + pcqm4m_n4: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: &max_epochs 100 + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + l1000_vcap: &classif_metrics + - name: auroc + metric: auroc + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + - name: avpr + metric: averageprecision + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + l1000_mcf7: *classif_metrics + pcba_1328: + # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly + - name: auroc + metric: auroc + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + - name: avpr + metric: averageprecision + task: binary + multitask_handling: 
mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + pcqm4m_g25: &pcqm_metrics + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2 + metric: r2_score_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + pcqm4m_n4: *pcqm_metrics + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/neurips2023-large/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} + min_epochs: 1 + check_val_every_n_epoch: 20 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: ${dimensions.pre_nn_out_dim} + hidden_dims: ${dimensions.pre_nn_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: ${dimensions.pre_nn_edges_out_dim} + hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: ${dimensions.gnn_out_dim} + hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 4 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: ${dimensions.graph_output_nn_out_dims} + hidden_dims: ${dimensions.graph_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: ${dimensions.node_output_nn_out_dims} + hidden_dims: ${dimensions.node_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + l1000_vcap: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_vcap_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + l1000_mcf7: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcba_1328: + task_level: graph + out_dim: 1328 + hidden_dims: ${dimensions.pcba_1328_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_g25: + task_level: graph + out_dim: 25 + hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_n4: + task_level: node + out_dim: 4 + hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + diff --git 
a/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml new file mode 100644 index 000000000..a645870be --- /dev/null +++ b/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml @@ -0,0 +1,476 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/neurips2023-large/" + epochs: 100 + name: SF_169M_sweep_LargeMix_mpnn + wandb: + entity: multitask-gnn + name: SF_169M_sweep_LargeMix_mpnn + project: neurips2023_graphcore_scaling_mpnn + + # This whole sections is for minimizing mistakes for the scaling experiments. + # This is the ONLY place where dimensions ahve to change. + # No other dimensions have to be changed in the architecture part. + +dimensions: + pre_nn_out_dim: &gnn_dim 512 # original 256 + pre_nn_hidden_dims: 2048 # original 1024 + pre_nn_edges_out_dim: &gnn_dim_edges 256 # original 128 + pre_nn_edges_hidden_dims: 1024 # original 512 + gnn_out_dim: *gnn_dim # original 256 + gnn_hidden_dims: *gnn_dim # original 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + mpnn_in_dim: *gnn_dim # original 256 + mpnn_out_dim: *gnn_dim # original 256 + mpnn_in_dim_edges: *gnn_dim_edges # original 128 + mpnn_out_dim_edges: *gnn_dim_edges # original 128 + graph_output_nn_out_dims: *gnn_dim # original 256 + graph_output_nn_hidden_dims: *gnn_dim # original 256 + node_output_nn_out_dims: *gnn_dim # original 256 + node_output_nn_hidden_dims: *gnn_dim # original 256 + l1000_vcap_hidden_dims: 256 # original 128 + l1000_mcf7_hidden_dims: 256 # original 128 + pcba_1328_hidden_dims: 128 # original 64 + pcqm4m_g25_hidden_dims: 64 # original 32 + pcqm4m_n4_hidden_dims: 64 # original 32 + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 116 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 116 + # Data handling-related + batch_size_training: 8 + batch_size_inference: 8 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 16 + + ipu_config: + - deviceIterations(30) # IPU would require large batches to be ready for the model. 
+ - replicationFactor(2) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + #gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] + gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] + #gnn_layers_per_ipu: [4,4,4,4] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. + task_specific_args: # To be replaced by a new class "DatasetParams" + l1000_vcap: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` + epoch_sampling_fraction: 1.0 + + l1000_mcf7: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcba_1328: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: assayID-* # assayID-* means all columns starting with "assayID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcqm4m_g25: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: graph_* # graph_* means all columns starting with "graph_" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: 
graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + pcqm4m_n4: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: node_* # node_* means all columns starting with "node_" + # sample_size: 2000 # use sample_size for test + task_level: node + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + seed: 42 + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: disk + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. 
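
[Editor's note: the task configs above select labels with wildcards such as `geneID-*`, `assayID-*`, `graph_*` and `node_*`, which the inline comments describe as prefix matches over the dataset columns. The snippet below is a generic pandas illustration of that prefix semantics only, not graphium's actual loader, and the tiny data frame is made up.]

import pandas as pd

# Made-up stand-in for one of the label tables referenced above.
df = pd.DataFrame(
    {
        "SMILES": ["CCO", "c1ccccc1"],
        "geneID-1": [0.1, -1.2],
        "geneID-2": [1.0, 0.3],
        "split": ["train", "val"],
    }
)

# "geneID-*" -> every column whose name starts with "geneID-"
label_cols = [c for c in df.columns if c.startswith("geneID-")]
labels = df[label_cols]
print(label_cols)    # ['geneID-1', 'geneID-2']
print(labels.shape)  # (2, 2)
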
+ +#Task-specific +predictor: + metrics_on_progress_bar: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + metrics_on_training_set: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + loss_fun: + l1000_vcap: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: 0.5 + l1000_mcf7: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: ${predictor.loss_fun.l1000_vcap.alpha} + pcba_1328: bce_logits_ipu + pcqm4m_g25: mae_ipu + pcqm4m_n4: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: &max_epochs 100 + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + l1000_vcap: &classif_metrics + - name: auroc + metric: auroc + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + - name: avpr + metric: averageprecision + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + l1000_mcf7: *classif_metrics + pcba_1328: + # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly + - name: auroc + metric: auroc + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + - name: avpr + metric: averageprecision + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + pcqm4m_g25: &pcqm_metrics + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2 + metric: r2_score_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + pcqm4m_n4: *pcqm_metrics + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/neurips2023-large/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} + min_epochs: 1 + check_val_every_n_epoch: 20 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: ${dimensions.pre_nn_out_dim} + hidden_dims: ${dimensions.pre_nn_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: ${dimensions.pre_nn_edges_out_dim} + hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + 
la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: ${dimensions.gnn_out_dim} + hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: ${dimensions.graph_output_nn_out_dims} + hidden_dims: ${dimensions.graph_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: ${dimensions.node_output_nn_out_dims} + hidden_dims: ${dimensions.node_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + l1000_vcap: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_vcap_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + l1000_mcf7: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcba_1328: + task_level: graph + out_dim: 1328 + hidden_dims: ${dimensions.pcba_1328_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_g25: + task_level: graph + out_dim: 25 + hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + 
residual_type: none + pcqm4m_n4: + task_level: node + out_dim: 4 + hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + diff --git a/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml new file mode 100644 index 000000000..5a5dbc203 --- /dev/null +++ b/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml @@ -0,0 +1,476 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/neurips2023-large/" + epochs: 100 + name: SF_378M_sweep_LargeMix_mpnn + wandb: + entity: multitask-gnn + name: SF_378M_sweep_LargeMix_mpnn + project: neurips2023_graphcore_scaling_mpnn + + # This whole sections is for minimizing mistakes for the scaling experiments. + # This is the ONLY place where dimensions ahve to change. + # No other dimensions have to be changed in the architecture part. + +dimensions: + pre_nn_out_dim: &gnn_dim 768 # original 256 + pre_nn_hidden_dims: 3072 # original 1024 + pre_nn_edges_out_dim: &gnn_dim_edges 384 # original 128 + pre_nn_edges_hidden_dims: 1536 # original 512 + gnn_out_dim: *gnn_dim # original 256 + gnn_hidden_dims: *gnn_dim # original 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + mpnn_in_dim: *gnn_dim # original 256 + mpnn_out_dim: *gnn_dim # original 256 + mpnn_in_dim_edges: *gnn_dim_edges # original 128 + mpnn_out_dim_edges: *gnn_dim_edges # original 128 + graph_output_nn_out_dims: *gnn_dim # original 256 + graph_output_nn_hidden_dims: *gnn_dim # original 256 + node_output_nn_out_dims: *gnn_dim # original 256 + node_output_nn_hidden_dims: *gnn_dim # original 256 + l1000_vcap_hidden_dims: 384 # original 128 + l1000_mcf7_hidden_dims: 384 # original 128 + pcba_1328_hidden_dims: 192 # original 64 + pcqm4m_g25_hidden_dims: 96 # original 32 + pcqm4m_n4_hidden_dims: 96 # original 32 + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 80 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 115 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 115 + # Data handling-related + batch_size_training: 7 + batch_size_inference: 7 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 64 + + ipu_config: + - deviceIterations(30) # IPU would require large batches to be ready for the model. 
+ - replicationFactor(1) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] + #gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] + #gnn_layers_per_ipu: [4,4,4,4] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. + task_specific_args: # To be replaced by a new class "DatasetParams" + l1000_vcap: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` + epoch_sampling_fraction: 1.0 + + l1000_mcf7: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcba_1328: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: assayID-* # assayID-* means all columns starting with "assayID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcqm4m_g25: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: graph_* # graph_* means all columns starting with "graph_" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: 
graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + pcqm4m_n4: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: node_* # node_* means all columns starting with "node_" + # sample_size: 2000 # use sample_size for test + task_level: node + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + seed: 42 + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: disk + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. 
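+    # Rough sizing sketch for the accelerator block above (an assumption based on the usual
+    # PopTorch semantics, to be verified against the Graphium/PopTorch docs): one optimizer
+    # step consumes about
+    #   batch_size_training * accumulate_grad_batches * replicationFactor
+    #   = 7 * 64 * 1 = 448 graphs
+    # for this config; deviceIterations only changes how many such accumulation windows are
+    # streamed to the IPUs per host step, not the size of each weight update.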
+ +#Task-specific +predictor: + metrics_on_progress_bar: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + metrics_on_training_set: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + loss_fun: + l1000_vcap: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: 0.5 + l1000_mcf7: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: ${predictor.loss_fun.l1000_vcap.alpha} + pcba_1328: bce_logits_ipu + pcqm4m_g25: mae_ipu + pcqm4m_n4: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: &max_epochs 100 + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + l1000_vcap: &classif_metrics + - name: auroc + metric: auroc + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + - name: avpr + metric: averageprecision + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + l1000_mcf7: *classif_metrics + pcba_1328: + # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly + - name: auroc + metric: auroc + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + - name: avpr + metric: averageprecision + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + pcqm4m_g25: &pcqm_metrics + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2 + metric: r2_score_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + pcqm4m_n4: *pcqm_metrics + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/neurips2023-large/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} + min_epochs: 1 + check_val_every_n_epoch: 20 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: ${dimensions.pre_nn_out_dim} + hidden_dims: ${dimensions.pre_nn_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: ${dimensions.pre_nn_edges_out_dim} + hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + 
la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + #in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: ${dimensions.gnn_out_dim} + hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: ${dimensions.graph_output_nn_out_dims} + hidden_dims: ${dimensions.graph_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: ${dimensions.node_output_nn_out_dims} + hidden_dims: ${dimensions.node_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + l1000_vcap: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_vcap_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + l1000_mcf7: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcba_1328: + task_level: graph + out_dim: 1328 + hidden_dims: ${dimensions.pcba_1328_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_g25: + task_level: graph + out_dim: 25 + hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + 
residual_type: none + pcqm4m_n4: + task_level: node + out_dim: 4 + hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + diff --git a/expts/foundation_model/SF_42M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_42M_config_LargeMix_mpnn_GPS++.yaml new file mode 100644 index 000000000..a7b42ce53 --- /dev/null +++ b/expts/foundation_model/SF_42M_config_LargeMix_mpnn_GPS++.yaml @@ -0,0 +1,476 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/neurips2023-large/" + epochs: 100 + name: SF_42M_sweep_LargeMix_mpnn + wandb: + entity: multitask-gnn + name: SF_42M_sweep_LargeMix_mpnn + project: neurips2023_graphcore_scaling_mpnn + + # This whole sections is for minimizing mistakes for the scaling experiments. + # This is the ONLY place where dimensions ahve to change. + # No other dimensions have to be changed in the architecture part. + +dimensions: + pre_nn_out_dim: &gnn_dim 256 # original 256 + pre_nn_hidden_dims: 1024 # original 1024 + pre_nn_edges_out_dim: &gnn_dim_edges 128 # original 128 + pre_nn_edges_hidden_dims: 512 # original 512 + gnn_out_dim: *gnn_dim # original 256 + gnn_hidden_dims: *gnn_dim # original 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + mpnn_in_dim: *gnn_dim # original 256 + mpnn_out_dim: *gnn_dim # original 256 + mpnn_in_dim_edges: *gnn_dim_edges # original 128 + mpnn_out_dim_edges: *gnn_dim_edges # original 128 + graph_output_nn_out_dims: *gnn_dim # original 256 + graph_output_nn_hidden_dims: *gnn_dim # original 256 + node_output_nn_out_dims: *gnn_dim # original 256 + node_output_nn_hidden_dims: *gnn_dim # original 256 + l1000_vcap_hidden_dims: 128 # original 128 + l1000_mcf7_hidden_dims: 128 # original 128 + pcba_1328_hidden_dims: 64 # original 64 + pcqm4m_g25_hidden_dims: 32 # original 32 + pcqm4m_n4_hidden_dims: 32 # original 32 + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 40 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 100 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 45 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 100 + # Data handling-related + batch_size_training: 16 + batch_size_inference: 16 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 16 + + ipu_config: + - deviceIterations(30) # IPU would require large batches to be ready for the model. 
+ - replicationFactor(4) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + #gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] + #gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] + gnn_layers_per_ipu: [4,4,4,4] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. + task_specific_args: # To be replaced by a new class "DatasetParams" + l1000_vcap: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` + epoch_sampling_fraction: 1.0 + + l1000_mcf7: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcba_1328: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: assayID-* # assayID-* means all columns starting with "assayID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcqm4m_g25: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: graph_* # graph_* means all columns starting with "graph_" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: 
graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + pcqm4m_n4: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: node_* # node_* means all columns starting with "node_" + # sample_size: 2000 # use sample_size for test + task_level: node + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + seed: 42 + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: disk + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. 
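+    # Pipeline-layout sketch for the accelerator_kwargs above (hedged; the exact rule is set
+    # by Graphium's IPU integration): gnn_layers_per_ipu: [4,4,4,4] presumably splits the
+    # 16 GNN layers (architecture.gnn.depth: 16) as 4 layers on each of 4 IPUs per replica,
+    # so with replicationFactor(4) the job spans 4 * 4 = 16 IPUs in total. The commented
+    # alternatives ([2]*8, [1]*16) trade fewer replicas for a deeper pipeline on the same
+    # 16-IPU budget.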
+ +#Task-specific +predictor: + metrics_on_progress_bar: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + metrics_on_training_set: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + loss_fun: + l1000_vcap: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: 0.5 + l1000_mcf7: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: ${predictor.loss_fun.l1000_vcap.alpha} + pcba_1328: bce_logits_ipu + pcqm4m_g25: mae_ipu + pcqm4m_n4: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: &max_epochs 100 + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + l1000_vcap: &classif_metrics + - name: auroc + metric: auroc + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + - name: avpr + metric: averageprecision + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + l1000_mcf7: *classif_metrics + pcba_1328: + # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly + - name: auroc + metric: auroc + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + - name: avpr + metric: averageprecision + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + pcqm4m_g25: &pcqm_metrics + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2 + metric: r2_score_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + pcqm4m_n4: *pcqm_metrics + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/neurips2023-large/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} + min_epochs: 1 + check_val_every_n_epoch: 20 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: ${dimensions.pre_nn_out_dim} + hidden_dims: ${dimensions.pre_nn_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: ${dimensions.pre_nn_edges_out_dim} + hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + 
la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: ${dimensions.gnn_out_dim} + hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: ${dimensions.graph_output_nn_out_dims} + hidden_dims: ${dimensions.graph_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: ${dimensions.node_output_nn_out_dims} + hidden_dims: ${dimensions.node_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + l1000_vcap: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_vcap_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + l1000_mcf7: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcba_1328: + task_level: graph + out_dim: 1328 + hidden_dims: ${dimensions.pcba_1328_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_g25: + task_level: graph + out_dim: 25 + hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + 
residual_type: none + pcqm4m_n4: + task_level: node + out_dim: 4 + hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + diff --git a/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml new file mode 100644 index 000000000..b403e958e --- /dev/null +++ b/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml @@ -0,0 +1,476 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/neurips2023-large/" + epochs: 100 + name: SF_173M_sweep_LargeMix_mpnn + wandb: + entity: multitask-gnn + name: SF_173M_sweep_LargeMix_mpnn + project: neurips2023_graphcore_scaling_mpnn + + # This whole sections is for minimizing mistakes for the scaling experiments. + # This is the ONLY place where dimensions ahve to change. + # No other dimensions have to be changed in the architecture part. + +dimensions: + pre_nn_out_dim: &gnn_dim 1024 # original 256 + pre_nn_hidden_dims: 4096 # original 1024 + pre_nn_edges_out_dim: &gnn_dim_edges 512 # original 128 + pre_nn_edges_hidden_dims: 2048 # original 512 + gnn_out_dim: *gnn_dim # original 256 + gnn_hidden_dims: *gnn_dim # original 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + mpnn_in_dim: *gnn_dim # original 256 + mpnn_out_dim: *gnn_dim # original 256 + mpnn_in_dim_edges: *gnn_dim_edges # original 128 + mpnn_out_dim_edges: *gnn_dim_edges # original 128 + graph_output_nn_out_dims: *gnn_dim # original 256 + graph_output_nn_hidden_dims: *gnn_dim # original 256 + node_output_nn_out_dims: *gnn_dim # original 256 + node_output_nn_hidden_dims: *gnn_dim # original 256 + l1000_vcap_hidden_dims: 512 # original 128 + l1000_mcf7_hidden_dims: 512 # original 128 + pcba_1328_hidden_dims: 256 # original 64 + pcqm4m_g25_hidden_dims: 128 # original 32 + pcqm4m_n4_hidden_dims: 128 # original 32 + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 116 + ipu_dataloader_inference_opts: + mode: async + max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 116 + # Data handling-related + batch_size_training: 8 + batch_size_inference: 8 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 60 + + ipu_config: + - deviceIterations(30) # IPU would require large batches to be ready for the model. 
+ - replicationFactor(2) + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + #gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] + gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] + + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. + task_specific_args: # To be replaced by a new class "DatasetParams" + l1000_vcap: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` + epoch_sampling_fraction: 1.0 + + l1000_mcf7: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcba_1328: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: assayID-* # assayID-* means all columns starting with "assayID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcqm4m_g25: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: graph_* # graph_* means all columns starting with "graph_" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget 
https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + pcqm4m_n4: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: node_* # node_* means all columns starting with "node_" + # sample_size: 2000 # use sample_size for test + task_level: node + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + seed: 42 + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: disk + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. 
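+    # Launch sketch (hedged; this mirrors the graphium-train/Hydra invocation pattern used
+    # elsewhere in this patch series, and the overrides below are placeholders):
+    #   graphium-train \
+    #     --config-path=expts/foundation_model/ \
+    #     --config-name=SF_671M_config_LargeMix_mpnn_GPS++.yaml \
+    #     constants.name=SF_671M_smoke_test \
+    #     +datamodule.args.task_specific_args.pcba_1328.sample_size=2000
+    # The "+" prefix adds a key that only appears as a comment in this file (sample_size),
+    # which is one way to apply the "use sample_size for test" hint above without editing
+    # the YAML.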
+ +#Task-specific +predictor: + metrics_on_progress_bar: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + metrics_on_training_set: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + loss_fun: + l1000_vcap: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: 0.5 + l1000_mcf7: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: ${predictor.loss_fun.l1000_vcap.alpha} + pcba_1328: bce_logits_ipu + pcqm4m_g25: mae_ipu + pcqm4m_n4: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: &max_epochs 100 + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + l1000_vcap: &classif_metrics + - name: auroc + metric: auroc + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + - name: avpr + metric: averageprecision + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + l1000_mcf7: *classif_metrics + pcba_1328: + # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly + - name: auroc + metric: auroc + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + - name: avpr + metric: averageprecision + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + pcqm4m_g25: &pcqm_metrics + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2 + metric: r2_score_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + pcqm4m_n4: *pcqm_metrics + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/neurips2023-large/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} + min_epochs: 1 + check_val_every_n_epoch: 20 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: ${dimensions.pre_nn_out_dim} + hidden_dims: ${dimensions.pre_nn_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: ${dimensions.pre_nn_edges_out_dim} + hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + 
la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: ${dimensions.gnn_out_dim} + hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: ${dimensions.graph_output_nn_out_dims} + hidden_dims: ${dimensions.graph_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: ${dimensions.node_output_nn_out_dims} + hidden_dims: ${dimensions.node_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + l1000_vcap: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_vcap_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + l1000_mcf7: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcba_1328: + task_level: graph + out_dim: 1328 + hidden_dims: ${dimensions.pcba_1328_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_g25: + task_level: graph + out_dim: 25 + hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + 
residual_type: none + pcqm4m_n4: + task_level: node + out_dim: 4 + hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + From 011167251e66a5b45357b861c81aa13beaa028c1 Mon Sep 17 00:00:00 2001 From: Maciej Sypetkowski Date: Mon, 25 Sep 2023 04:10:38 -0600 Subject: [PATCH 26/58] Fix style --- graphium/nn/pyg_layers/gps_pyg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphium/nn/pyg_layers/gps_pyg.py b/graphium/nn/pyg_layers/gps_pyg.py index b82fad782..555d8bef0 100644 --- a/graphium/nn/pyg_layers/gps_pyg.py +++ b/graphium/nn/pyg_layers/gps_pyg.py @@ -375,7 +375,7 @@ def _self_attention_block(self, feat: Tensor, feat_in: Tensor, batch: Batch) -> ) attn_bias = None - if self.biased_attention_key is not None and self.biased_attention_key != 'none': + if self.biased_attention_key is not None and self.biased_attention_key != "none": attn_bias = batch[self.biased_attention_key] # h_dense[num_graphs, max_num_nodes, hidden_dim] -> feat_attn[num_graphs, max_num_nodes, hidden_dim] From dcf237ed3f9c71d8d061cc3f239eb85da48d8d03 Mon Sep 17 00:00:00 2001 From: Maciej Sypetkowski Date: Mon, 25 Sep 2023 04:16:28 -0600 Subject: [PATCH 27/58] Rename r2 -> r2_score in configs --- expts/hydra-configs/tasks/loss_metrics_datamodule/admet.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/expts/hydra-configs/tasks/loss_metrics_datamodule/admet.yaml b/expts/hydra-configs/tasks/loss_metrics_datamodule/admet.yaml index 87136b683..1affb5042 100644 --- a/expts/hydra-configs/tasks/loss_metrics_datamodule/admet.yaml +++ b/expts/hydra-configs/tasks/loss_metrics_datamodule/admet.yaml @@ -80,7 +80,7 @@ metrics: target_nan_mask: null multitask_handling: mean-per-label - name: r2_score - metric: r2 + metric: r2_score target_nan_mask: null multitask_handling: mean-per-label threshold_kwargs: null @@ -138,4 +138,4 @@ datamodule: args: # TDC specific tdc_benchmark_names: null - tdc_train_val_seed: ${constants.seed} \ No newline at end of file + tdc_train_val_seed: ${constants.seed} From a38ba090d12842d6851a3811aa74e3146ac71738 Mon Sep 17 00:00:00 2001 From: Maciej Sypetkowski Date: Mon, 25 Sep 2023 04:36:50 -0600 Subject: [PATCH 28/58] Make out_dim_edges to default to the last hidden edge dim --- .../nn/architectures/global_architectures.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/graphium/nn/architectures/global_architectures.py b/graphium/nn/architectures/global_architectures.py index 75dfa7b38..0eaee740f 100644 --- a/graphium/nn/architectures/global_architectures.py +++ b/graphium/nn/architectures/global_architectures.py @@ -422,7 +422,7 @@ def __init__( residual_skip_steps: int = 1, in_dim_edges: int = 0, hidden_dims_edges: List[int] = [], - out_dim_edges: int = 0, + out_dim_edges: Optional[int] = None, name: str = "GNN", layer_kwargs: Optional[Dict] = None, virtual_node: str = "none", @@ -512,7 +512,8 @@ def __init__( out_dim_edges: Output edge-feature dimensions of the network. Keep at 0 if not using - edge features, or if the layer doesn't support edges. + edge features, or if the layer doesn't support edges. Defaults to the + last value of hidden_dims_edges. 
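+                For example, with hidden_dims_edges=[128, 128] and out_dim_edges left as
+                None, the edge output dimension resolves to 128.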
name: Name attributed to the current network, for display and printing @@ -550,7 +551,6 @@ def __init__( # Initialize the additional attributes self.in_dim_edges = in_dim_edges - self.out_dim_edges = out_dim_edges if isinstance(hidden_dims_edges, int): self.hidden_dims_edges = [hidden_dims_edges] * (depth - 1) elif len(hidden_dims_edges) == 0: @@ -558,10 +558,17 @@ def __init__( else: self.hidden_dims_edges = list(hidden_dims_edges) assert depth is None + self.out_dim_edges = ( + out_dim_edges + if out_dim_edges is not None + else self.hidden_dims_edges[-1] + if self.hidden_dims_edges + else 0 + ) self.full_dims_edges = None - if len(self.hidden_dims_edges) or out_dim_edges > 0: - assert out_dim_edges > 0, out_dim_edges - self.full_dims_edges = [self.in_dim_edges] + self.hidden_dims_edges + [out_dim_edges] + if len(self.hidden_dims_edges) or self.out_dim_edges > 0: + assert self.out_dim_edges > 0, self.out_dim_edges + self.full_dims_edges = [self.in_dim_edges] + self.hidden_dims_edges + [self.out_dim_edges] self.virtual_node = virtual_node.lower() if virtual_node is not None else "none" From a0db06d80d8792b0ab0ced23ace8ff3bbf1193e0 Mon Sep 17 00:00:00 2001 From: Maciej Sypetkowski Date: Mon, 25 Sep 2023 05:24:28 -0600 Subject: [PATCH 29/58] Fix artifact logging, log also unresolved config --- graphium/cli/train_finetune_test.py | 3 ++- graphium/config/_loader.py | 16 ++++++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/graphium/cli/train_finetune_test.py b/graphium/cli/train_finetune_test.py index 885839ec0..55a27881c 100644 --- a/graphium/cli/train_finetune_test.py +++ b/graphium/cli/train_finetune_test.py @@ -57,6 +57,7 @@ def run_training_finetuning_testing(cfg: DictConfig) -> None: The main (pre-)training and fine-tuning loop. 
""" + unresolved_cfg = OmegaConf.to_container(cfg, resolve=False) cfg = OmegaConf.to_container(cfg, resolve=True) dst_dir = cfg["constants"].get("results_dir") @@ -136,7 +137,7 @@ def run_training_finetuning_testing(cfg: DictConfig) -> None: trainer.callbacks.append(GraphFinetuning(**finetuning_training_kwargs)) if wandb_cfg is not None: - save_params_to_wandb(trainer.logger, cfg, predictor, datamodule) + save_params_to_wandb(trainer.logger, cfg, predictor, datamodule, unresolved_config=unresolved_cfg) # Determine the max num nodes and edges in training and validation logger.info("Computing the maximum number of nodes and edges per graph") diff --git a/graphium/config/_loader.py b/graphium/config/_loader.py index 5a6754c54..ab8b72e05 100644 --- a/graphium/config/_loader.py +++ b/graphium/config/_loader.py @@ -445,6 +445,7 @@ def save_params_to_wandb( config: Union[omegaconf.DictConfig, Dict[str, Any]], predictor: PredictorModule, datamodule: MultitaskFromSmilesDataModule, + unresolved_config: Optional[Union[omegaconf.DictConfig, Dict[str, Any]]] = None, ): """ Save a few stuff to weights-and-biases WandB @@ -453,13 +454,16 @@ def save_params_to_wandb( config: The config file, with key `trainer` predictor: The predictor used to handle the train/val/test steps logic datamodule: The datamodule used to load the data into training + unresolved_config: The unresolved config file """ # Get the wandb runner and directory wandb_run = logger.experiment + if wandb_run is None: - wandb_run = "" - wandb_dir = wandb_run.dir + wandb_dir = "" + else: + wandb_dir = wandb_run.dir # Save the mup base model to WandB as a yaml file mup.save_base_shapes(predictor.model, os.path.join(wandb_dir, "mup_base_params.yaml")) @@ -468,14 +472,18 @@ def save_params_to_wandb( with open(os.path.join(wandb_dir, "full_configs.yaml"), "w") as file: yaml.dump(config, file) + if unresolved_config is not None: + with open(os.path.join(wandb_dir, "unresolved_config.yaml"), "w") as file: + yaml.dump(unresolved_config, file) + # Save the featurizer into wandb featurizer_path = os.path.join(wandb_dir, "featurizer.pickle") joblib.dump(datamodule.smiles_transformer, featurizer_path) # Save the featurizer and configs into wandb if wandb_run is not None: - wandb_run.save("*.yaml") - wandb_run.save("*.pickle") + wandb_run.save(os.path.join(wandb_dir, "*.yaml"), wandb_dir) + wandb_run.save(os.path.join(wandb_dir, "*.pickle"), wandb_dir) def load_accelerator(config: Union[omegaconf.DictConfig, Dict[str, Any]]) -> Tuple[Dict[str, Any], str]: From 4d59a517e5447b953ea77a7457ed127b73da0efd Mon Sep 17 00:00:00 2001 From: wenkelf Date: Mon, 25 Sep 2023 15:11:14 +0000 Subject: [PATCH 30/58] Few mup updates --- .gitignore | 1 + graphium/cli/train_finetune_test.py | 4 +++- graphium/config/_loader.py | 3 +++ graphium/nn/utils.py | 2 +- scripts/scale_mpnn.sh | 9 +++++++++ 5 files changed, 17 insertions(+), 2 deletions(-) create mode 100644 scripts/scale_mpnn.sh diff --git a/.gitignore b/.gitignore index b9f39521e..c43a954c3 100644 --- a/.gitignore +++ b/.gitignore @@ -27,6 +27,7 @@ tests/temp_cache* predictions/ draft/ scripts-expts/ +mup/ # Data and predictions graphium/data/ZINC_bench_gnn/ diff --git a/graphium/cli/train_finetune_test.py b/graphium/cli/train_finetune_test.py index 885839ec0..e7909a17f 100644 --- a/graphium/cli/train_finetune_test.py +++ b/graphium/cli/train_finetune_test.py @@ -142,9 +142,11 @@ def run_training_finetuning_testing(cfg: DictConfig) -> None: logger.info("Computing the maximum number of nodes and edges per graph") 
predictor.set_max_nodes_edges_per_graph(datamodule, stages=["train", "val"]) + ckpt_path = cfg["trainer"].pop("resume_from_checkpoint", None) + # Run the model training with SafeRun(name="TRAINING", raise_error=cfg["constants"]["raise_train_error"], verbose=True): - trainer.fit(model=predictor, datamodule=datamodule) + trainer.fit(model=predictor, datamodule=datamodule, ckpt_path=ckpt_path) # Save validation metrics - Base utility in case someone doesn't use a logger. results = trainer.callback_metrics diff --git a/graphium/config/_loader.py b/graphium/config/_loader.py index 5a6754c54..f54d5db55 100644 --- a/graphium/config/_loader.py +++ b/graphium/config/_loader.py @@ -327,6 +327,9 @@ def load_predictor( model_class=model_class, model_kwargs=scaled_model_kwargs, metrics=metrics, + task_levels=task_levels, + featurization=featurization, + task_norms=task_norms, **cfg_pred, ) diff --git a/graphium/nn/utils.py b/graphium/nn/utils.py index 68a8779c4..e9ac4fa0c 100644 --- a/graphium/nn/utils.py +++ b/graphium/nn/utils.py @@ -40,7 +40,7 @@ def scale_kwargs(self, scale_factor: Real, scale_in_dim: bool = False): divide_factor = 1 / scale_factor - if scale_in_dim is None: + if not scale_in_dim: return self.make_mup_base_kwargs(divide_factor=divide_factor) # If scale_in_dim passed, need to check it can be forwarded diff --git a/scripts/scale_mpnn.sh b/scripts/scale_mpnn.sh new file mode 100644 index 000000000..8cd61fb86 --- /dev/null +++ b/scripts/scale_mpnn.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +graphium-train \ + --config-path=/home/frederik_valencediscovery_com/projects/graphium_hps/expts/configs/ \ + --config-name=config_mpnn_base.yaml \ + constants.max_epochs=100 \ + trainer.model_checkpoint.dirpath=model_checkpoints/large-dataset/scale_mpnn/ \ + +architecture.mup_scale_factor=2 +architecture.mup_base_path=mup/mpnn_base/base_shapes.yaml \ + datamodule.args.batch_size_inference=1024 datamodule.args.batch_size_training=1024 +trainer.trainer.accumulate_grad_batches=2 \ \ No newline at end of file From 1f869ccf239bce1e6d616b4fbce5ee026d804ca8 Mon Sep 17 00:00:00 2001 From: wenkelf Date: Mon, 25 Sep 2023 20:20:47 +0000 Subject: [PATCH 31/58] Updating predictor --- graphium/trainer/predictor.py | 1 + graphium/trainer/predictor_options.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/graphium/trainer/predictor.py b/graphium/trainer/predictor.py index 6824a40df..19a5c7a29 100644 --- a/graphium/trainer/predictor.py +++ b/graphium/trainer/predictor.py @@ -221,6 +221,7 @@ def configure_optimizers(self, impl=None): # Define the optimizer and schedulers optimiser = MuAdam(self.parameters(), **self.optim_options.optim_kwargs, impl=impl) + self.optim_options.torch_scheduler_kwargs.pop("module_type") torch_scheduler = self.optim_options.scheduler_class( optimizer=optimiser, **self.optim_options.torch_scheduler_kwargs ) diff --git a/graphium/trainer/predictor_options.py b/graphium/trainer/predictor_options.py index 0bab97674..3fbfb4e4d 100644 --- a/graphium/trainer/predictor_options.py +++ b/graphium/trainer/predictor_options.py @@ -99,7 +99,7 @@ def set_kwargs(self): self.torch_scheduler_kwargs.setdefault("module_type", "ReduceLROnPlateau") # Get the class for the scheduler - scheduler_class = self.torch_scheduler_kwargs.pop("module_type", None) + scheduler_class = self.torch_scheduler_kwargs.get("module_type", None) if self.scheduler_class is None: if isinstance(scheduler_class, str): self.scheduler_class = SCHEDULER_DICT[scheduler_class] From 870ea4017b299995aa1a3499776f190a3ed74b4a Mon 
Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Fri, 29 Sep 2023 12:28:00 +0000 Subject: [PATCH 32/58] Adding a 671M config that compiles and should run ona POD 16 system --- ..._config_LargeMix_mpnn_GPS++_compiling.yaml | 482 ++++++++++++++++++ 1 file changed, 482 insertions(+) create mode 100644 expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++_compiling.yaml diff --git a/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++_compiling.yaml b/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++_compiling.yaml new file mode 100644 index 000000000..2d00c9768 --- /dev/null +++ b/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++_compiling.yaml @@ -0,0 +1,482 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/net/group/research/kerstink/neurips2023-large/" + epochs: 100 + name: SF_671M_sweep_LargeMix_mpnn + wandb: + entity: multitask-gnn + name: SF_671M_sweep_LargeMix_mpnn + project: neurips2023_graphcore_scaling_mpnn + + # This whole sections is for minimizing mistakes for the scaling experiments. + # This is the ONLY place where dimensions ahve to change. + # No other dimensions have to be changed in the architecture part. + +dimensions: + pre_nn_out_dim: &gnn_dim 1024 # original 256 + pre_nn_hidden_dims: 4096 # original 1024 + pre_nn_edges_out_dim: &gnn_dim_edges 512 # original 128 + pre_nn_edges_hidden_dims: 2048 # original 512 + gnn_out_dim: *gnn_dim # original 256 + gnn_hidden_dims: *gnn_dim # original 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + mpnn_in_dim: *gnn_dim # original 256 + mpnn_out_dim: *gnn_dim # original 256 + mpnn_in_dim_edges: *gnn_dim_edges # original 128 + mpnn_out_dim_edges: *gnn_dim_edges # original 128 + graph_output_nn_out_dims: *gnn_dim # original 256 + graph_output_nn_hidden_dims: *gnn_dim # original 256 + node_output_nn_out_dims: *gnn_dim # original 256 + node_output_nn_hidden_dims: *gnn_dim # original 256 + l1000_vcap_hidden_dims: 512 # original 128 + l1000_mcf7_hidden_dims: 512 # original 128 + pcba_1328_hidden_dims: 256 # original 64 + pcqm4m_g25_hidden_dims: 128 # original 32 + pcqm4m_n4_hidden_dims: 128 # original 32 + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + # max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 + # max_num_edges_per_graph: 116 + max_num_nodes: 128 + max_num_edges: 272 + ipu_dataloader_inference_opts: + mode: async + # max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 + # max_num_edges_per_graph: 116 + max_num_nodes: 128 + max_num_edges: 272 + # Data handling-related + batch_size_training: 3 + batch_size_inference: 2 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 320 + + ipu_config: + - deviceIterations(1) # IPU would require large batches to be ready for the model. 
+ - replicationFactor(1) + - 'setAvailableMemoryProportion({"IPU0": 0.1})' + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + # - TensorLocations.numIOTiles(128) + - _Popart.set("defaultBufferingDepth", 96) + - _Popart.set("saveInitializersToFile", "weights.onnx") + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(30) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] + # gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] + + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. + task_specific_args: # To be replaced by a new class "DatasetParams" + l1000_vcap: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` + epoch_sampling_fraction: 1.0 + + l1000_mcf7: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcba_1328: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: assayID-* # assayID-* means all columns starting with "assayID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcqm4m_g25: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: graph_* # graph_* means all columns starting with "graph_" + # sample_size: 2000 # use sample_size for test + task_level: graph 
+ splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + pcqm4m_n4: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: node_* # node_* means all columns starting with "node_" + # sample_size: 2000 # use sample_size for test + task_level: node + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + seed: 42 + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: disk + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + max_num_nodes: 100 + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. 
+ +#Task-specific +predictor: + metrics_on_progress_bar: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + metrics_on_training_set: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + loss_fun: + l1000_vcap: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: 0.5 + l1000_mcf7: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: ${predictor.loss_fun.l1000_vcap.alpha} + pcba_1328: bce_logits_ipu + pcqm4m_g25: mae_ipu + pcqm4m_n4: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: &max_epochs 100 + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + l1000_vcap: &classif_metrics + - name: auroc + metric: auroc + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + - name: avpr + metric: averageprecision + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + l1000_mcf7: *classif_metrics + pcba_1328: + # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly + - name: auroc + metric: auroc + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + - name: avpr + metric: averageprecision + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + pcqm4m_g25: &pcqm_metrics + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2 + metric: r2_score_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + pcqm4m_n4: *pcqm_metrics + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/neurips2023-large/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} + min_epochs: 1 + check_val_every_n_epoch: 20 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: ${dimensions.pre_nn_out_dim} + hidden_dims: ${dimensions.pre_nn_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: ${dimensions.pre_nn_edges_out_dim} + hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + 
la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: ${dimensions.gnn_out_dim} + hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: ${dimensions.graph_output_nn_out_dims} + hidden_dims: ${dimensions.graph_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: ${dimensions.node_output_nn_out_dims} + hidden_dims: ${dimensions.node_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + l1000_vcap: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_vcap_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + l1000_mcf7: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcba_1328: + task_level: graph + out_dim: 1328 + hidden_dims: ${dimensions.pcba_1328_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_g25: + task_level: graph + out_dim: 25 + hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + 
residual_type: none + pcqm4m_n4: + task_level: node + out_dim: 4 + hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + From 8a9acd31d993ef76f6781ed43aec7f86e594023a Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Fri, 29 Sep 2023 12:32:00 +0000 Subject: [PATCH 33/58] nodes -> atoms --- .../SF_671M_config_LargeMix_mpnn_GPS++_compiling.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++_compiling.yaml b/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++_compiling.yaml index 2d00c9768..01b0bf315 100644 --- a/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++_compiling.yaml +++ b/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++_compiling.yaml @@ -181,7 +181,7 @@ datamodule: add_self_loop: False explicit_H: False # if H is included use_bonds_weights: False - max_num_nodes: 100 + max_num_atoms: 100 pos_encoding_as_features: # encoder dropout 0.18 pos_types: lap_eigvec: From 2c7f037bde32418b59b3384cf8149504608d94c7 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Wed, 4 Oct 2023 10:42:10 +0000 Subject: [PATCH 34/58] Config changes --- .../SF_378M_config_LargeMix_mpnn_GPS++.yaml | 31 +++---- .../SF_671M_config_LargeMix_mpnn_GPS++.yaml | 82 ++++++++++--------- 2 files changed, 60 insertions(+), 53 deletions(-) diff --git a/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml index 5a5dbc203..490da9370 100644 --- a/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml @@ -45,45 +45,45 @@ accelerator: args: ipu_dataloader_training_opts: mode: async - max_num_nodes_per_graph: 80 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 115 + max_num_nodes_per_graph: 56 # train max nodes: 20, max_edges: 54 + max_num_edges_per_graph: 96 ipu_dataloader_inference_opts: mode: async - max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 115 + max_num_nodes_per_graph: 56 # valid max nodes: 51, max_edges: 118 + max_num_edges_per_graph: 96 # Data handling-related - batch_size_training: 7 - batch_size_inference: 7 + batch_size_training: 4 + batch_size_inference: 4 predictor: metrics_every_n_train_steps: 1000 optim_kwargs: - loss_scaling: 1024 + loss_scaling: 16000 trainer: trainer: precision: 16-true - accumulate_grad_batches: 64 + accumulate_grad_batches: 240 ipu_config: - - deviceIterations(30) # IPU would require large batches to be ready for the model. + - deviceIterations(5) # IPU would require large batches to be ready for the model. 
- replicationFactor(1) # - enableProfiling("graph_analyser") # The folder where the profile will be stored # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) + # - TensorLocations.numIOTiles(128) + # - _Popart.set("defaultBufferingDepth", 96) - Precision.enableStochasticRounding(True) # - Precision.enableFloatingPointExceptions(True) ipu_inference_config: # set device iteration and replication factor to 1 during inference # gradient accumulation was set to 1 in the code - - deviceIterations(30) + - deviceIterations(20) - replicationFactor(1) - Precision.enableStochasticRounding(False) accelerator_kwargs: _accelerator: "ipu" gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] - #gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] + # gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] #gnn_layers_per_ipu: [4,4,4,4] datamodule: @@ -176,6 +176,7 @@ datamodule: add_self_loop: False explicit_H: False # if H is included use_bonds_weights: False + max_num_atoms: 100 pos_encoding_as_features: # encoder dropout 0.18 pos_types: lap_eigvec: @@ -195,8 +196,8 @@ datamodule: pos_type: rw_return_probs ksteps: 16 - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. + num_workers: 30 # -1 to use all + persistent_workers: True # if use persistent worker at the start of each epoch. # Using persistent_workers false might make the start of each epoch very long. #Task-specific diff --git a/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml index b403e958e..c6719001a 100644 --- a/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml @@ -5,12 +5,12 @@ constants: seed: 42 raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/neurips2023-large/" + datacache_path: "../../neurips2023-large/" epochs: 100 - name: SF_173M_sweep_LargeMix_mpnn + name: SF_671M_sweep_LargeMix_mpnn wandb: entity: multitask-gnn - name: SF_173M_sweep_LargeMix_mpnn + name: SF_671M_sweep_LargeMix_mpnn project: neurips2023_graphcore_scaling_mpnn # This whole sections is for minimizing mistakes for the scaling experiments. @@ -18,25 +18,25 @@ constants: # No other dimensions have to be changed in the architecture part. 
dimensions: - pre_nn_out_dim: &gnn_dim 1024 # original 256 - pre_nn_hidden_dims: 4096 # original 1024 - pre_nn_edges_out_dim: &gnn_dim_edges 512 # original 128 - pre_nn_edges_hidden_dims: 2048 # original 512 - gnn_out_dim: *gnn_dim # original 256 - gnn_hidden_dims: *gnn_dim # original 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - mpnn_in_dim: *gnn_dim # original 256 - mpnn_out_dim: *gnn_dim # original 256 - mpnn_in_dim_edges: *gnn_dim_edges # original 128 - mpnn_out_dim_edges: *gnn_dim_edges # original 128 - graph_output_nn_out_dims: *gnn_dim # original 256 - graph_output_nn_hidden_dims: *gnn_dim # original 256 - node_output_nn_out_dims: *gnn_dim # original 256 - node_output_nn_hidden_dims: *gnn_dim # original 256 - l1000_vcap_hidden_dims: 512 # original 128 - l1000_mcf7_hidden_dims: 512 # original 128 - pcba_1328_hidden_dims: 256 # original 64 - pcqm4m_g25_hidden_dims: 128 # original 32 - pcqm4m_n4_hidden_dims: 128 # original 32 + pre_nn_out_dim: 960 + pre_nn_hidden_dims: 3840 + pre_nn_edges_out_dim: 480 + pre_nn_edges_hidden_dims: 1920 + gnn_out_dim: 960 + gnn_hidden_dims: 960 + mpnn_in_dim: 960 + mpnn_out_dim: 960 + mpnn_in_dim_edges: 480 + mpnn_out_dim_edges: 480 + graph_output_nn_out_dims: 960 + graph_output_nn_hidden_dims: 960 + node_output_nn_out_dims: 960 + node_output_nn_hidden_dims: 960 + l1000_vcap_hidden_dims: 480 + l1000_mcf7_hidden_dims: 480 + pcba_1328_hidden_dims: 240 + pcqm4m_g25_hidden_dims: 120 + pcqm4m_n4_hidden_dims: 120 accelerator: type: ipu # cpu or ipu or gpu @@ -45,15 +45,19 @@ accelerator: args: ipu_dataloader_training_opts: mode: async - max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 116 + # max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 + # max_num_edges_per_graph: 116 + max_num_nodes: 140 + max_num_edges: 290 ipu_dataloader_inference_opts: mode: async - max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 116 + # max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 + # max_num_edges_per_graph: 116 + max_num_nodes: 140 + max_num_edges: 290 # Data handling-related - batch_size_training: 8 - batch_size_inference: 8 + batch_size_training: 3 + batch_size_inference: 3 predictor: metrics_every_n_train_steps: 1000 optim_kwargs: @@ -61,29 +65,30 @@ accelerator: trainer: trainer: precision: 16-true - accumulate_grad_batches: 60 + accumulate_grad_batches: 320 ipu_config: - - deviceIterations(30) # IPU would require large batches to be ready for the model. - - replicationFactor(2) + - deviceIterations(5) # IPU would require large batches to be ready for the model. 
+ - replicationFactor(1) + # - 'setAvailableMemoryProportion({"IPU0": 0.05})' # - enableProfiling("graph_analyser") # The folder where the profile will be stored # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) + # - TensorLocations.numIOTiles(128) + # - _Popart.set("defaultBufferingDepth", 96) + - _Popart.set("saveInitializersToFile", "weights.onnx") - Precision.enableStochasticRounding(True) # - Precision.enableFloatingPointExceptions(True) - ipu_inference_config: # set device iteration and replication factor to 1 during inference # gradient accumulation was set to 1 in the code - - deviceIterations(30) + - deviceIterations(16) - replicationFactor(1) - Precision.enableStochasticRounding(False) accelerator_kwargs: _accelerator: "ipu" - #gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] - gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] + gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] + # gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] datamodule: @@ -176,6 +181,7 @@ datamodule: add_self_loop: False explicit_H: False # if H is included use_bonds_weights: False + max_num_atoms: 100 pos_encoding_as_features: # encoder dropout 0.18 pos_types: lap_eigvec: @@ -196,7 +202,7 @@ datamodule: ksteps: 16 num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. + persistent_workers: True # if use persistent worker at the start of each epoch. # Using persistent_workers false might make the start of each epoch very long. #Task-specific From 826996a5704359f53dc3ae8ffe19ca684e175d7f Mon Sep 17 00:00:00 2001 From: kerstink-GC Date: Wed, 4 Oct 2023 12:51:16 +0000 Subject: [PATCH 35/58] 378M debug config --- ...378M_config_LargeMix_mpnn_GPS++_debug.yaml | 481 +++++++++++++++++ .../SF_590M_config_LargeMix_mpnn_GPS++.yaml | 482 ++++++++++++++++++ 2 files changed, 963 insertions(+) create mode 100644 expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++_debug.yaml create mode 100644 expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml diff --git a/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++_debug.yaml b/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++_debug.yaml new file mode 100644 index 000000000..d7fe43e6e --- /dev/null +++ b/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++_debug.yaml @@ -0,0 +1,481 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/neurips2023-large/" + epochs: 100 + name: SF_378M_sweep_LargeMix_mpnn + wandb: + entity: multitask-gnn + name: SF_378M_sweep_LargeMix_mpnn + project: neurips2023_graphcore_scaling_mpnn + + # This whole sections is for minimizing mistakes for the scaling experiments. + # This is the ONLY place where dimensions ahve to change. + # No other dimensions have to be changed in the architecture part. 
+ +dimensions: + pre_nn_out_dim: 768 + pre_nn_hidden_dims: 3072 + pre_nn_edges_out_dim: 384 + pre_nn_edges_hidden_dims: 1536 + gnn_out_dim: 768 + gnn_hidden_dims: 768 + mpnn_in_dim: 768 + mpnn_out_dim: 768 + mpnn_in_dim_edges: 384 + mpnn_out_dim_edges: 384 + graph_output_nn_out_dims: 768 + graph_output_nn_hidden_dims: 768 + node_output_nn_out_dims: 768 + node_output_nn_hidden_dims: 768 + l1000_vcap_hidden_dims: 384 + l1000_mcf7_hidden_dims: 384 + pcba_1328_hidden_dims: 192 + pcqm4m_g25_hidden_dims: 96 + pcqm4m_n4_hidden_dims: 96 + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + # max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 + # max_num_edges_per_graph: 116 + max_num_nodes: 140 + max_num_edges: 290 + ipu_dataloader_inference_opts: + mode: async + # max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 + # max_num_edges_per_graph: 116 + max_num_nodes: 140 + max_num_edges: 290 + # Data handling-related + batch_size_training: 3 + batch_size_inference: 3 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 320 + + ipu_config: + - deviceIterations(5) # IPU would require large batches to be ready for the model. + - replicationFactor(1) + # - 'setAvailableMemoryProportion({"IPU0": 0.05})' + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + # - TensorLocations.numIOTiles(128) + # - _Popart.set("defaultBufferingDepth", 96) + - _Popart.set("saveInitializersToFile", "weights.onnx") + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(16) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] + # gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
+ task_specific_args: # To be replaced by a new class "DatasetParams" + l1000_vcap: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` + epoch_sampling_fraction: 1.0 + + l1000_mcf7: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcba_1328: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: assayID-* # assayID-* means all columns starting with "assayID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcqm4m_g25: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: graph_* # graph_* means all columns starting with "graph_" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + pcqm4m_n4: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: node_* # node_* means all columns starting with "node_" + # sample_size: 2000 # use sample_size for test + task_level: node + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + seed: 42 + label_normalization: + normalize_val_test: True + 
method: "normal" + epoch_sampling_fraction: 1.0 + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: disk + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + max_num_atoms: 100 + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + num_workers: 32 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. + +#Task-specific +predictor: + metrics_on_progress_bar: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + metrics_on_training_set: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + loss_fun: + l1000_vcap: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: 0.5 + l1000_mcf7: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: ${predictor.loss_fun.l1000_vcap.alpha} + pcba_1328: bce_logits_ipu + pcqm4m_g25: mae_ipu + pcqm4m_n4: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: &max_epochs 100 + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + l1000_vcap: &classif_metrics + - name: auroc + metric: auroc + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + - name: avpr + metric: averageprecision + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + l1000_mcf7: *classif_metrics + pcba_1328: + # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly + - name: auroc + metric: auroc + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + - name: avpr + metric: averageprecision + task: binary + 
multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + pcqm4m_g25: &pcqm_metrics + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2 + metric: r2_score_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + pcqm4m_n4: *pcqm_metrics + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/neurips2023-large/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} + min_epochs: 1 + check_val_every_n_epoch: 20 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: ${dimensions.pre_nn_out_dim} + hidden_dims: ${dimensions.pre_nn_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: ${dimensions.pre_nn_edges_out_dim} + hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: ${dimensions.gnn_out_dim} + hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: ${dimensions.graph_output_nn_out_dims} + hidden_dims: ${dimensions.graph_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: ${dimensions.node_output_nn_out_dims} + hidden_dims: ${dimensions.node_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + l1000_vcap: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_vcap_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + l1000_mcf7: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcba_1328: + task_level: graph + out_dim: 1328 + hidden_dims: ${dimensions.pcba_1328_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_g25: + task_level: graph + out_dim: 25 + hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_n4: + task_level: node + out_dim: 4 + hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + diff --git 
a/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml new file mode 100644 index 000000000..c6719001a --- /dev/null +++ b/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml @@ -0,0 +1,482 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "../../neurips2023-large/" + epochs: 100 + name: SF_671M_sweep_LargeMix_mpnn + wandb: + entity: multitask-gnn + name: SF_671M_sweep_LargeMix_mpnn + project: neurips2023_graphcore_scaling_mpnn + + # This whole sections is for minimizing mistakes for the scaling experiments. + # This is the ONLY place where dimensions ahve to change. + # No other dimensions have to be changed in the architecture part. + +dimensions: + pre_nn_out_dim: 960 + pre_nn_hidden_dims: 3840 + pre_nn_edges_out_dim: 480 + pre_nn_edges_hidden_dims: 1920 + gnn_out_dim: 960 + gnn_hidden_dims: 960 + mpnn_in_dim: 960 + mpnn_out_dim: 960 + mpnn_in_dim_edges: 480 + mpnn_out_dim_edges: 480 + graph_output_nn_out_dims: 960 + graph_output_nn_hidden_dims: 960 + node_output_nn_out_dims: 960 + node_output_nn_hidden_dims: 960 + l1000_vcap_hidden_dims: 480 + l1000_mcf7_hidden_dims: 480 + pcba_1328_hidden_dims: 240 + pcqm4m_g25_hidden_dims: 120 + pcqm4m_n4_hidden_dims: 120 + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + # max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 + # max_num_edges_per_graph: 116 + max_num_nodes: 140 + max_num_edges: 290 + ipu_dataloader_inference_opts: + mode: async + # max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 + # max_num_edges_per_graph: 116 + max_num_nodes: 140 + max_num_edges: 290 + # Data handling-related + batch_size_training: 3 + batch_size_inference: 3 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 320 + + ipu_config: + - deviceIterations(5) # IPU would require large batches to be ready for the model. + - replicationFactor(1) + # - 'setAvailableMemoryProportion({"IPU0": 0.05})' + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + # - TensorLocations.numIOTiles(128) + # - _Popart.set("defaultBufferingDepth", 96) + - _Popart.set("saveInitializersToFile", "weights.onnx") + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(16) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] + # gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] + + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
+ task_specific_args: # To be replaced by a new class "DatasetParams" + l1000_vcap: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` + epoch_sampling_fraction: 1.0 + + l1000_mcf7: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcba_1328: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: assayID-* # assayID-* means all columns starting with "assayID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcqm4m_g25: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: graph_* # graph_* means all columns starting with "graph_" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + pcqm4m_n4: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: node_* # node_* means all columns starting with "node_" + # sample_size: 2000 # use sample_size for test + task_level: node + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + seed: 42 + label_normalization: + normalize_val_test: True + 
method: "normal" + epoch_sampling_fraction: 1.0 + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: disk + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + max_num_atoms: 100 + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + num_workers: 32 # -1 to use all + persistent_workers: True # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. + +#Task-specific +predictor: + metrics_on_progress_bar: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + metrics_on_training_set: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + loss_fun: + l1000_vcap: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: 0.5 + l1000_mcf7: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: ${predictor.loss_fun.l1000_vcap.alpha} + pcba_1328: bce_logits_ipu + pcqm4m_g25: mae_ipu + pcqm4m_n4: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: &max_epochs 100 + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + l1000_vcap: &classif_metrics + - name: auroc + metric: auroc + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + - name: avpr + metric: averageprecision + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + l1000_mcf7: *classif_metrics + pcba_1328: + # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly + - name: auroc + metric: auroc + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + - name: avpr + metric: averageprecision + task: binary + 
multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + pcqm4m_g25: &pcqm_metrics + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2 + metric: r2_score_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + pcqm4m_n4: *pcqm_metrics + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/neurips2023-large/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: models_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} + min_epochs: 1 + check_val_every_n_epoch: 20 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: ${dimensions.pre_nn_out_dim} + hidden_dims: ${dimensions.pre_nn_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: ${dimensions.pre_nn_edges_out_dim} + hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: ${dimensions.gnn_out_dim} + hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: ${dimensions.graph_output_nn_out_dims} + hidden_dims: ${dimensions.graph_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: ${dimensions.node_output_nn_out_dims} + hidden_dims: ${dimensions.node_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + l1000_vcap: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_vcap_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + l1000_mcf7: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcba_1328: + task_level: graph + out_dim: 1328 + hidden_dims: ${dimensions.pcba_1328_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_g25: + task_level: graph + out_dim: 25 + hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_n4: + task_level: node + out_dim: 4 + hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + From 
428847f2a96e1f38b7a0de6edf160d25694170ca Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Thu, 5 Oct 2023 15:01:21 +0000 Subject: [PATCH 36/58] Simple solution to save checkpoints to wandb --- graphium/cli/train_finetune_test.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/graphium/cli/train_finetune_test.py b/graphium/cli/train_finetune_test.py index ffb5a7512..298da8df6 100644 --- a/graphium/cli/train_finetune_test.py +++ b/graphium/cli/train_finetune_test.py @@ -161,6 +161,12 @@ def run_training_finetuning_testing(cfg: DictConfig) -> None: logger.info("-" * 50) if wandb_cfg is not None: + # Save final model state - and upload checkpoint to wandb + if cfg["trainer"]["model_checkpoint"]["save_last"] is True: + checkpoint_path = f"{cfg['trainer']['model_checkpoint']['dirpath']}{cfg['trainer']['model_checkpoint']['filename']}_final_model.ckpt" + torch.save(predictor.model.state_dict(), checkpoint_path) + # Log the final model checkpoint to wandb + wandb.save(checkpoint_path) wandb.finish() # Save test metrics - Base utility in case someone doesn't use a logger. From a2670e51455737961bf6b0b937cf893f8b682084 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Fri, 6 Oct 2023 10:24:02 +0000 Subject: [PATCH 37/58] more explicit checking edges --- graphium/features/featurizer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/graphium/features/featurizer.py b/graphium/features/featurizer.py index 66f241663..0d917b9a6 100644 --- a/graphium/features/featurizer.py +++ b/graphium/features/featurizer.py @@ -1062,11 +1062,15 @@ def mol_to_graph_dict( mol = Chem.AddHs(mol) else: mol = Chem.RemoveHs(mol) - + # SAMUELM: Temp fix + max_num_bonds = 265 num_atoms = mol.GetNumAtoms() + num_bonds = mol.GetNumBonds() if (max_num_atoms is not None) and (num_atoms > max_num_atoms): raise ValueError(f"Maximum number of atoms greater than permitted {num_atoms}>{max_num_atoms}") - + elif (max_num_atoms is not None) and (num_bonds > max_num_bonds): + raise ValueError(f"Maximum number of bonds greater than permitted {num_bonds}>{max_num_bonds}") + else: ( adj, ndata, From 473dd6f7b31469db55303db05eb837fffc73a0cc Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Fri, 6 Oct 2023 10:26:24 +0000 Subject: [PATCH 38/58] Dumb print --- .../foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml | 6 +++--- graphium/config/_loader.py | 2 +- graphium/features/featurizer.py | 2 ++ graphium/finetuning/utils.py | 1 - 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml index a1778670e..7b151aa5e 100644 --- a/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml @@ -5,7 +5,7 @@ constants: seed: 42 raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/neurips2023-large/" + datacache_path: "/net/group/research/kerstink/neurips2023-large/" epochs: 100 name: SF_11M_sweep_LargeMix_mpnn wandb: @@ -231,8 +231,8 @@ predictor: # weight_decay: 1.e-7 torch_scheduler_kwargs: module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 + max_num_epochs: &max_epochs 10 + warmup_epochs: 5 verbose: False scheduler_kwargs: # monitor: &monitor qm9/mae/train diff --git a/graphium/config/_loader.py 
+++ b/graphium/config/_loader.py
@@ -421,7 +421,7 @@ def load_trainer(
         name = wandb_cfg.pop("name", "main")
         if len(date_time_suffix) > 0:
             name += f"_{date_time_suffix}"
-        trainer_kwargs["logger"] = WandbLogger(name=name, **wandb_cfg)
+        trainer_kwargs["logger"] = WandbLogger(name=name, log_model=True, **wandb_cfg)
 
     trainer_kwargs["callbacks"] = callbacks
     trainer = Trainer(
diff --git a/graphium/features/featurizer.py b/graphium/features/featurizer.py
index 0d917b9a6..56e8bd036 100644
--- a/graphium/features/featurizer.py
+++ b/graphium/features/featurizer.py
@@ -1067,8 +1067,10 @@ def mol_to_graph_dict(
     num_atoms = mol.GetNumAtoms()
     num_bonds = mol.GetNumBonds()
     if (max_num_atoms is not None) and (num_atoms > max_num_atoms):
+        logger.info("removing based on atoms")
         raise ValueError(f"Maximum number of atoms greater than permitted {num_atoms}>{max_num_atoms}")
     elif (max_num_atoms is not None) and (num_bonds > max_num_bonds):
+        logger.info("removing based on edges")
         raise ValueError(f"Maximum number of atoms greater than permitted {num_atoms}>{max_num_atoms}")
     else:
     (
diff --git a/graphium/finetuning/utils.py b/graphium/finetuning/utils.py
index ede0f639c..4bb343180 100644
--- a/graphium/finetuning/utils.py
+++ b/graphium/finetuning/utils.py
@@ -45,7 +45,6 @@ def modify_cfg_for_finetuning(cfg: Dict[str, Any]):
     """
     Function combining information from configuration and pretrained model for finetuning.
     """
-    task = cfg["finetuning"]["task"]
 
     # Filter the config based on the task name

From 5700c439b85afcdc78b83f5e0dc159142d273399 Mon Sep 17 00:00:00 2001
From: Sam Maddrell-Mander
Date: Fri, 6 Oct 2023 15:49:07 +0000
Subject: [PATCH 39/58] Sweep Configs

---
 .../SF_11M_config_LargeMix_mpnn_GPS++.yaml | 6 +-
 .../SF_169M_config_LargeMix_mpnn_GPS++.yaml | 4 +-
 .../SF_378M_config_LargeMix_mpnn_GPS++.yaml | 76 +--
 ...378M_config_LargeMix_mpnn_GPS++_debug.yaml | 481 ------------------
 .../SF_42M_config_LargeMix_mpnn_GPS++.yaml | 4 +-
 .../SF_590M_config_LargeMix_mpnn_GPS++.yaml | 12 +-
 graphium/cli/train_finetune_test.py | 10 +-
 graphium/features/featurizer.py | 5 -
 8 files changed, 61 insertions(+), 537 deletions(-)
 delete mode 100644 expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++_debug.yaml

diff --git a/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml
index 7b151aa5e..140bc6ad3 100644
--- a/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml
+++ b/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml
@@ -231,8 +231,8 @@ predictor:
     # weight_decay: 1.e-7
   torch_scheduler_kwargs:
     module_type: WarmUpLinearLR
-    max_num_epochs: &max_epochs 10
-    warmup_epochs: 5
+    max_num_epochs: &max_epochs 100
+    warmup_epochs: 10
     verbose: False
   scheduler_kwargs:
     # monitor: &monitor qm9/mae/train
@@ -302,7 +302,7 @@ trainer:
     name: ${constants.name}
     project: ${constants.name}
   model_checkpoint:
-    dirpath: models_checkpoints/${constants.name}/
+    dirpath: /net/group/all-ipu/graphium/model_checkpoints/${constants.name}/
     filename: ${constants.name}
     # monitor: *monitor
     # mode: *mode
diff --git a/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml
index a645870be..12bdf9806 100644
--- a/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml
+++ b/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml
@@ -195,7 +195,7 @@ datamodule:
         pos_type: rw_return_probs
         ksteps: 16
 
-    num_workers: 32 # -1 to use all
+    num_workers: 16 # -1 to use all
persistent_workers: False # if use persistent worker at the start of each epoch. # Using persistent_workers false might make the start of each epoch very long. @@ -302,7 +302,7 @@ trainer: name: ${constants.name} project: ${constants.name} model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ + dirpath: /net/group/all-ipu/graphium/model_checkpoints/${constants.name}/ filename: ${constants.name} # monitor: *monitor # mode: *mode diff --git a/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml index 490da9370..85668d003 100644 --- a/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml @@ -18,25 +18,25 @@ constants: # No other dimensions have to be changed in the architecture part. dimensions: - pre_nn_out_dim: &gnn_dim 768 # original 256 - pre_nn_hidden_dims: 3072 # original 1024 - pre_nn_edges_out_dim: &gnn_dim_edges 384 # original 128 - pre_nn_edges_hidden_dims: 1536 # original 512 - gnn_out_dim: *gnn_dim # original 256 - gnn_hidden_dims: *gnn_dim # original 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - mpnn_in_dim: *gnn_dim # original 256 - mpnn_out_dim: *gnn_dim # original 256 - mpnn_in_dim_edges: *gnn_dim_edges # original 128 - mpnn_out_dim_edges: *gnn_dim_edges # original 128 - graph_output_nn_out_dims: *gnn_dim # original 256 - graph_output_nn_hidden_dims: *gnn_dim # original 256 - node_output_nn_out_dims: *gnn_dim # original 256 - node_output_nn_hidden_dims: *gnn_dim # original 256 - l1000_vcap_hidden_dims: 384 # original 128 - l1000_mcf7_hidden_dims: 384 # original 128 - pcba_1328_hidden_dims: 192 # original 64 - pcqm4m_g25_hidden_dims: 96 # original 32 - pcqm4m_n4_hidden_dims: 96 # original 32 + pre_nn_out_dim: 768 + pre_nn_hidden_dims: 3072 + pre_nn_edges_out_dim: 384 + pre_nn_edges_hidden_dims: 1536 + gnn_out_dim: 768 + gnn_hidden_dims: 768 + mpnn_in_dim: 768 + mpnn_out_dim: 768 + mpnn_in_dim_edges: 384 + mpnn_out_dim_edges: 384 + graph_output_nn_out_dims: 768 + graph_output_nn_hidden_dims: 768 + node_output_nn_out_dims: 768 + node_output_nn_hidden_dims: 768 + l1000_vcap_hidden_dims: 384 + l1000_mcf7_hidden_dims: 384 + pcba_1328_hidden_dims: 192 + pcqm4m_g25_hidden_dims: 96 + pcqm4m_n4_hidden_dims: 96 accelerator: type: ipu # cpu or ipu or gpu @@ -45,38 +45,43 @@ accelerator: args: ipu_dataloader_training_opts: mode: async - max_num_nodes_per_graph: 56 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 96 + # max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 + # max_num_edges_per_graph: 116 + max_num_nodes: 140 + max_num_edges: 298 ipu_dataloader_inference_opts: mode: async - max_num_nodes_per_graph: 56 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 96 + # max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 + # max_num_edges_per_graph: 116 + max_num_nodes: 140 + max_num_edges: 298 # Data handling-related - batch_size_training: 4 - batch_size_inference: 4 + batch_size_training: 3 + batch_size_inference: 3 predictor: metrics_every_n_train_steps: 1000 optim_kwargs: - loss_scaling: 16000 + loss_scaling: 1024 trainer: trainer: precision: 16-true - accumulate_grad_batches: 240 + accumulate_grad_batches: 320 ipu_config: - deviceIterations(5) # IPU would require large batches to be ready for the model. 
- replicationFactor(1) + # - 'setAvailableMemoryProportion({"IPU0": 0.05})' # - enableProfiling("graph_analyser") # The folder where the profile will be stored # - enableExecutableCaching("pop_compiler_cache") # - TensorLocations.numIOTiles(128) # - _Popart.set("defaultBufferingDepth", 96) + - _Popart.set("saveInitializersToFile", "weights.onnx") - Precision.enableStochasticRounding(True) # - Precision.enableFloatingPointExceptions(True) - ipu_inference_config: # set device iteration and replication factor to 1 during inference # gradient accumulation was set to 1 in the code - - deviceIterations(20) + - deviceIterations(16) - replicationFactor(1) - Precision.enableStochasticRounding(False) @@ -84,8 +89,7 @@ accelerator: _accelerator: "ipu" gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] # gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] - #gnn_layers_per_ipu: [4,4,4,4] - + datamodule: module_type: "MultitaskFromSmilesDataModule" # module_type: "FakeDataModule" # Option to use generated data @@ -196,9 +200,9 @@ datamodule: pos_type: rw_return_probs ksteps: 16 - num_workers: 30 # -1 to use all - persistent_workers: True # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. + num_workers: 16 # -1 to use all + persistent_workers: False # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. #Task-specific predictor: @@ -303,7 +307,7 @@ trainer: name: ${constants.name} project: ${constants.name} model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ + dirpath: /net/group/all-ipu/graphium/model_checkpoints/${constants.name}/ filename: ${constants.name} # monitor: *monitor # mode: *mode @@ -367,7 +371,7 @@ architecture: first_normalization: "layer_norm" #"batch_norm" or "layer_norm" gnn: # Set as null to avoid a post-nn network - #in_dim: 256 # should be consistent with pre_nn.out_dim + # in_dim: 256 # should be consistent with pre_nn.out_dim out_dim: ${dimensions.gnn_out_dim} hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) depth: 16 diff --git a/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++_debug.yaml b/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++_debug.yaml deleted file mode 100644 index d7fe43e6e..000000000 --- a/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++_debug.yaml +++ /dev/null @@ -1,481 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/neurips2023-large/" - epochs: 100 - name: SF_378M_sweep_LargeMix_mpnn - wandb: - entity: multitask-gnn - name: SF_378M_sweep_LargeMix_mpnn - project: neurips2023_graphcore_scaling_mpnn - - # This whole sections is for minimizing mistakes for the scaling experiments. - # This is the ONLY place where dimensions ahve to change. - # No other dimensions have to be changed in the architecture part. 
- -dimensions: - pre_nn_out_dim: 768 - pre_nn_hidden_dims: 3072 - pre_nn_edges_out_dim: 384 - pre_nn_edges_hidden_dims: 1536 - gnn_out_dim: 768 - gnn_hidden_dims: 768 - mpnn_in_dim: 768 - mpnn_out_dim: 768 - mpnn_in_dim_edges: 384 - mpnn_out_dim_edges: 384 - graph_output_nn_out_dims: 768 - graph_output_nn_hidden_dims: 768 - node_output_nn_out_dims: 768 - node_output_nn_hidden_dims: 768 - l1000_vcap_hidden_dims: 384 - l1000_mcf7_hidden_dims: 384 - pcba_1328_hidden_dims: 192 - pcqm4m_g25_hidden_dims: 96 - pcqm4m_n4_hidden_dims: 96 - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - # max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 - # max_num_edges_per_graph: 116 - max_num_nodes: 140 - max_num_edges: 290 - ipu_dataloader_inference_opts: - mode: async - # max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 - # max_num_edges_per_graph: 116 - max_num_nodes: 140 - max_num_edges: 290 - # Data handling-related - batch_size_training: 3 - batch_size_inference: 3 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 320 - - ipu_config: - - deviceIterations(5) # IPU would require large batches to be ready for the model. - - replicationFactor(1) - # - 'setAvailableMemoryProportion({"IPU0": 0.05})' - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - # - TensorLocations.numIOTiles(128) - # - _Popart.set("defaultBufferingDepth", 96) - - _Popart.set("saveInitializersToFile", "weights.onnx") - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(16) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] - # gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - l1000_vcap: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` - epoch_sampling_fraction: 1.0 - - l1000_mcf7: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcba_1328: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: assayID-* # assayID-* means all columns starting with "assayID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcqm4m_g25: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: graph_* # graph_* means all columns starting with "graph_" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - pcqm4m_n4: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: node_* # node_* means all columns starting with "node_" - # sample_size: 2000 # use sample_size for test - task_level: node - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - seed: 42 - label_normalization: - normalize_val_test: True - 
method: "normal" - epoch_sampling_fraction: 1.0 - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: disk - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - max_num_atoms: 100 - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. - -#Task-specific -predictor: - metrics_on_progress_bar: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - metrics_on_training_set: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - loss_fun: - l1000_vcap: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: 0.5 - l1000_mcf7: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: ${predictor.loss_fun.l1000_vcap.alpha} - pcba_1328: bce_logits_ipu - pcqm4m_g25: mae_ipu - pcqm4m_n4: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - l1000_vcap: &classif_metrics - - name: auroc - metric: auroc - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - - name: avpr - metric: averageprecision - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - l1000_mcf7: *classif_metrics - pcba_1328: - # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly - - name: auroc - metric: auroc - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - - name: avpr - metric: averageprecision - task: binary - 
multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - pcqm4m_g25: &pcqm_metrics - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2 - metric: r2_score_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - pcqm4m_n4: *pcqm_metrics - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/neurips2023-large/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} - min_epochs: 1 - check_val_every_n_epoch: 20 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: ${dimensions.pre_nn_out_dim} - hidden_dims: ${dimensions.pre_nn_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: ${dimensions.pre_nn_edges_out_dim} - hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: ${dimensions.gnn_out_dim} - hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: ${dimensions.graph_output_nn_out_dims} - hidden_dims: ${dimensions.graph_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: ${dimensions.node_output_nn_out_dims} - hidden_dims: ${dimensions.node_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - l1000_vcap: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_vcap_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - l1000_mcf7: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcba_1328: - task_level: graph - out_dim: 1328 - hidden_dims: ${dimensions.pcba_1328_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_g25: - task_level: graph - out_dim: 25 - hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_n4: - task_level: node - out_dim: 4 - hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - diff --git 
a/expts/foundation_model/SF_42M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_42M_config_LargeMix_mpnn_GPS++.yaml index a7b42ce53..04c393733 100644 --- a/expts/foundation_model/SF_42M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_42M_config_LargeMix_mpnn_GPS++.yaml @@ -195,7 +195,7 @@ datamodule: pos_type: rw_return_probs ksteps: 16 - num_workers: 32 # -1 to use all + num_workers: 16 # -1 to use all persistent_workers: False # if use persistent worker at the start of each epoch. # Using persistent_workers false might make the start of each epoch very long. @@ -302,7 +302,7 @@ trainer: name: ${constants.name} project: ${constants.name} model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ + dirpath: /net/group/all-ipu/graphium/model_checkpoints/${constants.name}/ filename: ${constants.name} # monitor: *monitor # mode: *mode diff --git a/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml index c6719001a..bf884762b 100644 --- a/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml @@ -7,10 +7,10 @@ constants: raise_train_error: true # Whether the code should raise an error if it crashes during training datacache_path: "../../neurips2023-large/" epochs: 100 - name: SF_671M_sweep_LargeMix_mpnn + name: SF_590M_sweep_LargeMix_mpnn wandb: entity: multitask-gnn - name: SF_671M_sweep_LargeMix_mpnn + name: SF_590M_sweep_LargeMix_mpnn project: neurips2023_graphcore_scaling_mpnn # This whole sections is for minimizing mistakes for the scaling experiments. @@ -48,13 +48,13 @@ accelerator: # max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 # max_num_edges_per_graph: 116 max_num_nodes: 140 - max_num_edges: 290 + max_num_edges: 298 ipu_dataloader_inference_opts: mode: async # max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 # max_num_edges_per_graph: 116 max_num_nodes: 140 - max_num_edges: 290 + max_num_edges: 298 # Data handling-related batch_size_training: 3 batch_size_inference: 3 @@ -201,7 +201,7 @@ datamodule: pos_type: rw_return_probs ksteps: 16 - num_workers: 32 # -1 to use all + num_workers: 16 # -1 to use all persistent_workers: True # if use persistent worker at the start of each epoch. # Using persistent_workers false might make the start of each epoch very long. 
@@ -308,7 +308,7 @@ trainer:
     name: ${constants.name}
     project: ${constants.name}
   model_checkpoint:
-    dirpath: models_checkpoints/${constants.name}/
+    dirpath: /net/group/all-ipu/graphium/model_checkpoints/${constants.name}/
     filename: ${constants.name}
     # monitor: *monitor
     # mode: *mode
diff --git a/graphium/cli/train_finetune_test.py b/graphium/cli/train_finetune_test.py
index 298da8df6..e91fd7681 100644
--- a/graphium/cli/train_finetune_test.py
+++ b/graphium/cli/train_finetune_test.py
@@ -56,6 +56,13 @@ def run_training_finetuning_testing(cfg: DictConfig) -> None:
 
     cfg = OmegaConf.to_container(cfg, resolve=True)
 
+    # Get the current date and time
+    now = datetime.now()
+    # Format the datetime as a string
+    filename_datetime_suffix = now.strftime("%Y%m%d_%H%M%S")
+    # Append the datetime string to the existing filename in the cfg dictionary
+    cfg['trainer']['model_checkpoint']['filename'] += f"_{filename_datetime_suffix}"
+
     dst_dir = cfg["constants"].get("results_dir")
     hydra_cfg = HydraConfig.get()
     output_dir = hydra_cfg["runtime"]["output_dir"]
@@ -163,8 +170,7 @@ def run_training_finetuning_testing(cfg: DictConfig) -> None:
     if wandb_cfg is not None:
         # Save the final model state - and upload checkpoint to wandb
         if cfg["trainer"]["model_checkpoint"]["save_last"] is True:
-            checkpoint_path = f"{cfg['trainer']['model_checkpoint']['dirpath']}{cfg['trainer']['model_checkpoint']['filename']}_final_model.ckpt"
-            torch.save(predictor.model.state_dict(), checkpoint_path)
+            checkpoint_path = f"{cfg['trainer']['model_checkpoint']['dirpath']}{cfg['trainer']['model_checkpoint']['filename']}.ckpt"
             # Log the final model checkpoint to wandb
             wandb.save(checkpoint_path)
         wandb.finish()
diff --git a/graphium/features/featurizer.py b/graphium/features/featurizer.py
index 56e8bd036..cb19e3b4b 100644
--- a/graphium/features/featurizer.py
+++ b/graphium/features/featurizer.py
@@ -1067,12 +1067,7 @@ def mol_to_graph_dict(
     num_atoms = mol.GetNumAtoms()
     num_bonds = mol.GetNumBonds()
     if (max_num_atoms is not None) and (num_atoms > max_num_atoms):
-        logger.info("removing based on atoms")
         raise ValueError(f"Maximum number of atoms greater than permitted {num_atoms}>{max_num_atoms}")
-    elif (max_num_atoms is not None) and (num_bonds > max_num_bonds):
-        logger.info("removing based on edges")
-        raise ValueError(f"Maximum number of atoms greater than permitted {num_atoms}>{max_num_atoms}")
-    else:
     (
         adj,
         ndata,

From cee7adb4cd16d7e9956f190dd211fcc4e66bcf5a Mon Sep 17 00:00:00 2001
From: Sam Maddrell-Mander
Date: Mon, 9 Oct 2023 15:35:52 +0000
Subject: [PATCH 40/58] Adding the edge residual, making node residual more explicit, and adding activation scaling

---
 graphium/config/_loader.py | 2 +-
 graphium/nn/pyg_layers/gps_pyg.py | 58 +++++++++++++++++++++++++++++--
 2 files changed, 57 insertions(+), 3 deletions(-)

diff --git a/graphium/config/_loader.py b/graphium/config/_loader.py
index 0c9b42aa5..9861f7436 100644
--- a/graphium/config/_loader.py
+++ b/graphium/config/_loader.py
@@ -76,7 +76,6 @@ def _get_ipu_opts(config: Union[omegaconf.DictConfig, Dict[str, Any]]) -> Tuple[
 
     if accelerator_type != "ipu":
         return None, None
-
     ipu_opts = accelerator_options["ipu_config"]
     ipu_inference_opts = accelerator_options.get("ipu_inference_config", None)
 
@@ -126,6 +125,7 @@ def load_datamodule(
             ipu_inference_opts=ipu_inference_opts,
             precision=config["trainer"]["trainer"].get("precision"),
         )
 
+        # Define the Dataloader options for the IPU on the training sets
         bz_train = cfg_data["batch_size_training"]
         ipu_dataloader_training_opts = IPUDataloaderOptions(
diff --git a/graphium/nn/pyg_layers/gps_pyg.py b/graphium/nn/pyg_layers/gps_pyg.py
index 555d8bef0..5b00c3d07 100644
--- a/graphium/nn/pyg_layers/gps_pyg.py
+++ b/graphium/nn/pyg_layers/gps_pyg.py
@@ -47,9 +47,10 @@ def __init__(
         activation: Union[Callable, str] = "relu",
         dropout: float = 0.0,
         node_residual: Optional[bool] = True,
+        edge_residual: Optional[bool] = True,
         normalization: Union[str, Callable] = "none",
         mpnn_type: str = "pyg:gine",
-        mpnn_kwargs=None,
+        mpnn_kwargs: Optional[dict] = None,
         attn_type: str = "full-attention",
         precision: str = "32",
         biased_attention_key: Optional[str] = None,
@@ -57,6 +58,7 @@ def __init__(
         droppath_rate_attn: float = 0.0,
         droppath_rate_ffn: float = 0.0,
         hidden_dim_scaling: float = 4.0,
+        output_scale: float = 1.0,
         **kwargs,
     ):
         r"""
@@ -99,6 +101,9 @@ def __init__(
             node_residual:
                 If node residual is used after on the gnn layer output
 
+            edge_residual:
+                If edge residual is used after on the gnn layer output
+
             normalization:
                 Normalization to use. Choices:
 
@@ -140,6 +145,10 @@ def __init__(
             attn_kwargs:
                 Keyword arguments to pass to the attention layer
+
+            output_scale:
+                Float value that will be used to scale the activations, helps reudce growth of activations
+                as the model gets deeper. Default value of 1.0 leaves the layer unchanged.
 
         """
 
@@ -165,6 +174,7 @@ def __init__(
 
         # Residual connections
        self.node_residual = node_residual
+        self.edge_residual = edge_residual
 
        self.precision = precision
 
@@ -189,6 +199,38 @@ def __init__(
         # Initialize the MPNN and Attention layers
         self.mpnn = self._parse_mpnn_layer(mpnn_type, mpnn_kwargs)
         self.attn_layer = self._parse_attn_layer(attn_type, self.biased_attention_key, attn_kwargs)
+
+        self.output_scale = torch.tensor(output_scale)
+        self.use_edges = self.mpnn.use_edges
+
+
+    def residual_add(self, feature: Tensor, input_feature: Tensor) -> Tensor:
+        r"""
+        Residual additition layer. Allows information to propagate through the model
+        by skipping the computational layers.
+        Parameters:
+            feature: The feature (typically nodes or edges) after message passing
+            input_feature: The same feature from before message passing
+        Returns:
+            The addition of the two tensors.
+        """
+        feature += input_feature
+        return feature
+
+    def scale_activations(self, feature: Tensor, scale_factor: Tensor) -> Tensor:
+        """Scale Activations by a constant factor to stop growth of activation scale
+        and reduce numerical stability issues at low precision
+
+        Args:
+            feature (Tensor): The feature to scale
+            scale_factor (float): The floating point scale factor
+
+        Returns:
+            Tensor: The scaled features
+        """
+        feature *= scale_factor.to(dtype=feature.dtype)
+        return feature
+
+
     def forward(self, batch: Batch) -> Batch:
         r"""
@@ -200,6 +242,8 @@ def forward(self, batch: Batch) -> Batch:
         """
         # pe, feat, edge_index, edge_feat = batch.pos_enc_feats_sign_flip, batch.feat, batch.edge_index, batch.edge_feat
         feat = batch.feat
+        # TODO: samuelm - check if edges are being used here
+        edges_feat_in = batch.edge_feat
 
         feat_in = feat  # for first residual connection
 
@@ -208,10 +252,20 @@ def forward(self, batch: Batch) -> Batch:
         if self.mpnn is not None:
             batch_out = self.mpnn(batch_out)
             h_local = batch_out.feat
+            e_local = batch_out.edge_feat
             if self.dropout_local is not None:
                 h_local = self.dropout_local(h_local)
+            # Apply the residual connection for the node features
             if self.node_residual:
-                h_local = feat_in + h_local  # Residual connection for nodes, not used in gps++.
+ h_local = self.residual_add(h_local, feat_in) + # Scale the activations by some value to help reduce activation growth + h_local = self.scale_activations(h_local, self.output_scale) + # Apply the residual connection for the edge features + if self.edge_residual: + e_local = self.residual_add(e_local, edges_feat_in) + # Scale the activations by some value to help reduce activation growth + e_local = self.scale_activations(e_local, self.output_scale) + if self.norm_layer_local is not None: h_local = self.norm_layer_local(h_local) From 19df1c3e9950c3164e643c5aafb12deecf06d4c3 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Mon, 9 Oct 2023 15:42:16 +0000 Subject: [PATCH 41/58] linting --- graphium/nn/pyg_layers/gps_pyg.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/graphium/nn/pyg_layers/gps_pyg.py b/graphium/nn/pyg_layers/gps_pyg.py index 5b00c3d07..f232d2954 100644 --- a/graphium/nn/pyg_layers/gps_pyg.py +++ b/graphium/nn/pyg_layers/gps_pyg.py @@ -145,10 +145,10 @@ def __init__( attn_kwargs: Keyword arguments to pass to the attention layer - + output_scale: Float value that will be used to scale the activations, helps reudce growth of activations - as the model gets deeper. Default value of 1.0 leaves the layer unchanged. + as the model gets deeper. Default value of 1.0 leaves the layer unchanged. """ @@ -199,15 +199,14 @@ def __init__( # Initialize the MPNN and Attention layers self.mpnn = self._parse_mpnn_layer(mpnn_type, mpnn_kwargs) self.attn_layer = self._parse_attn_layer(attn_type, self.biased_attention_key, attn_kwargs) - + self.output_scale = torch.tensor(output_scale) self.use_edges = self.mpnn.use_edges - def residual_add(self, feature: Tensor, input_feature: Tensor) -> Tensor: r""" Residual additition layer. Allows information to propagate through the model - by skipping the computational layers. + by skipping the computational layers. 
Parameters: feature: The feature (typically nodes or edges) after message passing input_feature: The same feature from before message passing @@ -216,21 +215,20 @@ def residual_add(self, feature: Tensor, input_feature: Tensor) -> Tensor: """ feature += input_feature return feature - + def scale_activations(self, feature: Tensor, scale_factor: Tensor) -> Tensor: """Scale Activations by a constant factor to stop growth of activation scale and reduce numerical stability issues at low precision Args: feature (Tensor): The feature to scale - scale_factor (float): The floating point scale factor + scale_factor (float): The floating point scale factor Returns: Tensor: The scaled features """ feature *= scale_factor.to(dtype=feature.dtype) return feature - def forward(self, batch: Batch) -> Batch: r""" @@ -255,17 +253,17 @@ def forward(self, batch: Batch) -> Batch: e_local = batch_out.edge_feat if self.dropout_local is not None: h_local = self.dropout_local(h_local) - # Apply the residual connection for the node features + # Apply the residual connection for the node features if self.node_residual: h_local = self.residual_add(h_local, feat_in) # Scale the activations by some value to help reduce activation growth h_local = self.scale_activations(h_local, self.output_scale) - # Apply the residual connection for the edge features + # Apply the residual connection for the edge features if self.edge_residual: e_local = self.residual_add(e_local, edges_feat_in) # Scale the activations by some value to help reduce activation growth - e_local = self.scale_activations(e_local, self.output_scale) - + e_local = self.scale_activations(e_local, self.output_scale) + if self.norm_layer_local is not None: h_local = self.norm_layer_local(h_local) From f6274b016a66af3e0920c5b12979786a94166697 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Mon, 9 Oct 2023 16:20:39 +0000 Subject: [PATCH 42/58] Fixing use_edges --- graphium/nn/pyg_layers/gps_pyg.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/graphium/nn/pyg_layers/gps_pyg.py b/graphium/nn/pyg_layers/gps_pyg.py index f232d2954..d96c4ee1c 100644 --- a/graphium/nn/pyg_layers/gps_pyg.py +++ b/graphium/nn/pyg_layers/gps_pyg.py @@ -201,7 +201,7 @@ def __init__( self.attn_layer = self._parse_attn_layer(attn_type, self.biased_attention_key, attn_kwargs) self.output_scale = torch.tensor(output_scale) - self.use_edges = self.mpnn.use_edges + self.use_edges = True if self.in_dim_edges is not None else False def residual_add(self, feature: Tensor, input_feature: Tensor) -> Tensor: r""" @@ -241,7 +241,8 @@ def forward(self, batch: Batch) -> Batch: # pe, feat, edge_index, edge_feat = batch.pos_enc_feats_sign_flip, batch.feat, batch.edge_index, batch.edge_feat feat = batch.feat # TODO: samuelm - check if edges are being used here - edges_feat_in = batch.edge_feat + if self.use_edges: + edges_feat_in = batch.edge_feat feat_in = feat # for first residual connection @@ -259,10 +260,11 @@ def forward(self, batch: Batch) -> Batch: # Scale the activations by some value to help reduce activation growth h_local = self.scale_activations(h_local, self.output_scale) # Apply the residual connection for the edge features - if self.edge_residual: + if self.edge_residual and self.use_edges: e_local = self.residual_add(e_local, edges_feat_in) # Scale the activations by some value to help reduce activation growth - e_local = self.scale_activations(e_local, self.output_scale) + if self.use_edges: + e_local = self.scale_activations(e_local, 
self.output_scale) if self.norm_layer_local is not None: h_local = self.norm_layer_local(h_local) From 3ac163ce756949c6b45ee7a63979e08afd867a0c Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Mon, 9 Oct 2023 16:22:55 +0000 Subject: [PATCH 43/58] remove todo --- graphium/nn/pyg_layers/gps_pyg.py | 1 - 1 file changed, 1 deletion(-) diff --git a/graphium/nn/pyg_layers/gps_pyg.py b/graphium/nn/pyg_layers/gps_pyg.py index d96c4ee1c..530f42d32 100644 --- a/graphium/nn/pyg_layers/gps_pyg.py +++ b/graphium/nn/pyg_layers/gps_pyg.py @@ -240,7 +240,6 @@ def forward(self, batch: Batch) -> Batch: """ # pe, feat, edge_index, edge_feat = batch.pos_enc_feats_sign_flip, batch.feat, batch.edge_index, batch.edge_feat feat = batch.feat - # TODO: samuelm - check if edges are being used here if self.use_edges: edges_feat_in = batch.edge_feat From 400fa98d54b3e61d8e684013eb82db73ba0d9c9e Mon Sep 17 00:00:00 2001 From: Ikko Eltociear Ashimine Date: Tue, 10 Oct 2023 01:52:34 +0900 Subject: [PATCH 44/58] Fix typo in README.md permamently -> permanently --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 11b707bba..a83f7ab40 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,7 @@ To change parameters specific to this experiment like switching from `fp16` to ` ```bash graphium-train dataset=toymix model=gcn trainer.trainer.precision=32 ``` -or change them permamently in the dedicated experiment config under `expts/hydra-configs/toymix_gcn.yaml`. +or change them permanently in the dedicated experiment config under `expts/hydra-configs/toymix_gcn.yaml`. Integrating `hydra` also allows you to quickly switch between accelerators. E.g., running ```bash graphium-train dataset=toymix model=gcn accelerator=gpu From eef2f0f81b1a0f446c57a10468b42789502c174d Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Mon, 9 Oct 2023 18:15:39 +0000 Subject: [PATCH 45/58] Device requirement for IPU training --- graphium/nn/pyg_layers/gps_pyg.py | 64 +++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 4 deletions(-) diff --git a/graphium/nn/pyg_layers/gps_pyg.py b/graphium/nn/pyg_layers/gps_pyg.py index f3da56979..2a649373b 100644 --- a/graphium/nn/pyg_layers/gps_pyg.py +++ b/graphium/nn/pyg_layers/gps_pyg.py @@ -47,9 +47,10 @@ def __init__( activation: Union[Callable, str] = "relu", dropout: float = 0.0, node_residual: Optional[bool] = True, + edge_residual: Optional[bool] = True, normalization: Union[str, Callable] = "none", mpnn_type: str = "pyg:gine", - mpnn_kwargs=None, + mpnn_kwargs: Optional[dict] = None, attn_type: str = "full-attention", precision: str = "32", biased_attention_key: Optional[str] = None, @@ -57,6 +58,7 @@ def __init__( droppath_rate_attn: float = 0.0, droppath_rate_ffn: float = 0.0, hidden_dim_scaling: float = 4.0, + output_scale: float = 1.0, **kwargs, ): r""" @@ -99,6 +101,9 @@ def __init__( node_residual: If node residual is used after on the gnn layer output + edge_residual: + If edge residual is used after on the gnn layer output + normalization: Normalization to use. Choices: @@ -141,6 +146,10 @@ def __init__( attn_kwargs: Keyword arguments to pass to the attention layer + output_scale: + Float value that will be used to scale the activations, helps reudce growth of activations + as the model gets deeper. Default value of 1.0 leaves the layer unchanged. 
+ """ super().__init__( @@ -165,6 +174,7 @@ def __init__( # Residual connections self.node_residual = node_residual + self.edge_residual = edge_residual self.precision = precision @@ -190,6 +200,37 @@ def __init__( self.mpnn = self._parse_mpnn_layer(mpnn_type, mpnn_kwargs) self.attn_layer = self._parse_attn_layer(attn_type, self.biased_attention_key, attn_kwargs) + self.output_scale = output_scale + self.use_edges = True if self.in_dim_edges is not None else False + + def residual_add(self, feature: Tensor, input_feature: Tensor) -> Tensor: + r""" + Residual additition layer. Allows information to propagate through the model + by skipping the computational layers. + Parameters: + feature: The feature (typically nodes or edges) after message passing + input_feature: The same feature from before message passing + Returns: + The addition of the two tensors. + """ + feature += input_feature + return feature + + def scale_activations(self, feature: Tensor, scale_factor: Tensor) -> Tensor: + """Scale Activations by a constant factor to stop growth of activation scale + and reduce numerical stability issues at low precision + + Args: + feature (Tensor): The feature to scale + scale_factor (float): The floating point scale factor + + Returns: + Tensor: The scaled features + """ + scale_factor = torch.tensor(scale_factor).to(feature.device) + feature *= scale_factor.to(dtype=feature.dtype) + return feature + def forward(self, batch: Batch) -> Batch: r""" forward function of the layer @@ -200,6 +241,8 @@ def forward(self, batch: Batch) -> Batch: """ # pe, feat, edge_index, edge_feat = batch.pos_enc_feats_sign_flip, batch.feat, batch.edge_index, batch.edge_feat feat = batch.feat + if self.use_edges: + edges_feat_in = batch.edge_feat feat_in = feat # for first residual connection @@ -208,10 +251,21 @@ def forward(self, batch: Batch) -> Batch: if self.mpnn is not None: batch_out = self.mpnn(batch_out) h_local = batch_out.feat + e_local = batch_out.edge_feat if self.dropout_local is not None: h_local = self.dropout_local(h_local) + # Apply the residual connection for the node features if self.node_residual: - h_local = feat_in + h_local # Residual connection for nodes, not used in gps++. 
+ h_local = self.residual_add(h_local, feat_in) + # Scale the activations by some value to help reduce activation growth + h_local = self.scale_activations(h_local, self.output_scale) + # Apply the residual connection for the edge features + if self.edge_residual and self.use_edges: + e_local = self.residual_add(e_local, edges_feat_in) + # Scale the activations by some value to help reduce activation growth + if self.use_edges: + e_local = self.scale_activations(e_local, self.output_scale) + if self.norm_layer_local is not None: h_local = self.norm_layer_local(h_local) @@ -240,7 +294,7 @@ def forward(self, batch: Batch) -> Batch: def _parse_mpnn_layer(self, mpnn_type, mpnn_kwargs: Dict[str, Any]) -> Optional[Module]: """Parse the MPNN layer.""" - if mpnn_type is None: + if mpnn_type is None or mpnn_type == "none": return mpnn_kwargs = deepcopy(mpnn_kwargs) @@ -375,7 +429,7 @@ def _self_attention_block(self, feat: Tensor, feat_in: Tensor, batch: Batch) -> ) attn_bias = None - if self.biased_attention_key is not None: + if self.biased_attention_key is not None and self.biased_attention_key != "none": attn_bias = batch[self.biased_attention_key] # h_dense[num_graphs, max_num_nodes, hidden_dim] -> feat_attn[num_graphs, max_num_nodes, hidden_dim] @@ -463,6 +517,8 @@ def layer_outputs_edges(self) -> bool: bool: Always ``False`` for the current class """ + if self.mpnn is None: + return False return self.mpnn.layer_outputs_edges @property From 7154c0afc494b4e6d87995a1d8e8cdf6fb40ce76 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Mon, 9 Oct 2023 18:17:56 +0000 Subject: [PATCH 46/58] Config changes --- expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml index 140bc6ad3..251bdae15 100644 --- a/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml @@ -380,7 +380,9 @@ architecture: use_virtual_edges: true layer_type: 'pyg:gps' layer_kwargs: - node_residual: false + node_residual: True + edge_residual: True + output_scale: 1.0 mpnn_type: 'pyg:mpnnplus' mpnn_kwargs: in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) From 1133ea50f0d9f9a465b6d5dfd7c73ac6cbbc8e36 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Mon, 9 Oct 2023 18:18:47 +0000 Subject: [PATCH 47/58] Device changes --- graphium/nn/pyg_layers/gps_pyg.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/graphium/nn/pyg_layers/gps_pyg.py b/graphium/nn/pyg_layers/gps_pyg.py index 530f42d32..2a649373b 100644 --- a/graphium/nn/pyg_layers/gps_pyg.py +++ b/graphium/nn/pyg_layers/gps_pyg.py @@ -200,7 +200,7 @@ def __init__( self.mpnn = self._parse_mpnn_layer(mpnn_type, mpnn_kwargs) self.attn_layer = self._parse_attn_layer(attn_type, self.biased_attention_key, attn_kwargs) - self.output_scale = torch.tensor(output_scale) + self.output_scale = output_scale self.use_edges = True if self.in_dim_edges is not None else False def residual_add(self, feature: Tensor, input_feature: Tensor) -> Tensor: @@ -227,6 +227,7 @@ def scale_activations(self, feature: Tensor, scale_factor: Tensor) -> Tensor: Returns: Tensor: The scaled features """ + scale_factor = torch.tensor(scale_factor).to(feature.device) feature *= scale_factor.to(dtype=feature.dtype) return feature From 
a66c546ae4e2ec633b3bbc40259bb76af820fd5c Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Mon, 9 Oct 2023 18:19:50 +0000 Subject: [PATCH 48/58] typo - review resolved --- graphium/nn/pyg_layers/gps_pyg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphium/nn/pyg_layers/gps_pyg.py b/graphium/nn/pyg_layers/gps_pyg.py index 2a649373b..35adb27d8 100644 --- a/graphium/nn/pyg_layers/gps_pyg.py +++ b/graphium/nn/pyg_layers/gps_pyg.py @@ -147,7 +147,7 @@ def __init__( Keyword arguments to pass to the attention layer output_scale: - Float value that will be used to scale the activations, helps reudce growth of activations + Float value that will be used to scale the activations, helps reduce growth of activations as the model gets deeper. Default value of 1.0 leaves the layer unchanged. """ From 16100a7990dc4cb156a282956a792995b124321e Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Tue, 10 Oct 2023 09:51:38 +0000 Subject: [PATCH 49/58] Node + edge residual example --- .../foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml index 12bdf9806..02b99e0bf 100644 --- a/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml @@ -380,8 +380,10 @@ architecture: use_virtual_edges: true layer_type: 'pyg:gps' layer_kwargs: - node_residual: false mpnn_type: 'pyg:mpnnplus' + node_residual: True + edge_residual: True + output_scale: 1.0 mpnn_kwargs: in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) From b949e700b66318c56008ea6c567854ee5d497303 Mon Sep 17 00:00:00 2001 From: kerstink-GC Date: Tue, 10 Oct 2023 10:20:53 +0000 Subject: [PATCH 50/58] cleaned up configs --- .../100M_config_LargeMix_mpnn_GPS++.yaml | 462 ----------------- .../200M_config_LargeMix_mpnn_GPS++.yaml | 462 ----------------- .../400M_config_LargeMix_mpnn_GPS++.yaml | 463 ----------------- .../50M_config_LargeMix_mpnn_GPS++.yaml | 462 ----------------- .../SF_11M_config_LargeMix_mpnn_GPS++.yaml | 6 +- .../SF_169M_config_LargeMix_mpnn_GPS++.yaml | 4 +- .../SF_378M_config_LargeMix_mpnn_GPS++.yaml | 2 +- .../SF_590M_config_LargeMix_mpnn_GPS++.yaml | 2 +- .../SF_671M_config_LargeMix_mpnn_GPS++.yaml | 482 ------------------ ..._config_LargeMix_mpnn_GPS++_compiling.yaml | 482 ------------------ 10 files changed, 7 insertions(+), 2820 deletions(-) delete mode 100644 expts/foundation_model/100M_config_LargeMix_mpnn_GPS++.yaml delete mode 100644 expts/foundation_model/200M_config_LargeMix_mpnn_GPS++.yaml delete mode 100644 expts/foundation_model/400M_config_LargeMix_mpnn_GPS++.yaml delete mode 100644 expts/foundation_model/50M_config_LargeMix_mpnn_GPS++.yaml delete mode 100644 expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml delete mode 100644 expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++_compiling.yaml diff --git a/expts/foundation_model/100M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/100M_config_LargeMix_mpnn_GPS++.yaml deleted file mode 100644 index c8eefd4d2..000000000 --- a/expts/foundation_model/100M_config_LargeMix_mpnn_GPS++.yaml +++ /dev/null @@ -1,462 +0,0 @@ -# Running the mpnn model with the 
largemix dataset on IPU. - -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/neurips2023-large/" - epochs: 100 - name: LargeMix_mpnn_100M - wandb: - entity: multitask-gnn - name: LargeMix_mpnn_100M - project: neurips2023_graphcore_scaling_mpnn - - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 50 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 100 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 55 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 100 - # Data handling-related - batch_size_training: 12 - batch_size_inference: 12 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 12 - - ipu_config: - - deviceIterations(30) # IPU would require large batches to be ready for the model. - - replicationFactor(4) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [4,4,4,4] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - l1000_vcap: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` - epoch_sampling_fraction: 1.0 - - l1000_mcf7: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcba_1328: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: assayID-* # assayID-* means all columns starting with "assayID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcqm4m_g25: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: graph_* # graph_* means all columns starting with "graph_" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - pcqm4m_n4: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: node_* # node_* means all columns starting with "node_" - # sample_size: 2000 # use sample_size for test - task_level: node - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - seed: 42 - label_normalization: - normalize_val_test: True - 
method: "normal" - epoch_sampling_fraction: 1.0 - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: disk - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. - -#Task-specific -predictor: - metrics_on_progress_bar: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - metrics_on_training_set: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - loss_fun: - l1000_vcap: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: 0.5 - l1000_mcf7: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: ${predictor.loss_fun.l1000_vcap.alpha} - pcba_1328: bce_logits_ipu - pcqm4m_g25: mae_ipu - pcqm4m_n4: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - l1000_vcap: &classif_metrics - - name: auroc - metric: auroc - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - - name: avpr - metric: averageprecision - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - l1000_mcf7: *classif_metrics - pcba_1328: - # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly - - name: auroc - metric: auroc - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - - name: avpr - metric: averageprecision - task: binary - multitask_handling: 
mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - pcqm4m_g25: &pcqm_metrics - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2 - metric: r2_score_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - pcqm4m_n4: *pcqm_metrics - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/neurips2023-large/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} - min_epochs: 1 - check_val_every_n_epoch: 20 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: &gnn_dim 428 - hidden_dims: 1024 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: 128 - hidden_dims: 512 - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: *gnn_dim - hidden_dims: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - l1000_vcap: - task_level: graph - out_dim: 2934 - hidden_dims: 128 - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - l1000_mcf7: - task_level: graph - out_dim: 2934 - hidden_dims: 128 - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcba_1328: - task_level: graph - out_dim: 1328 - hidden_dims: 64 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_g25: - task_level: graph - out_dim: 25 - hidden_dims: 32 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_n4: - task_level: node - out_dim: 4 - hidden_dims: 32 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - - # task_heads: - # homolumo: - # task_level: graph - # out_dim: 1 - # hidden_dims: 256 - # depth: 2 - # activation: relu - # last_activation: none - # dropout: *dropout - # normalization: *normalization - # last_normalization: "none" - # residual_type: none diff --git a/expts/foundation_model/200M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/200M_config_LargeMix_mpnn_GPS++.yaml deleted file mode 100644 index 
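
For reference, the removed 100M config above stacks several multipliers that together set how many graphs one IPU host step consumes: batch_size_training 12, deviceIterations(30), replicationFactor(4) and accumulate_grad_batches 12. A minimal sketch of that arithmetic, assuming the usual PopTorch convention that these factors simply multiply the micro-batch; the helper name is illustrative and the exact accounting inside Graphium's IPU plugin may differ:

    # Illustrative helper, not a Graphium API: graphs consumed per host step
    # under the deleted 100M LargeMix config.
    def graphs_per_host_step(micro_batch, device_iterations, replication, grad_accum):
        # Each weight update sees micro_batch * replication * grad_accum graphs;
        # device_iterations such updates run per call into the IPU.
        return micro_batch * device_iterations * replication * grad_accum

    print(graphs_per_host_step(12, 30, 4, 12))  # 17280 graphs per host step
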
298d68109..000000000 --- a/expts/foundation_model/200M_config_LargeMix_mpnn_GPS++.yaml +++ /dev/null @@ -1,462 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/neurips2023-large/" - epochs: 100 - name: LargeMix_mpnn_200M - wandb: - entity: multitask-gnn - name: LargeMix_mpnn_200M - project: neurips2023_graphcore_scaling_mpnn - - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 116 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 116 - # Data handling-related - batch_size_training: 8 - batch_size_inference: 8 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 16 - - ipu_config: - - deviceIterations(30) # IPU would require large batches to be ready for the model. - - replicationFactor(2) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - l1000_vcap: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` - epoch_sampling_fraction: 1.0 - - l1000_mcf7: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcba_1328: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: assayID-* # assayID-* means all columns starting with "assayID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcqm4m_g25: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: graph_* # graph_* means all columns starting with "graph_" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - pcqm4m_n4: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: node_* # node_* means all columns starting with "node_" - # sample_size: 2000 # use sample_size for test - task_level: node - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - seed: 42 - label_normalization: - normalize_val_test: True - 
method: "normal" - epoch_sampling_fraction: 1.0 - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: disk - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. - -#Task-specific -predictor: - metrics_on_progress_bar: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - metrics_on_training_set: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - loss_fun: - l1000_vcap: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: 0.5 - l1000_mcf7: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: ${predictor.loss_fun.l1000_vcap.alpha} - pcba_1328: bce_logits_ipu - pcqm4m_g25: mae_ipu - pcqm4m_n4: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - l1000_vcap: &classif_metrics - - name: auroc - metric: auroc - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - - name: avpr - metric: averageprecision - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - l1000_mcf7: *classif_metrics - pcba_1328: - # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly - - name: auroc - metric: auroc - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - - name: avpr - metric: averageprecision - task: binary - multitask_handling: 
mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - pcqm4m_g25: &pcqm_metrics - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2 - metric: r2_score_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - pcqm4m_n4: *pcqm_metrics - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/neurips2023-large/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} - min_epochs: 1 - check_val_every_n_epoch: 20 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: &gnn_dim 628 - hidden_dims: 1024 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: 128 - hidden_dims: 512 - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: *gnn_dim - hidden_dims: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - l1000_vcap: - task_level: graph - out_dim: 2934 - hidden_dims: 128 - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - l1000_mcf7: - task_level: graph - out_dim: 2934 - hidden_dims: 128 - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcba_1328: - task_level: graph - out_dim: 1328 - hidden_dims: 64 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_g25: - task_level: graph - out_dim: 25 - hidden_dims: 32 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_n4: - task_level: node - out_dim: 4 - hidden_dims: 32 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - - # task_heads: - # homolumo: - # task_level: graph - # out_dim: 1 - # hidden_dims: 256 - # depth: 2 - # activation: relu - # last_activation: none - # dropout: *dropout - # normalization: *normalization - # last_normalization: "none" - # residual_type: none diff --git a/expts/foundation_model/400M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/400M_config_LargeMix_mpnn_GPS++.yaml deleted file mode 100644 index 
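
The main difference between these deleted scaling configs is how the 16 GPS++ layers are pipelined: gnn_layers_per_ipu is [4,4,4,4] at 100M, [2,2,2,2,2,2,2,2] at 200M and sixteen ones at 400M. A small sketch of how such a list could map layers to pipeline stages, assuming gnn_layers_per_ipu simply lists consecutive layer counts per device (illustrative only, not Graphium's actual pipelining code):

    # Illustrative only: turn a gnn_layers_per_ipu list into layer -> IPU assignments.
    def layer_to_ipu(gnn_layers_per_ipu):
        mapping, layer = {}, 0
        for ipu_id, n_layers in enumerate(gnn_layers_per_ipu):
            for _ in range(n_layers):
                mapping[layer] = ipu_id
                layer += 1
        return mapping

    assert layer_to_ipu([4, 4, 4, 4]) == {i: i // 4 for i in range(16)}  # 100M split
    assert layer_to_ipu([2] * 8) == {i: i // 2 for i in range(16)}       # 200M split
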
08820d330..000000000 --- a/expts/foundation_model/400M_config_LargeMix_mpnn_GPS++.yaml +++ /dev/null @@ -1,463 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/neurips2023-large/" - epochs: 100 - name: LargeMix_mpnn_400M - wandb: - entity: multitask-gnn - name: LargeMix_mpnn_400M - project: neurips2023_graphcore_scaling_mpnn - - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 70 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 150 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 70 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 150 - # Data handling-related - batch_size_training: 4 - batch_size_inference: 4 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 32 - - ipu_config: - - deviceIterations(30) # IPU would require large batches to be ready for the model. - - replicationFactor(1) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] - #gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - l1000_vcap: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` - epoch_sampling_fraction: 1.0 - - l1000_mcf7: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcba_1328: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: assayID-* # assayID-* means all columns starting with "assayID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcqm4m_g25: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: graph_* # graph_* means all columns starting with "graph_" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - pcqm4m_n4: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: node_* # node_* means all columns starting with "node_" - # sample_size: 2000 # use sample_size for test - task_level: node - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - seed: 42 - label_normalization: - normalize_val_test: True - 
method: "normal" - epoch_sampling_fraction: 1.0 - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: disk - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. - -#Task-specific -predictor: - metrics_on_progress_bar: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - metrics_on_training_set: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - loss_fun: - l1000_vcap: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: 0.5 - l1000_mcf7: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: ${predictor.loss_fun.l1000_vcap.alpha} - pcba_1328: bce_logits_ipu - pcqm4m_g25: mae_ipu - pcqm4m_n4: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - l1000_vcap: &classif_metrics - - name: auroc - metric: auroc - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - - name: avpr - metric: averageprecision - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - l1000_mcf7: *classif_metrics - pcba_1328: - # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly - - name: auroc - metric: auroc - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - - name: avpr - metric: averageprecision - task: binary - multitask_handling: 
mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - pcqm4m_g25: &pcqm_metrics - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2 - metric: r2_score_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - pcqm4m_n4: *pcqm_metrics - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/neurips2023-large/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} - min_epochs: 1 - check_val_every_n_epoch: 20 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: &gnn_dim 910 - hidden_dims: 1024 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: 128 - hidden_dims: 512 - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: *gnn_dim - hidden_dims: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - l1000_vcap: - task_level: graph - out_dim: 2934 - hidden_dims: 128 - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - l1000_mcf7: - task_level: graph - out_dim: 2934 - hidden_dims: 128 - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcba_1328: - task_level: graph - out_dim: 1328 - hidden_dims: 64 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_g25: - task_level: graph - out_dim: 25 - hidden_dims: 32 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_n4: - task_level: node - out_dim: 4 - hidden_dims: 32 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - - # task_heads: - # homolumo: - # task_level: graph - # out_dim: 1 - # hidden_dims: 256 - # depth: 2 - # activation: relu - # last_activation: none - # dropout: *dropout - # normalization: *normalization - # last_normalization: "none" - # residual_type: none diff --git a/expts/foundation_model/50M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/50M_config_LargeMix_mpnn_GPS++.yaml deleted file mode 100644 index 
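
Every task in these configs selects its label columns with a prefix wildcard (label_cols: geneID-*, assayID-*, graph_*, node_*). A minimal sketch of how such a pattern can be expanded against a dataframe header, assuming the wildcard is an ordinary glob-style prefix match; the real resolution happens inside Graphium's datamodule and may be more general:

    import fnmatch

    # Illustrative only: expand a "geneID-*"-style pattern against column names.
    def expand_label_cols(pattern, columns):
        return [col for col in columns if fnmatch.fnmatch(col, pattern)]

    cols = ["SMILES", "geneID-1", "geneID-2", "assayID-7", "graph_homo"]
    print(expand_label_cols("geneID-*", cols))  # ['geneID-1', 'geneID-2']
    print(expand_label_cols("graph_*", cols))   # ['graph_homo']
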
b75d7d2e0..000000000 --- a/expts/foundation_model/50M_config_LargeMix_mpnn_GPS++.yaml +++ /dev/null @@ -1,462 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/neurips2023-large/" - epochs: 100 - name: LargeMix_mpnn_50M - wandb: - entity: multitask-gnn - name: LargeMix_mpnn_50M - project: neurips2023_graphcore_scaling_mpnn - - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 40 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 100 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 45 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 100 - # Data handling-related - batch_size_training: 16 - batch_size_inference: 16 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 16 - - ipu_config: - - deviceIterations(30) # IPU would require large batches to be ready for the model. - - replicationFactor(4) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [4,4,4,4] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - l1000_vcap: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` - epoch_sampling_fraction: 1.0 - - l1000_mcf7: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcba_1328: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: assayID-* # assayID-* means all columns starting with "assayID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcqm4m_g25: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: graph_* # graph_* means all columns starting with "graph_" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - pcqm4m_n4: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: node_* # node_* means all columns starting with "node_" - # sample_size: 2000 # use sample_size for test - task_level: node - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - seed: 42 - label_normalization: - normalize_val_test: True - 
method: "normal" - epoch_sampling_fraction: 1.0 - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: disk - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. - -#Task-specific -predictor: - metrics_on_progress_bar: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - metrics_on_training_set: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - loss_fun: - l1000_vcap: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: 0.5 - l1000_mcf7: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: ${predictor.loss_fun.l1000_vcap.alpha} - pcba_1328: bce_logits_ipu - pcqm4m_g25: mae_ipu - pcqm4m_n4: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - l1000_vcap: &classif_metrics - - name: auroc - metric: auroc - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - - name: avpr - metric: averageprecision - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - l1000_mcf7: *classif_metrics - pcba_1328: - # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly - - name: auroc - metric: auroc - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - - name: avpr - metric: averageprecision - task: binary - multitask_handling: 
mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - pcqm4m_g25: &pcqm_metrics - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2 - metric: r2_score_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - pcqm4m_n4: *pcqm_metrics - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/neurips2023-large/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} - min_epochs: 1 - check_val_every_n_epoch: 20 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: &gnn_dim 282 - hidden_dims: 1024 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: 128 - hidden_dims: 512 - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: *gnn_dim - hidden_dims: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: *gnn_dim # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - l1000_vcap: - task_level: graph - out_dim: 2934 - hidden_dims: 128 - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - l1000_mcf7: - task_level: graph - out_dim: 2934 - hidden_dims: 128 - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcba_1328: - task_level: graph - out_dim: 1328 - hidden_dims: 64 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_g25: - task_level: graph - out_dim: 25 - hidden_dims: 32 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_n4: - task_level: node - out_dim: 4 - hidden_dims: 32 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - - # task_heads: - # homolumo: - # task_level: graph - # out_dim: 1 - # hidden_dims: 256 - # depth: 2 - # activation: relu - # last_activation: none - # dropout: *dropout - # normalization: *normalization - # last_normalization: "none" - # residual_type: none diff --git a/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml index 140bc6ad3..3580c5c35 100644 --- 
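
Across the 50M/100M/200M/400M files removed above, the single scaling knob is the YAML anchor on the pre-NN width (out_dim: &gnn_dim 282, 428, 628 and 910 respectively); every other width references it through *gnn_dim, so one number sets the whole model size. A short sketch of that mechanism on a toy snippet, shown with PyYAML (which resolves anchors and aliases at load time) rather than the real config:

    import yaml  # PyYAML resolves &anchor / *alias references when loading

    toy = "pre_nn:\n  out_dim: &gnn_dim 282\ngnn:\n  out_dim: *gnn_dim\n  hidden_dims: *gnn_dim\n"
    cfg = yaml.safe_load(toy)
    assert cfg["gnn"]["out_dim"] == cfg["gnn"]["hidden_dims"] == cfg["pre_nn"]["out_dim"] == 282
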
a/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml @@ -5,7 +5,7 @@ constants: seed: 42 raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/net/group/research/kerstink/neurips2023-large/" + datacache_path: "/localdata/neurips2023-large/" epochs: 100 name: SF_11M_sweep_LargeMix_mpnn wandb: @@ -195,7 +195,7 @@ datamodule: pos_type: rw_return_probs ksteps: 16 - num_workers: 32 # -1 to use all + num_workers: 16 # -1 to use all persistent_workers: False # if use persistent worker at the start of each epoch. # Using persistent_workers false might make the start of each epoch very long. @@ -302,7 +302,7 @@ trainer: name: ${constants.name} project: ${constants.name} model_checkpoint: - dirpath: /net/group/all-ipu/graphium/model_checkpoints/${constants.name}/ + dirpath: ./models_checkpoints/${constants.name}/ filename: ${constants.name} # monitor: *monitor # mode: *mode diff --git a/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml index 12bdf9806..9d7aad096 100644 --- a/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml @@ -57,7 +57,7 @@ accelerator: predictor: metrics_every_n_train_steps: 1000 optim_kwargs: - loss_scaling: 1024 + loss_scaling: 8192 trainer: trainer: precision: 16-true @@ -79,7 +79,7 @@ accelerator: - deviceIterations(30) - replicationFactor(1) - Precision.enableStochasticRounding(False) - + accelerator_kwargs: _accelerator: "ipu" #gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] diff --git a/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml index 85668d003..6dba7e4ad 100644 --- a/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml @@ -61,7 +61,7 @@ accelerator: predictor: metrics_every_n_train_steps: 1000 optim_kwargs: - loss_scaling: 1024 + loss_scaling: 1 trainer: trainer: precision: 16-true diff --git a/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml index bf884762b..5e537c039 100644 --- a/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml @@ -5,7 +5,7 @@ constants: seed: 42 raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "../../neurips2023-large/" + datacache_path: "/localdata/neurips2023-large/" epochs: 100 name: SF_590M_sweep_LargeMix_mpnn wandb: diff --git a/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml deleted file mode 100644 index c6719001a..000000000 --- a/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml +++ /dev/null @@ -1,482 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. 
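
The hunks above retune loss_scaling per model size (8192 for the 169M run, 1 for the 378M run, 1024 elsewhere) while training at precision: 16-true. A generic sketch of static loss scaling, assuming the usual recipe of scaling the loss before backward and unscaling the gradients before the optimizer step; how Graphium's predictor and PopTorch apply the option internally may differ:

    import torch

    # Illustrative static loss scaling, not the Graphium/PopTorch implementation:
    # scale the loss so small fp16 gradients do not flush to zero, then divide
    # the accumulated gradients back before the optimizer step.
    def scaled_backward(loss, parameters, loss_scaling=1024.0):
        (loss * loss_scaling).backward()
        for p in parameters:
            if p.grad is not None:
                p.grad.div_(loss_scaling)

    model = torch.nn.Linear(4, 1)
    loss = model(torch.randn(8, 4)).pow(2).mean()
    scaled_backward(loss, model.parameters(), loss_scaling=8192.0)
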
- -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "../../neurips2023-large/" - epochs: 100 - name: SF_671M_sweep_LargeMix_mpnn - wandb: - entity: multitask-gnn - name: SF_671M_sweep_LargeMix_mpnn - project: neurips2023_graphcore_scaling_mpnn - - # This whole sections is for minimizing mistakes for the scaling experiments. - # This is the ONLY place where dimensions ahve to change. - # No other dimensions have to be changed in the architecture part. - -dimensions: - pre_nn_out_dim: 960 - pre_nn_hidden_dims: 3840 - pre_nn_edges_out_dim: 480 - pre_nn_edges_hidden_dims: 1920 - gnn_out_dim: 960 - gnn_hidden_dims: 960 - mpnn_in_dim: 960 - mpnn_out_dim: 960 - mpnn_in_dim_edges: 480 - mpnn_out_dim_edges: 480 - graph_output_nn_out_dims: 960 - graph_output_nn_hidden_dims: 960 - node_output_nn_out_dims: 960 - node_output_nn_hidden_dims: 960 - l1000_vcap_hidden_dims: 480 - l1000_mcf7_hidden_dims: 480 - pcba_1328_hidden_dims: 240 - pcqm4m_g25_hidden_dims: 120 - pcqm4m_n4_hidden_dims: 120 - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - # max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 - # max_num_edges_per_graph: 116 - max_num_nodes: 140 - max_num_edges: 290 - ipu_dataloader_inference_opts: - mode: async - # max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 - # max_num_edges_per_graph: 116 - max_num_nodes: 140 - max_num_edges: 290 - # Data handling-related - batch_size_training: 3 - batch_size_inference: 3 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 320 - - ipu_config: - - deviceIterations(5) # IPU would require large batches to be ready for the model. - - replicationFactor(1) - # - 'setAvailableMemoryProportion({"IPU0": 0.05})' - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - # - TensorLocations.numIOTiles(128) - # - _Popart.set("defaultBufferingDepth", 96) - - _Popart.set("saveInitializersToFile", "weights.onnx") - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(16) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] - # gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] - - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - l1000_vcap: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` - epoch_sampling_fraction: 1.0 - - l1000_mcf7: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcba_1328: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: assayID-* # assayID-* means all columns starting with "assayID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcqm4m_g25: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: graph_* # graph_* means all columns starting with "graph_" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - pcqm4m_n4: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: node_* # node_* means all columns starting with "node_" - # sample_size: 2000 # use sample_size for test - task_level: node - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - seed: 42 - label_normalization: - normalize_val_test: True - 
method: "normal" - epoch_sampling_fraction: 1.0 - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: disk - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - max_num_atoms: 100 - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - num_workers: 32 # -1 to use all - persistent_workers: True # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. - -#Task-specific -predictor: - metrics_on_progress_bar: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - metrics_on_training_set: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - loss_fun: - l1000_vcap: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: 0.5 - l1000_mcf7: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: ${predictor.loss_fun.l1000_vcap.alpha} - pcba_1328: bce_logits_ipu - pcqm4m_g25: mae_ipu - pcqm4m_n4: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - l1000_vcap: &classif_metrics - - name: auroc - metric: auroc - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - - name: avpr - metric: averageprecision - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - l1000_mcf7: *classif_metrics - pcba_1328: - # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly - - name: auroc - metric: auroc - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - - name: avpr - metric: averageprecision - task: binary - 
multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - pcqm4m_g25: &pcqm_metrics - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2 - metric: r2_score_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - pcqm4m_n4: *pcqm_metrics - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/neurips2023-large/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} - min_epochs: 1 - check_val_every_n_epoch: 20 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: ${dimensions.pre_nn_out_dim} - hidden_dims: ${dimensions.pre_nn_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: ${dimensions.pre_nn_edges_out_dim} - hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: ${dimensions.gnn_out_dim} - hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: ${dimensions.graph_output_nn_out_dims} - hidden_dims: ${dimensions.graph_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: ${dimensions.node_output_nn_out_dims} - hidden_dims: ${dimensions.node_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - l1000_vcap: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_vcap_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - l1000_mcf7: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcba_1328: - task_level: graph - out_dim: 1328 - hidden_dims: ${dimensions.pcba_1328_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_g25: - task_level: graph - out_dim: 25 - hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_n4: - task_level: node - out_dim: 4 - hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - diff --git 
a/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++_compiling.yaml b/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++_compiling.yaml deleted file mode 100644 index 01b0bf315..000000000 --- a/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++_compiling.yaml +++ /dev/null @@ -1,482 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/net/group/research/kerstink/neurips2023-large/" - epochs: 100 - name: SF_671M_sweep_LargeMix_mpnn - wandb: - entity: multitask-gnn - name: SF_671M_sweep_LargeMix_mpnn - project: neurips2023_graphcore_scaling_mpnn - - # This whole sections is for minimizing mistakes for the scaling experiments. - # This is the ONLY place where dimensions ahve to change. - # No other dimensions have to be changed in the architecture part. - -dimensions: - pre_nn_out_dim: &gnn_dim 1024 # original 256 - pre_nn_hidden_dims: 4096 # original 1024 - pre_nn_edges_out_dim: &gnn_dim_edges 512 # original 128 - pre_nn_edges_hidden_dims: 2048 # original 512 - gnn_out_dim: *gnn_dim # original 256 - gnn_hidden_dims: *gnn_dim # original 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - mpnn_in_dim: *gnn_dim # original 256 - mpnn_out_dim: *gnn_dim # original 256 - mpnn_in_dim_edges: *gnn_dim_edges # original 128 - mpnn_out_dim_edges: *gnn_dim_edges # original 128 - graph_output_nn_out_dims: *gnn_dim # original 256 - graph_output_nn_hidden_dims: *gnn_dim # original 256 - node_output_nn_out_dims: *gnn_dim # original 256 - node_output_nn_hidden_dims: *gnn_dim # original 256 - l1000_vcap_hidden_dims: 512 # original 128 - l1000_mcf7_hidden_dims: 512 # original 128 - pcba_1328_hidden_dims: 256 # original 64 - pcqm4m_g25_hidden_dims: 128 # original 32 - pcqm4m_n4_hidden_dims: 128 # original 32 - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - # max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 - # max_num_edges_per_graph: 116 - max_num_nodes: 128 - max_num_edges: 272 - ipu_dataloader_inference_opts: - mode: async - # max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 - # max_num_edges_per_graph: 116 - max_num_nodes: 128 - max_num_edges: 272 - # Data handling-related - batch_size_training: 3 - batch_size_inference: 2 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 320 - - ipu_config: - - deviceIterations(1) # IPU would require large batches to be ready for the model. 
- - replicationFactor(1) - - 'setAvailableMemoryProportion({"IPU0": 0.1})' - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - # - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - _Popart.set("saveInitializersToFile", "weights.onnx") - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] - # gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] - - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. - task_specific_args: # To be replaced by a new class "DatasetParams" - l1000_vcap: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` - epoch_sampling_fraction: 1.0 - - l1000_mcf7: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcba_1328: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: assayID-* # assayID-* means all columns starting with "assayID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcqm4m_g25: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: graph_* # graph_* means all columns starting with "graph_" - # sample_size: 2000 # use sample_size for test - task_level: graph 
- splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - pcqm4m_n4: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: node_* # node_* means all columns starting with "node_" - # sample_size: 2000 # use sample_size for test - task_level: node - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - seed: 42 - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: disk - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - max_num_atoms: 100 - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. 
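[Editorial note: the `rw_pos` featurization above requests `rw_return_probs` with `ksteps: 16`, i.e. for every node the probability that a k-step random walk returns to its start node, for k = 1..16. A minimal dense-matrix sketch of that quantity is shown below; it assumes a small symmetric adjacency matrix and only illustrates the definition, it is not Graphium's actual featurizer.]

import torch

def random_walk_return_probs(adj: torch.Tensor, ksteps: int = 16) -> torch.Tensor:
    # Row-normalize the adjacency matrix to get the random-walk transition matrix.
    deg = adj.sum(dim=1, keepdim=True).clamp(min=1.0)
    transition = adj / deg

    probs = []
    walk = torch.eye(adj.shape[0])
    for _ in range(ksteps):
        walk = walk @ transition
        probs.append(torch.diagonal(walk))  # probability of being back at the start node after k steps
    return torch.stack(probs, dim=1)  # shape: [num_nodes, ksteps]

# Toy 3-node path graph, for illustration only.
adj = torch.tensor([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [0.0, 1.0, 0.0]])
pe = random_walk_return_probs(adj, ksteps=16)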
- -#Task-specific -predictor: - metrics_on_progress_bar: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - metrics_on_training_set: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - loss_fun: - l1000_vcap: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: 0.5 - l1000_mcf7: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: ${predictor.loss_fun.l1000_vcap.alpha} - pcba_1328: bce_logits_ipu - pcqm4m_g25: mae_ipu - pcqm4m_n4: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - l1000_vcap: &classif_metrics - - name: auroc - metric: auroc - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - - name: avpr - metric: averageprecision - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - l1000_mcf7: *classif_metrics - pcba_1328: - # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly - - name: auroc - metric: auroc - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - - name: avpr - metric: averageprecision - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - pcqm4m_g25: &pcqm_metrics - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2 - metric: r2_score_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - pcqm4m_n4: *pcqm_metrics - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/neurips2023-large/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} - min_epochs: 1 - check_val_every_n_epoch: 20 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: ${dimensions.pre_nn_out_dim} - hidden_dims: ${dimensions.pre_nn_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: ${dimensions.pre_nn_edges_out_dim} - hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - 
la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: ${dimensions.gnn_out_dim} - hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: ${dimensions.graph_output_nn_out_dims} - hidden_dims: ${dimensions.graph_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: ${dimensions.node_output_nn_out_dims} - hidden_dims: ${dimensions.node_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - l1000_vcap: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_vcap_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - l1000_mcf7: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcba_1328: - task_level: graph - out_dim: 1328 - hidden_dims: ${dimensions.pcba_1328_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_g25: - task_level: graph - out_dim: 25 - hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - 
residual_type: none - pcqm4m_n4: - task_level: node - out_dim: 4 - hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - From ba48d3e40429f910e545a816de61d895a555a88f Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Wed, 11 Oct 2023 10:10:32 +0000 Subject: [PATCH 51/58] Simple attempt at logging epochs --- expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml | 3 ++- graphium/trainer/predictor.py | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml index 251bdae15..ad803ffa2 100644 --- a/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml @@ -6,6 +6,7 @@ constants: seed: 42 raise_train_error: true # Whether the code should raise an error if it crashes during training datacache_path: "/net/group/research/kerstink/neurips2023-large/" + # datacache_path: "/localdata/neurips2023-large/" epochs: 100 name: SF_11M_sweep_LargeMix_mpnn wandb: @@ -64,7 +65,7 @@ accelerator: accumulate_grad_batches: 2 ipu_config: - - deviceIterations(30) # IPU would require large batches to be ready for the model. + - deviceIterations(16) # IPU would require large batches to be ready for the model. - replicationFactor(16) # - enableProfiling("graph_analyser") # The folder where the profile will be stored # - enableExecutableCaching("pop_compiler_cache") diff --git a/graphium/trainer/predictor.py b/graphium/trainer/predictor.py index c4e700895..6f8939e8a 100644 --- a/graphium/trainer/predictor.py +++ b/graphium/trainer/predictor.py @@ -461,6 +461,9 @@ def on_train_batch_end(self, outputs, batch: Any, batch_idx: int) -> None: # Get the metrics that are logged at every step (loss, grad_norm, batch_time, batch_tput) concatenated_metrics_logs = {} concatenated_metrics_logs["train/loss"] = outputs["loss"] + concatenated_metrics_logs["epoch_count"] = self.current_epoch + # TODO: Samuelm - we need a number of samples here as well if this works? 
+ # import ipdb; ipdb.set_trace() # report the training loss for each individual tasks for task in self.tasks: From d3d1ff3cac93be78ef1d2a2ed667686987b719b0 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Thu, 12 Oct 2023 15:23:52 +0000 Subject: [PATCH 52/58] Samples seen --- graphium/cli/train_finetune_test.py | 51 +++++++++++++++++++++++++++++ graphium/config/_loader.py | 9 +++++ graphium/trainer/predictor.py | 17 ++++++++-- 3 files changed, 75 insertions(+), 2 deletions(-) diff --git a/graphium/cli/train_finetune_test.py b/graphium/cli/train_finetune_test.py index e91fd7681..044bba3e6 100644 --- a/graphium/cli/train_finetune_test.py +++ b/graphium/cli/train_finetune_test.py @@ -48,6 +48,48 @@ def cli(cfg: DictConfig) -> None: """ return run_training_finetuning_testing(cfg) +def get_replication_factor(cfg): + try: + ipu_config = cfg.get("accelerator", {}).get("ipu_config", []) + for item in ipu_config: + if "replicationFactor" in item: + # Extract the number between parentheses + start = item.find("(") + 1 + end = item.find(")") + if start != 0 and end != -1: + return int(item[start:end]) + except Exception as e: + print(f"An error occurred: {e}") + + # Return default value if replicationFactor is not found or an error occurred + return 1 + +def get_gradient_accumulation_factor(cfg): + try: + # Navigate through the nested dictionaries and get the gradient accumulation factor + grad_accumulation_factor = cfg.get("accelerator", {}).get("config_override", {}).get("trainer", {}).get("trainer", {}).get("accumulate_grad_batches", 1) + + # Ensure that the extracted value is an integer + return int(grad_accumulation_factor) + except Exception as e: + print(f"An error occurred: {e}") + + # Return default value if an error occurred + return 1 + +def get_training_batch_size(cfg): + try: + # Navigate through the nested dictionaries and get the training batch size + batch_size_training = cfg.get("accelerator", {}).get("config_override", {}).get("datamodule", {}).get("args", {}).get("batch_size_training", 1) + + # Ensure that the extracted value is an integer + return int(batch_size_training) + except Exception as e: + print(f"An error occurred: {e}") + + # Return default value if an error occurred + return 1 + def run_training_finetuning_testing(cfg: DictConfig) -> None: """ @@ -80,6 +122,12 @@ def run_training_finetuning_testing(cfg: DictConfig) -> None: st = timeit.default_timer() + replicas = get_replication_factor(cfg) + gradient_acc = get_gradient_accumulation_factor(cfg) + micro_bs = get_training_batch_size(cfg) + + global_bs = replicas * gradient_acc * micro_bs + # Disable wandb if the user is not logged in. 
wandb_cfg = cfg["constants"].get("wandb") if wandb_cfg is not None and wandb.login() is False: @@ -124,6 +172,9 @@ def run_training_finetuning_testing(cfg: DictConfig) -> None: accelerator_type=accelerator_type, featurization=datamodule.featurization, task_norms=datamodule.task_norms, + replicas=replicas, + gradient_acc=gradient_acc, + global_bs=global_bs, ) logger.info(predictor.model) diff --git a/graphium/config/_loader.py b/graphium/config/_loader.py index 0764492ff..342e9f869 100644 --- a/graphium/config/_loader.py +++ b/graphium/config/_loader.py @@ -289,6 +289,9 @@ def load_predictor( accelerator_type: str, featurization: Dict[str, str] = None, task_norms: Optional[Dict[Callable, Any]] = None, + replicas: int = 1, + gradient_acc: int = 1, + global_bs: int = 1, ) -> PredictorModule: """ Defining the predictor module, which handles the training logic from `lightning.LighningModule` @@ -314,6 +317,9 @@ def load_predictor( task_levels=task_levels, featurization=featurization, task_norms=task_norms, + replicas=replicas, + gradient_acc=gradient_acc, + global_bs=global_bs, **cfg_pred, ) @@ -327,6 +333,9 @@ def load_predictor( model_class=model_class, model_kwargs=scaled_model_kwargs, metrics=metrics, + replicas=replicas, + gradient_acc=gradient_acc, + global_bs=global_bs, **cfg_pred, ) diff --git a/graphium/trainer/predictor.py b/graphium/trainer/predictor.py index 6f8939e8a..9be10eee0 100644 --- a/graphium/trainer/predictor.py +++ b/graphium/trainer/predictor.py @@ -46,6 +46,9 @@ def __init__( flag_kwargs: Dict[str, Any] = None, task_norms: Optional[Dict[Callable, Any]] = None, metrics_every_n_train_steps: Optional[int] = None, + replicas: int = 1, + gradient_acc: int = 1, + global_bs: Optional[int] = 1, ): """ The Lightning module responsible for handling the predictions, losses, metrics, optimization, etc. @@ -175,6 +178,9 @@ def __init__( self.metrics_every_n_train_steps = metrics_every_n_train_steps # Wether save preds and targets for each training step. + self.samples_seen = 0 + self.global_bs = global_bs + def forward( self, inputs: Dict ) -> Dict[str, Union[Tensor, Dict[str, Tensor], Dict[str, Dict[str, Tensor]]]]: @@ -377,6 +383,9 @@ def _general_step(self, batch: Dict[str, Tensor], step_name: str, to_cpu: bool) # print("loss ", self.global_step, self.current_epoch, loss) step_dict["task_losses"] = task_losses step_dict["gradient_norm"] = self.get_gradient_norm() + # samuelm + # self.samples_seen += 1 + # step_dict["samples_seen"] = self.samples_seen return step_dict def flag_step(self, batch: Dict[str, Tensor], step_name: str, to_cpu: bool) -> Dict[str, Any]: @@ -446,6 +455,9 @@ def flag_step(self, batch: Dict[str, Tensor], step_name: str, to_cpu: bool) -> D step_dict[f"loss/{step_name}"] = loss.detach().cpu() step_dict["loss"] = loss step_dict["task_losses"] = task_losses + # samuelm + # self.samples_seen += 1 + # step_dict["samples_seen"] = self.samples_seen return step_dict def on_train_batch_start(self, batch: Any, batch_idx: int) -> Optional[int]: @@ -462,8 +474,9 @@ def on_train_batch_end(self, outputs, batch: Any, batch_idx: int) -> None: concatenated_metrics_logs = {} concatenated_metrics_logs["train/loss"] = outputs["loss"] concatenated_metrics_logs["epoch_count"] = self.current_epoch - # TODO: Samuelm - we need a number of samples here as well if this works? 
- # import ipdb; ipdb.set_trace() + # Incriment by the batch size + self.samples_seen += self.global_bs + concatenated_metrics_logs["samples_seen"] = self.samples_seen # report the training loss for each individual tasks for task in self.tasks: From 127f66ba15a1e95156aa48f96986cb1b59374669 Mon Sep 17 00:00:00 2001 From: kerstink-GC Date: Thu, 12 Oct 2023 15:26:03 +0000 Subject: [PATCH 53/58] updated configs for merge --- .../SF_169M_config_LargeMix_mpnn_GPS++.yaml | 29 +- .../SF_378M_config_LargeMix_mpnn_GPS++.yaml | 4 +- .../SF_590M_config_LargeMix_mpnn_GPS++.yaml | 2 + .../SF_671M_config_LargeMix_mpnn_GPS++.yaml | 486 ++++++++++++++++++ 4 files changed, 506 insertions(+), 15 deletions(-) create mode 100644 expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml diff --git a/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml index 0b0a562a0..8cbc58681 100644 --- a/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml @@ -45,27 +45,27 @@ accelerator: args: ipu_dataloader_training_opts: mode: async - max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 116 + max_num_nodes: 140 + max_num_edges: 298 ipu_dataloader_inference_opts: mode: async - max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 116 + max_num_nodes: 140 + max_num_edges: 298 # Data handling-related - batch_size_training: 8 - batch_size_inference: 8 + batch_size_training: 3 + batch_size_inference: 3 predictor: metrics_every_n_train_steps: 1000 optim_kwargs: - loss_scaling: 8192 + loss_scaling: 1024 trainer: trainer: precision: 16-true - accumulate_grad_batches: 16 + accumulate_grad_batches: 320 ipu_config: - - deviceIterations(30) # IPU would require large batches to be ready for the model. - - replicationFactor(2) + - deviceIterations(5) # IPU would require large batches to be ready for the model. 
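[Editorial note: the samples-seen bookkeeping introduced in the two patches above amounts to parsing the replication factor out of the `ipu_config` strings, multiplying it by the gradient-accumulation factor and the micro-batch size to obtain a global batch size, and adding that number to a running counter at the end of every training batch. A condensed, self-contained sketch of the same arithmetic follows, using a regex instead of the string slicing in `get_replication_factor`, with values borrowed from the SF_11M config purely for illustration.]

import re

def parse_replication_factor(ipu_config):
    # e.g. "replicationFactor(16)" -> 16; default to 1 when the option is absent.
    for opt in ipu_config:
        match = re.search(r"replicationFactor\((\d+)\)", opt)
        if match:
            return int(match.group(1))
    return 1

replicas = parse_replication_factor(["deviceIterations(16)", "replicationFactor(16)"])
gradient_acc = 2   # trainer.trainer.accumulate_grad_batches in the SF_11M config
micro_bs = 30      # datamodule.args.batch_size_training in the SF_11M config
global_bs = replicas * gradient_acc * micro_bs  # 16 * 2 * 30 = 960

samples_seen = 0
for _ in range(5):             # pretend five training batches have completed
    samples_seen += global_bs  # mirrors the increment in on_train_batch_end
print(samples_seen)            # 4800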
+ - replicationFactor(1) # - enableProfiling("graph_analyser") # The folder where the profile will be stored # - enableExecutableCaching("pop_compiler_cache") - TensorLocations.numIOTiles(128) @@ -82,8 +82,8 @@ accelerator: accelerator_kwargs: _accelerator: "ipu" - #gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] - gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] + gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] + #gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] #gnn_layers_per_ipu: [4,4,4,4] datamodule: @@ -176,6 +176,7 @@ datamodule: add_self_loop: False explicit_H: False # if H is included use_bonds_weights: False + max_num_atoms: 100 pos_encoding_as_features: # encoder dropout 0.18 pos_types: lap_eigvec: @@ -381,8 +382,8 @@ architecture: layer_type: 'pyg:gps' layer_kwargs: mpnn_type: 'pyg:mpnnplus' - node_residual: True - edge_residual: True + node_residual: false + edge_residual: false output_scale: 1.0 mpnn_kwargs: in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) diff --git a/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml index 6dba7e4ad..692c8e0de 100644 --- a/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml @@ -61,7 +61,7 @@ accelerator: predictor: metrics_every_n_train_steps: 1000 optim_kwargs: - loss_scaling: 1 + loss_scaling: 1024 trainer: trainer: precision: 16-true @@ -386,6 +386,8 @@ architecture: layer_type: 'pyg:gps' layer_kwargs: node_residual: false + edge_residual: false + output_scale: 1.0 mpnn_type: 'pyg:mpnnplus' mpnn_kwargs: in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) diff --git a/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml index 5e537c039..944f6eb0b 100644 --- a/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml +++ b/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml @@ -387,6 +387,8 @@ architecture: layer_type: 'pyg:gps' layer_kwargs: node_residual: false + edge_residual: false + output_scale: 1.0 mpnn_type: 'pyg:mpnnplus' mpnn_kwargs: in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) diff --git a/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml new file mode 100644 index 000000000..6bfc447c4 --- /dev/null +++ b/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml @@ -0,0 +1,486 @@ +# Running the mpnn model with the largemix dataset on IPU. + +# @package _global_ + +constants: + seed: 42 + raise_train_error: true # Whether the code should raise an error if it crashes during training + datacache_path: "/localdata/neurips2023-large/" + epochs: 100 + name: SF_671M_sweep_LargeMix_mpnn + wandb: + entity: multitask-gnn + name: SF_671M_sweep_LargeMix_mpnn + project: neurips2023_graphcore_scaling_mpnn + + # This whole sections is for minimizing mistakes for the scaling experiments. + # This is the ONLY place where dimensions ahve to change. + # No other dimensions have to be changed in the architecture part. 
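[Editorial note: the `gnn_layers_per_ipu` lists above describe how the 16 GPS++ layers (`gnn.depth: 16`) are pipelined across IPUs: sixteen stages of one layer each for the largest models, versus `[2,2,2,2,2,2,2,2]` or `[4,4,4,4]` for smaller ones. A tiny sanity-check helper along the lines below can catch a mismatch between the split and the depth before compilation; it is an illustrative snippet, not part of Graphium.]

from typing import Sequence

def check_pipeline_split(gnn_layers_per_ipu: Sequence[int], gnn_depth: int) -> None:
    # Every pipeline stage must hold at least one layer, and the stages together
    # must account for exactly the number of GNN layers in the architecture.
    if any(n < 1 for n in gnn_layers_per_ipu):
        raise ValueError("Each IPU must be assigned at least one GNN layer")
    if sum(gnn_layers_per_ipu) != gnn_depth:
        raise ValueError(
            f"Split {list(gnn_layers_per_ipu)} covers {sum(gnn_layers_per_ipu)} layers, "
            f"but the GNN depth is {gnn_depth}"
        )

check_pipeline_split([1] * 16, gnn_depth=16)      # the 16-stage split used above
check_pipeline_split([4, 4, 4, 4], gnn_depth=16)  # the 4-IPU split commented out elsewhere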
+ +dimensions: + pre_nn_out_dim: 1024 + pre_nn_hidden_dims: 4096 + pre_nn_edges_out_dim: 512 + pre_nn_edges_hidden_dims: 2048 + gnn_out_dim: 1024 + gnn_hidden_dims: 1024 + mpnn_in_dim: 1024 + mpnn_out_dim: 1024 + mpnn_in_dim_edges: 512 + mpnn_out_dim_edges: 512 + graph_output_nn_out_dims: 1024 + graph_output_nn_hidden_dims: 1024 + node_output_nn_out_dims: 1024 + node_output_nn_hidden_dims: 1024 + l1000_vcap_hidden_dims: 512 + l1000_mcf7_hidden_dims: 512 + pcba_1328_hidden_dims: 256 + pcqm4m_g25_hidden_dims: 128 + pcqm4m_n4_hidden_dims: 128 + +accelerator: + type: ipu # cpu or ipu or gpu + config_override: + datamodule: + args: + ipu_dataloader_training_opts: + mode: async + # max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 + # max_num_edges_per_graph: 116 + max_num_nodes: 144 + max_num_edges: 304 + ipu_dataloader_inference_opts: + mode: async + # max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 + # max_num_edges_per_graph: 116 + max_num_nodes: 144 + max_num_edges: 304 + # Data handling-related + batch_size_training: 3 + batch_size_inference: 2 + predictor: + metrics_every_n_train_steps: 1000 + optim_kwargs: + loss_scaling: 1024 + trainer: + trainer: + precision: 16-true + accumulate_grad_batches: 320 + + ipu_config: + - deviceIterations(5) # IPU would require large batches to be ready for the model. + - replicationFactor(1) + # - 'setAvailableMemoryProportion({"IPU0": 0.05})' + # - enableProfiling("graph_analyser") # The folder where the profile will be stored + # - enableExecutableCaching("pop_compiler_cache") + # - TensorLocations.numIOTiles(128) + # - _Popart.set("defaultBufferingDepth", 96) + - _Popart.set("saveInitializersToFile", "weights.onnx") + - Precision.enableStochasticRounding(True) + # - Precision.enableFloatingPointExceptions(True) + - 'setAvailableMemoryProportion({"IPU0": 0.1})' + + ipu_inference_config: + # set device iteration and replication factor to 1 during inference + # gradient accumulation was set to 1 in the code + - deviceIterations(16) + - replicationFactor(1) + - Precision.enableStochasticRounding(False) + + accelerator_kwargs: + _accelerator: "ipu" + gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] + # gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] + + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. 
+ task_specific_args: # To be replaced by a new class "DatasetParams" + l1000_vcap: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` + epoch_sampling_fraction: 1.0 + + l1000_mcf7: + df: null + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: geneID-* # geneID-* means all columns starting with "geneID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcba_1328: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet + # or set path as the URL directly + smiles_col: "SMILES" + label_cols: assayID-* # assayID-* means all columns starting with "assayID-" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` + epoch_sampling_fraction: 1.0 + + pcqm4m_g25: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: graph_* # graph_* means all columns starting with "graph_" + # sample_size: 2000 # use sample_size for test + task_level: graph + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + label_normalization: + normalize_val_test: True + method: "normal" + epoch_sampling_fraction: 1.0 + + pcqm4m_n4: + df: null + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet + # or set path as the URL directly + smiles_col: "ordered_smiles" + label_cols: node_* # node_* means all columns starting with "node_" + # sample_size: 2000 # use sample_size for test + task_level: node + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` + seed: 42 + label_normalization: + normalize_val_test: True + 
method: "normal" + epoch_sampling_fraction: 1.0 + + # Featurization + prepare_dict_or_graph: pyg:graph + featurization_n_jobs: 30 + featurization_progress: True + featurization_backend: "loky" + dataloading_from: disk + processed_graph_data_path: ${constants.datacache_path} + featurization: + # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), + # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', + # 'num_chiral_centers (not included yet)'] + atom_property_list_onehot: [atomic-number, group, period, total-valence] + atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] + # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] + edge_property_list: [bond-type-onehot, stereo, in-ring] + add_self_loop: False + explicit_H: False # if H is included + use_bonds_weights: False + max_num_atoms: 100 + pos_encoding_as_features: # encoder dropout 0.18 + pos_types: + lap_eigvec: + pos_level: node + pos_type: laplacian_eigvec + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + lap_eigval: + pos_level: node + pos_type: laplacian_eigval + num_pos: 8 + normalization: "none" # nomrlization already applied on the eigen vectors + disconnected_comp: True # if eigen values/vector for disconnected graph are included + rw_pos: # use same name as pe_encoder + pos_level: node + pos_type: rw_return_probs + ksteps: 16 + + num_workers: 16 # -1 to use all + persistent_workers: True # if use persistent worker at the start of each epoch. + # Using persistent_workers false might make the start of each epoch very long. + +#Task-specific +predictor: + metrics_on_progress_bar: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + metrics_on_training_set: + l1000_vcap: [] + l1000_mcf7: [] + pcba_1328: [] + pcqm4m_g25: [] + pcqm4m_n4: [] + loss_fun: + l1000_vcap: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: 0.5 + l1000_mcf7: + name: hybrid_ce_ipu + n_brackets: 3 + alpha: ${predictor.loss_fun.l1000_vcap.alpha} + pcba_1328: bce_logits_ipu + pcqm4m_g25: mae_ipu + pcqm4m_n4: mae_ipu + random_seed: ${constants.seed} + optim_kwargs: + lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs + # weight_decay: 1.e-7 + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: &max_epochs 100 + warmup_epochs: 10 + verbose: False + scheduler_kwargs: + # monitor: &monitor qm9/mae/train + # mode: min + # frequency: 1 + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + l1000_vcap: &classif_metrics + - name: auroc + metric: auroc + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + - name: avpr + metric: averageprecision + num_classes: 3 + task: multiclass + target_to_int: True + target_nan_mask: -1000 + ignore_index: -1000 + multitask_handling: mean-per-label + threshold_kwargs: null + l1000_mcf7: *classif_metrics + pcba_1328: + # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly + - name: auroc + metric: auroc + task: binary + multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + - name: avpr + metric: averageprecision + task: binary + 
multitask_handling: mean-per-label + target_nan_mask: ignore + threshold_kwargs: null + pcqm4m_g25: &pcqm_metrics + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2 + metric: r2_score_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + pcqm4m_n4: *pcqm_metrics + +trainer: + seed: ${constants.seed} + logger: + save_dir: logs/neurips2023-large/ + name: ${constants.name} + project: ${constants.name} + model_checkpoint: + dirpath: /net/group/all-ipu/graphium/model_checkpoints/${constants.name}/ + filename: ${constants.name} + # monitor: *monitor + # mode: *mode + # save_top_k: 1 + save_last: True + trainer: + max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} + min_epochs: 1 + check_val_every_n_epoch: 20 + +architecture: + model_type: FullGraphMultiTaskNetwork + mup_base_path: null + pre_nn: + out_dim: ${dimensions.pre_nn_out_dim} + hidden_dims: ${dimensions.pre_nn_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.1 + normalization: &normalization layer_norm + last_normalization: *normalization + residual_type: none + + pre_nn_edges: + out_dim: ${dimensions.pre_nn_edges_out_dim} + hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: 0.18 + normalization: ${architecture.pre_nn.normalization} + last_normalization: ${architecture.pre_nn.normalization} + residual_type: none + + pe_encoders: + out_dim: 32 + pool: "sum" #"mean" "max" + last_norm: None #"batch_norm", "layer_norm" + encoders: #la_pos | rw_pos + la_pos: # Set as null to avoid a pre-nn network + encoder_type: "laplacian_pe" + input_keys: ["laplacian_eigvec", "laplacian_eigval"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + model_type: 'DeepSet' #'Transformer' or 'DeepSet' + num_layers: 2 + num_layers_post: 1 # Num. 
layers to apply after pooling + dropout: 0.1 + first_normalization: "none" #"batch_norm" or "layer_norm" + rw_pos: + encoder_type: "mlp" + input_keys: ["rw_return_probs"] + output_keys: ["feat"] + hidden_dim: 64 + out_dim: 32 + num_layers: 2 + dropout: 0.1 + normalization: "layer_norm" #"batch_norm" or "layer_norm" + first_normalization: "layer_norm" #"batch_norm" or "layer_norm" + + gnn: # Set as null to avoid a post-nn network + # in_dim: 256 # should be consistent with pre_nn.out_dim + out_dim: ${dimensions.gnn_out_dim} + hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) + depth: 16 + activation: gelu + last_activation: none + dropout: 0.1 + normalization: "layer_norm" + last_normalization: *normalization + residual_type: simple + virtual_node: 'sum' + use_virtual_edges: true + layer_type: 'pyg:gps' + layer_kwargs: + node_residual: false + edge_residual: false + output_scale: 1.0 + mpnn_type: 'pyg:mpnnplus' + mpnn_kwargs: + in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) + in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) + attn_type: "none" # "full-attention", "none" + # biased_attention: false + attn_kwargs: null + droppath_rate_ffn: 0.3 + + graph_output_nn: + graph: + pooling: [sum] + out_dim: ${dimensions.graph_output_nn_out_dims} + hidden_dims: ${dimensions.graph_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + node: + pooling: [sum] + out_dim: ${dimensions.node_output_nn_out_dims} + hidden_dims: ${dimensions.node_output_nn_hidden_dims} + depth: 1 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + + task_heads: + l1000_vcap: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_vcap_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + l1000_mcf7: + task_level: graph + out_dim: 2934 + hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} + depth: 2 + activation: none + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcba_1328: + task_level: graph + out_dim: 1328 + hidden_dims: ${dimensions.pcba_1328_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_g25: + task_level: graph + out_dim: 25 + hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pcqm4m_n4: + task_level: node + out_dim: 4 + hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} + depth: 2 + activation: relu + last_activation: none + dropout: *dropout + normalization: *normalization + last_normalization: "none" + 
+      residual_type: none
+
+
From e3b21ae9f398f593836e40485e2c5ef0eb2fae5a Mon Sep 17 00:00:00 2001
From: Sam Maddrell-Mander
Date: Thu, 12 Oct 2023 18:17:37 +0000
Subject: [PATCH 54/58] removing configs and tidying the edge max limit

---
 .../SF_11M_config_LargeMix_mpnn_GPS++.yaml    | 478 -----------------
 .../SF_169M_config_LargeMix_mpnn_GPS++.yaml   | 479 -----------------
 .../SF_378M_config_LargeMix_mpnn_GPS++.yaml   | 483 -----------------
 .../SF_42M_config_LargeMix_mpnn_GPS++.yaml    | 476 -----------------
 .../SF_590M_config_LargeMix_mpnn_GPS++.yaml   | 484 -----------------
 .../SF_671M_config_LargeMix_mpnn_GPS++.yaml   | 486 ------------------
 expts/foundation_model/__init__.py            |   0
 .../config_LargeMix_mpnn_GPS++.yaml           | 462 -----------------
 .../config_ogbpcq_mpnn_GPS++_newDATA.yaml     | 295 -----------
 expts/iclr2023_configs/__init__.py            |   0
 .../base_config/ogbpcqm4mv2.yaml              | 288 -----------
 .../base_config/ogbpcqm4mv2_GPS++.yaml        | 288 -----------
 .../config_ogbpcq_GCN_16layers.yaml           | 263 ----------
 .../config_ogbpcq_GCN_4layers.yaml            | 263 ----------
 .../config_ogbpcq_GINE_16layers.yaml          | 273 ----------
 .../config_ogbpcq_GINE_4layers.yaml           | 273 ----------
 .../iclr2023_configs/config_ogbpcq_mpnn.yaml  |  57 --
 .../config_ogbpcq_mpnn_GPS++.yaml             | 295 -----------
 .../config_ogbpcq_mpnn_GPS++_newDATA.yaml     | 295 -----------
 ...fig_ogbpcq_mpnn_GPS++_newDATA_4layers.yaml | 295 -----------
 .../config_ogbpcq_mpnn_JosefOG.yaml           |  57 --
 .../config_ogbpcq_mpnn_hydradims.yaml         | 101 ----
 .../config_ogbpcq_mpnn_largerffn.yaml         |  58 ---
 graphium/features/featurizer.py               |   3 -
 24 files changed, 6452 deletions(-)
 delete mode 100644 expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml
 delete mode 100644 expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml
 delete mode 100644 expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml
 delete mode 100644 expts/foundation_model/SF_42M_config_LargeMix_mpnn_GPS++.yaml
 delete mode 100644 expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml
 delete mode 100644 expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml
 delete mode 100644 expts/foundation_model/__init__.py
 delete mode 100644 expts/foundation_model/config_LargeMix_mpnn_GPS++.yaml
 delete mode 100644 expts/foundation_model/config_ogbpcq_mpnn_GPS++_newDATA.yaml
 delete mode 100644 expts/iclr2023_configs/__init__.py
 delete mode 100644 expts/iclr2023_configs/base_config/ogbpcqm4mv2.yaml
 delete mode 100644 expts/iclr2023_configs/base_config/ogbpcqm4mv2_GPS++.yaml
 delete mode 100644 expts/iclr2023_configs/config_ogbpcq_GCN_16layers.yaml
 delete mode 100644 expts/iclr2023_configs/config_ogbpcq_GCN_4layers.yaml
 delete mode 100644 expts/iclr2023_configs/config_ogbpcq_GINE_16layers.yaml
 delete mode 100644 expts/iclr2023_configs/config_ogbpcq_GINE_4layers.yaml
 delete mode 100644 expts/iclr2023_configs/config_ogbpcq_mpnn.yaml
 delete mode 100644 expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml
 delete mode 100644 expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA.yaml
 delete mode 100644 expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA_4layers.yaml
 delete mode 100644 expts/iclr2023_configs/config_ogbpcq_mpnn_JosefOG.yaml
 delete mode 100644 expts/iclr2023_configs/config_ogbpcq_mpnn_hydradims.yaml
 delete mode 100644 expts/iclr2023_configs/config_ogbpcq_mpnn_largerffn.yaml

diff --git a/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml
deleted file mode 100644
index 124b1d250..000000000
---
a/expts/foundation_model/SF_11M_config_LargeMix_mpnn_GPS++.yaml +++ /dev/null @@ -1,478 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/neurips2023-large/" - epochs: 100 - name: SF_11M_sweep_LargeMix_mpnn - wandb: - entity: multitask-gnn - name: SF_11M_sweep_LargeMix_mpnn - project: neurips2023_graphcore_scaling_mpnn - - # This whole sections is for minimizing mistakes for the scaling experiments. - # This is the ONLY place where dimensions ahve to change. - # No other dimensions have to be changed in the architecture part. - -dimensions: - pre_nn_out_dim: &gnn_dim 256 # original 256 - pre_nn_hidden_dims: 1024 # original 1024 - pre_nn_edges_out_dim: &gnn_dim_edges 128 # original 128 - pre_nn_edges_hidden_dims: 512 # original 512 - gnn_out_dim: *gnn_dim # original 256 - gnn_hidden_dims: *gnn_dim # original 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - mpnn_in_dim: *gnn_dim # original 256 - mpnn_out_dim: *gnn_dim # original 256 - mpnn_in_dim_edges: *gnn_dim_edges # original 128 - mpnn_out_dim_edges: *gnn_dim_edges # original 128 - graph_output_nn_out_dims: *gnn_dim # original 256 - graph_output_nn_hidden_dims: *gnn_dim # original 256 - node_output_nn_out_dims: *gnn_dim # original 256 - node_output_nn_hidden_dims: *gnn_dim # original 256 - l1000_vcap_hidden_dims: 128 # original 128 - l1000_mcf7_hidden_dims: 128 # original 128 - pcba_1328_hidden_dims: 64 # original 64 - pcqm4m_g25_hidden_dims: 32 # original 32 - pcqm4m_n4_hidden_dims: 32 # original 32 - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 100 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 100 - # Data handling-related - batch_size_training: 30 - batch_size_inference: 30 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 2 - - ipu_config: - - deviceIterations(16) # IPU would require large batches to be ready for the model. - - replicationFactor(16) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - # accelerator_kwargs: - #_accelerator: "ipu" - #gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] - #gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] - #gnn_layers_per_ipu: [4,4,4,4] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - l1000_vcap: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` - epoch_sampling_fraction: 1.0 - - l1000_mcf7: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcba_1328: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: assayID-* # assayID-* means all columns starting with "assayID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcqm4m_g25: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: graph_* # graph_* means all columns starting with "graph_" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - pcqm4m_n4: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: node_* # node_* means all columns starting with "node_" - # sample_size: 2000 # use sample_size for test - task_level: node - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - seed: 42 - label_normalization: - normalize_val_test: True - 
method: "normal" - epoch_sampling_fraction: 1.0 - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: disk - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - num_workers: 16 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. - -#Task-specific -predictor: - metrics_on_progress_bar: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - metrics_on_training_set: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - loss_fun: - l1000_vcap: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: 0.5 - l1000_mcf7: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: ${predictor.loss_fun.l1000_vcap.alpha} - pcba_1328: bce_logits_ipu - pcqm4m_g25: mae_ipu - pcqm4m_n4: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - l1000_vcap: &classif_metrics - - name: auroc - metric: auroc - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - - name: avpr - metric: averageprecision - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - l1000_mcf7: *classif_metrics - pcba_1328: - # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly - - name: auroc - metric: auroc - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - - name: avpr - metric: averageprecision - task: binary - multitask_handling: 
mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - pcqm4m_g25: &pcqm_metrics - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2 - metric: r2_score_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - pcqm4m_n4: *pcqm_metrics - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/neurips2023-large/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: ./models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} - min_epochs: 1 - check_val_every_n_epoch: 20 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: ${dimensions.pre_nn_out_dim} - hidden_dims: ${dimensions.pre_nn_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: ${dimensions.pre_nn_edges_out_dim} - hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: ${dimensions.gnn_out_dim} - hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 4 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: True - edge_residual: True - output_scale: 1.0 - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: ${dimensions.graph_output_nn_out_dims} - hidden_dims: ${dimensions.graph_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: ${dimensions.node_output_nn_out_dims} - hidden_dims: ${dimensions.node_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - l1000_vcap: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_vcap_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - l1000_mcf7: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcba_1328: - task_level: graph - out_dim: 1328 - hidden_dims: ${dimensions.pcba_1328_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_g25: - task_level: graph - out_dim: 25 - hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_n4: - task_level: node - out_dim: 4 - hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - 
-      residual_type: none
-
-
diff --git a/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_169M_config_LargeMix_mpnn_GPS++.yaml
deleted file mode 100644
index 8cbc58681..000000000
---
- task_specific_args: # To be replaced by a new class "DatasetParams" - l1000_vcap: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` - epoch_sampling_fraction: 1.0 - - l1000_mcf7: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcba_1328: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: assayID-* # assayID-* means all columns starting with "assayID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcqm4m_g25: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: graph_* # graph_* means all columns starting with "graph_" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - pcqm4m_n4: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: node_* # node_* means all columns starting with "node_" - # sample_size: 2000 # use sample_size for test - task_level: node - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - seed: 42 - label_normalization: - normalize_val_test: True - 
method: "normal" - epoch_sampling_fraction: 1.0 - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: disk - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - max_num_atoms: 100 - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - num_workers: 16 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. - -#Task-specific -predictor: - metrics_on_progress_bar: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - metrics_on_training_set: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - loss_fun: - l1000_vcap: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: 0.5 - l1000_mcf7: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: ${predictor.loss_fun.l1000_vcap.alpha} - pcba_1328: bce_logits_ipu - pcqm4m_g25: mae_ipu - pcqm4m_n4: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - l1000_vcap: &classif_metrics - - name: auroc - metric: auroc - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - - name: avpr - metric: averageprecision - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - l1000_mcf7: *classif_metrics - pcba_1328: - # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly - - name: auroc - metric: auroc - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - - name: avpr - metric: averageprecision - task: binary - 
multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - pcqm4m_g25: &pcqm_metrics - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2 - metric: r2_score_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - pcqm4m_n4: *pcqm_metrics - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/neurips2023-large/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: /net/group/all-ipu/graphium/model_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} - min_epochs: 1 - check_val_every_n_epoch: 20 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: ${dimensions.pre_nn_out_dim} - hidden_dims: ${dimensions.pre_nn_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: ${dimensions.pre_nn_edges_out_dim} - hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: ${dimensions.gnn_out_dim} - hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - mpnn_type: 'pyg:mpnnplus' - node_residual: false - edge_residual: false - output_scale: 1.0 - mpnn_kwargs: - in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: ${dimensions.graph_output_nn_out_dims} - hidden_dims: ${dimensions.graph_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: ${dimensions.node_output_nn_out_dims} - hidden_dims: ${dimensions.node_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - l1000_vcap: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_vcap_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - l1000_mcf7: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcba_1328: - task_level: graph - out_dim: 1328 - hidden_dims: ${dimensions.pcba_1328_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_g25: - task_level: graph - out_dim: 25 - hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_n4: - task_level: node - out_dim: 4 - hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - 
-      residual_type: none
-
-
diff --git a/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_378M_config_LargeMix_mpnn_GPS++.yaml
deleted file mode 100644
index 692c8e0de..000000000
---
- task_specific_args: # To be replaced by a new class "DatasetParams" - l1000_vcap: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` - epoch_sampling_fraction: 1.0 - - l1000_mcf7: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcba_1328: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: assayID-* # assayID-* means all columns starting with "assayID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcqm4m_g25: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: graph_* # graph_* means all columns starting with "graph_" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - pcqm4m_n4: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: node_* # node_* means all columns starting with "node_" - # sample_size: 2000 # use sample_size for test - task_level: node - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - seed: 42 - label_normalization: - normalize_val_test: True - 
method: "normal" - epoch_sampling_fraction: 1.0 - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: disk - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - max_num_atoms: 100 - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - num_workers: 16 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. - -#Task-specific -predictor: - metrics_on_progress_bar: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - metrics_on_training_set: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - loss_fun: - l1000_vcap: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: 0.5 - l1000_mcf7: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: ${predictor.loss_fun.l1000_vcap.alpha} - pcba_1328: bce_logits_ipu - pcqm4m_g25: mae_ipu - pcqm4m_n4: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - l1000_vcap: &classif_metrics - - name: auroc - metric: auroc - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - - name: avpr - metric: averageprecision - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - l1000_mcf7: *classif_metrics - pcba_1328: - # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly - - name: auroc - metric: auroc - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - - name: avpr - metric: averageprecision - task: binary - 
multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - pcqm4m_g25: &pcqm_metrics - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2 - metric: r2_score_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - pcqm4m_n4: *pcqm_metrics - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/neurips2023-large/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: /net/group/all-ipu/graphium/model_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} - min_epochs: 1 - check_val_every_n_epoch: 20 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: ${dimensions.pre_nn_out_dim} - hidden_dims: ${dimensions.pre_nn_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: ${dimensions.pre_nn_edges_out_dim} - hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: ${dimensions.gnn_out_dim} - hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - edge_residual: false - output_scale: 1.0 - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: ${dimensions.graph_output_nn_out_dims} - hidden_dims: ${dimensions.graph_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: ${dimensions.node_output_nn_out_dims} - hidden_dims: ${dimensions.node_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - l1000_vcap: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_vcap_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - l1000_mcf7: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcba_1328: - task_level: graph - out_dim: 1328 - hidden_dims: ${dimensions.pcba_1328_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_g25: - task_level: graph - out_dim: 25 - hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_n4: - task_level: node - out_dim: 4 - hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - 
-      residual_type: none
-
-
diff --git a/expts/foundation_model/SF_42M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_42M_config_LargeMix_mpnn_GPS++.yaml
deleted file mode 100644
index 04c393733..000000000
---
- - replicationFactor(4) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - #gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] - #gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] - gnn_layers_per_ipu: [4,4,4,4] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. - task_specific_args: # To be replaced by a new class "DatasetParams" - l1000_vcap: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` - epoch_sampling_fraction: 1.0 - - l1000_mcf7: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcba_1328: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: assayID-* # assayID-* means all columns starting with "assayID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcqm4m_g25: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: graph_* # graph_* means all columns starting with "graph_" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: 
graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - pcqm4m_n4: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: node_* # node_* means all columns starting with "node_" - # sample_size: 2000 # use sample_size for test - task_level: node - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - seed: 42 - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: disk - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - num_workers: 16 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. 
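The accelerator block of this deleted 42M LargeMix config multiplies several factors: the micro-batch size (batch_size_training: 16), deviceIterations(30), replicationFactor(4) and accumulate_grad_batches: 16. A minimal sketch of that arithmetic follows, assuming the usual PopTorch convention that replication and gradient accumulation enlarge the batch seen per weight update while device iterations only set how many such batches are consumed per host step; the helper name and printed numbers are illustrative, not part of Graphium.

# Hypothetical helper relating the accelerator settings to batch sizes
# (assumes standard PopTorch semantics; not part of the Graphium code base).
def ipu_batch_sizes(micro_batch: int, device_iterations: int,
                    replication_factor: int, grad_accum: int) -> dict:
    """Samples seen per weight update and per host-side step."""
    per_update = micro_batch * replication_factor * grad_accum
    per_host_step = per_update * device_iterations
    return {"per_weight_update": per_update, "per_host_step": per_host_step}

# Values taken from the config above: batch_size_training 16, deviceIterations(30),
# replicationFactor(4), accumulate_grad_batches 16.
print(ipu_batch_sizes(16, 30, 4, 16))
# {'per_weight_update': 1024, 'per_host_step': 30720}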
- -#Task-specific -predictor: - metrics_on_progress_bar: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - metrics_on_training_set: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - loss_fun: - l1000_vcap: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: 0.5 - l1000_mcf7: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: ${predictor.loss_fun.l1000_vcap.alpha} - pcba_1328: bce_logits_ipu - pcqm4m_g25: mae_ipu - pcqm4m_n4: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - l1000_vcap: &classif_metrics - - name: auroc - metric: auroc - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - - name: avpr - metric: averageprecision - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - l1000_mcf7: *classif_metrics - pcba_1328: - # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly - - name: auroc - metric: auroc - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - - name: avpr - metric: averageprecision - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - pcqm4m_g25: &pcqm_metrics - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2 - metric: r2_score_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - pcqm4m_n4: *pcqm_metrics - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/neurips2023-large/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: /net/group/all-ipu/graphium/model_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} - min_epochs: 1 - check_val_every_n_epoch: 20 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: ${dimensions.pre_nn_out_dim} - hidden_dims: ${dimensions.pre_nn_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: ${dimensions.pre_nn_edges_out_dim} - hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - 
encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: ${dimensions.gnn_out_dim} - hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: ${dimensions.graph_output_nn_out_dims} - hidden_dims: ${dimensions.graph_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: ${dimensions.node_output_nn_out_dims} - hidden_dims: ${dimensions.node_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - l1000_vcap: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_vcap_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - l1000_mcf7: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcba_1328: - task_level: graph - out_dim: 1328 - hidden_dims: ${dimensions.pcba_1328_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_g25: - task_level: graph - out_dim: 25 - hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - 
last_normalization: "none" - residual_type: none - pcqm4m_n4: - task_level: node - out_dim: 4 - hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - diff --git a/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml deleted file mode 100644 index 944f6eb0b..000000000 --- a/expts/foundation_model/SF_590M_config_LargeMix_mpnn_GPS++.yaml +++ /dev/null @@ -1,484 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/neurips2023-large/" - epochs: 100 - name: SF_590M_sweep_LargeMix_mpnn - wandb: - entity: multitask-gnn - name: SF_590M_sweep_LargeMix_mpnn - project: neurips2023_graphcore_scaling_mpnn - - # This whole sections is for minimizing mistakes for the scaling experiments. - # This is the ONLY place where dimensions ahve to change. - # No other dimensions have to be changed in the architecture part. - -dimensions: - pre_nn_out_dim: 960 - pre_nn_hidden_dims: 3840 - pre_nn_edges_out_dim: 480 - pre_nn_edges_hidden_dims: 1920 - gnn_out_dim: 960 - gnn_hidden_dims: 960 - mpnn_in_dim: 960 - mpnn_out_dim: 960 - mpnn_in_dim_edges: 480 - mpnn_out_dim_edges: 480 - graph_output_nn_out_dims: 960 - graph_output_nn_hidden_dims: 960 - node_output_nn_out_dims: 960 - node_output_nn_hidden_dims: 960 - l1000_vcap_hidden_dims: 480 - l1000_mcf7_hidden_dims: 480 - pcba_1328_hidden_dims: 240 - pcqm4m_g25_hidden_dims: 120 - pcqm4m_n4_hidden_dims: 120 - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - # max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 - # max_num_edges_per_graph: 116 - max_num_nodes: 140 - max_num_edges: 298 - ipu_dataloader_inference_opts: - mode: async - # max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 - # max_num_edges_per_graph: 116 - max_num_nodes: 140 - max_num_edges: 298 - # Data handling-related - batch_size_training: 3 - batch_size_inference: 3 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 320 - - ipu_config: - - deviceIterations(5) # IPU would require large batches to be ready for the model. 
- - replicationFactor(1) - # - 'setAvailableMemoryProportion({"IPU0": 0.05})' - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - # - TensorLocations.numIOTiles(128) - # - _Popart.set("defaultBufferingDepth", 96) - - _Popart.set("saveInitializersToFile", "weights.onnx") - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(16) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] - # gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] - - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. - task_specific_args: # To be replaced by a new class "DatasetParams" - l1000_vcap: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` - epoch_sampling_fraction: 1.0 - - l1000_mcf7: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcba_1328: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: assayID-* # assayID-* means all columns starting with "assayID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcqm4m_g25: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: graph_* # graph_* means all columns starting with "graph_" - # sample_size: 2000 # use sample_size for test - task_level: 
graph - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - pcqm4m_n4: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: node_* # node_* means all columns starting with "node_" - # sample_size: 2000 # use sample_size for test - task_level: node - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - seed: 42 - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: disk - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - max_num_atoms: 100 - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - num_workers: 16 # -1 to use all - persistent_workers: True # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. 
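The accelerator block of this deleted 590M config pipelines the model with gnn_layers_per_ipu set to sixteen ones, i.e. one GNN layer per IPU across sixteen IPUs, whereas the 42M config above uses [4, 4, 4, 4]. Below is a small illustrative sketch (not Graphium's implementation) of how such a list maps layer indices to pipeline stages and why its entries must sum to the GNN depth defined further down in the architecture section.

from typing import List

def layer_to_ipu(gnn_layers_per_ipu: List[int]) -> List[int]:
    """Expand e.g. [1]*16 or [4, 4, 4, 4] into one pipeline-stage index per GNN layer."""
    stages: List[int] = []
    for ipu_index, n_layers in enumerate(gnn_layers_per_ipu):
        stages.extend([ipu_index] * n_layers)
    return stages

assert len(layer_to_ipu([1] * 16)) == 16   # must equal gnn.depth (16 in this config)
print(layer_to_ipu([4, 4, 4, 4]))          # [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3]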
- -#Task-specific -predictor: - metrics_on_progress_bar: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - metrics_on_training_set: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - loss_fun: - l1000_vcap: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: 0.5 - l1000_mcf7: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: ${predictor.loss_fun.l1000_vcap.alpha} - pcba_1328: bce_logits_ipu - pcqm4m_g25: mae_ipu - pcqm4m_n4: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - l1000_vcap: &classif_metrics - - name: auroc - metric: auroc - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - - name: avpr - metric: averageprecision - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - l1000_mcf7: *classif_metrics - pcba_1328: - # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly - - name: auroc - metric: auroc - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - - name: avpr - metric: averageprecision - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - pcqm4m_g25: &pcqm_metrics - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2 - metric: r2_score_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - pcqm4m_n4: *pcqm_metrics - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/neurips2023-large/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: /net/group/all-ipu/graphium/model_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} - min_epochs: 1 - check_val_every_n_epoch: 20 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: ${dimensions.pre_nn_out_dim} - hidden_dims: ${dimensions.pre_nn_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: ${dimensions.pre_nn_edges_out_dim} - hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - 
encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: ${dimensions.gnn_out_dim} - hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - edge_residual: false - output_scale: 1.0 - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: ${dimensions.graph_output_nn_out_dims} - hidden_dims: ${dimensions.graph_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: ${dimensions.node_output_nn_out_dims} - hidden_dims: ${dimensions.node_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - l1000_vcap: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_vcap_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - l1000_mcf7: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcba_1328: - task_level: graph - out_dim: 1328 - hidden_dims: ${dimensions.pcba_1328_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_g25: - task_level: graph - out_dim: 25 - hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - 
normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_n4: - task_level: node - out_dim: 4 - hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - diff --git a/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml deleted file mode 100644 index 6bfc447c4..000000000 --- a/expts/foundation_model/SF_671M_config_LargeMix_mpnn_GPS++.yaml +++ /dev/null @@ -1,486 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/neurips2023-large/" - epochs: 100 - name: SF_671M_sweep_LargeMix_mpnn - wandb: - entity: multitask-gnn - name: SF_671M_sweep_LargeMix_mpnn - project: neurips2023_graphcore_scaling_mpnn - - # This whole sections is for minimizing mistakes for the scaling experiments. - # This is the ONLY place where dimensions ahve to change. - # No other dimensions have to be changed in the architecture part. - -dimensions: - pre_nn_out_dim: 1024 - pre_nn_hidden_dims: 4096 - pre_nn_edges_out_dim: 512 - pre_nn_edges_hidden_dims: 2048 - gnn_out_dim: 1024 - gnn_hidden_dims: 1024 - mpnn_in_dim: 1024 - mpnn_out_dim: 1024 - mpnn_in_dim_edges: 512 - mpnn_out_dim_edges: 512 - graph_output_nn_out_dims: 1024 - graph_output_nn_hidden_dims: 1024 - node_output_nn_out_dims: 1024 - node_output_nn_hidden_dims: 1024 - l1000_vcap_hidden_dims: 512 - l1000_mcf7_hidden_dims: 512 - pcba_1328_hidden_dims: 256 - pcqm4m_g25_hidden_dims: 128 - pcqm4m_n4_hidden_dims: 128 - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - # max_num_nodes_per_graph: 75 # train max nodes: 20, max_edges: 54 - # max_num_edges_per_graph: 116 - max_num_nodes: 144 - max_num_edges: 304 - ipu_dataloader_inference_opts: - mode: async - # max_num_nodes_per_graph: 80 # valid max nodes: 51, max_edges: 118 - # max_num_edges_per_graph: 116 - max_num_nodes: 144 - max_num_edges: 304 - # Data handling-related - batch_size_training: 3 - batch_size_inference: 2 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 320 - - ipu_config: - - deviceIterations(5) # IPU would require large batches to be ready for the model. 
- - replicationFactor(1) - # - 'setAvailableMemoryProportion({"IPU0": 0.05})' - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - # - TensorLocations.numIOTiles(128) - # - _Popart.set("defaultBufferingDepth", 96) - - _Popart.set("saveInitializersToFile", "weights.onnx") - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - 'setAvailableMemoryProportion({"IPU0": 0.1})' - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(16) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] - # gnn_layers_per_ipu: [2,2,2,2,2,2,2,2] - - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. - task_specific_args: # To be replaced by a new class "DatasetParams" - l1000_vcap: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` - epoch_sampling_fraction: 1.0 - - l1000_mcf7: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcba_1328: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: assayID-* # assayID-* means all columns starting with "assayID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcqm4m_g25: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: graph_* # graph_* means all columns starting with "graph_" - # 
sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - pcqm4m_n4: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: node_* # node_* means all columns starting with "node_" - # sample_size: 2000 # use sample_size for test - task_level: node - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - seed: 42 - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: disk - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - max_num_atoms: 100 - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - num_workers: 16 # -1 to use all - persistent_workers: True # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. 
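The PCQM4M tasks above set label_normalization with method "normal" and normalize_val_test: True. Read literally, that is a z-score computed from the training split and reused on the validation and test splits; the sketch below illustrates only that interpretation, since Graphium's own normalizer may handle NaN targets and denormalisation differently. The function names are hypothetical.

import numpy as np

def fit_normal(train_labels: np.ndarray):
    """Per-column mean/std, ignoring NaNs, computed from the training split only."""
    return np.nanmean(train_labels, axis=0), np.nanstd(train_labels, axis=0)

def apply_normal(labels: np.ndarray, mean: np.ndarray, std: np.ndarray) -> np.ndarray:
    return (labels - mean) / std

train = np.array([[1.0], [2.0], [3.0]])
mean, std = fit_normal(train)
print(apply_normal(np.array([[2.5]]), mean, std))  # val/test reuse the train statistics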
- -#Task-specific -predictor: - metrics_on_progress_bar: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - metrics_on_training_set: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - loss_fun: - l1000_vcap: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: 0.5 - l1000_mcf7: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: ${predictor.loss_fun.l1000_vcap.alpha} - pcba_1328: bce_logits_ipu - pcqm4m_g25: mae_ipu - pcqm4m_n4: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - l1000_vcap: &classif_metrics - - name: auroc - metric: auroc - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - - name: avpr - metric: averageprecision - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - l1000_mcf7: *classif_metrics - pcba_1328: - # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly - - name: auroc - metric: auroc - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - - name: avpr - metric: averageprecision - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - pcqm4m_g25: &pcqm_metrics - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2 - metric: r2_score_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - pcqm4m_n4: *pcqm_metrics - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/neurips2023-large/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: /net/group/all-ipu/graphium/model_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} - min_epochs: 1 - check_val_every_n_epoch: 20 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: ${dimensions.pre_nn_out_dim} - hidden_dims: ${dimensions.pre_nn_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: ${dimensions.pre_nn_edges_out_dim} - hidden_dims: ${dimensions.pre_nn_edges_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - 
encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: ${dimensions.gnn_out_dim} - hidden_dims: ${dimensions.gnn_hidden_dims} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - edge_residual: false - output_scale: 1.0 - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: ${dimensions.mpnn_in_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: ${dimensions.mpnn_out_dim} # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: ${dimensions.mpnn_in_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: ${dimensions.mpnn_out_dim_edges} # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: ${dimensions.graph_output_nn_out_dims} - hidden_dims: ${dimensions.graph_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: ${dimensions.node_output_nn_out_dims} - hidden_dims: ${dimensions.node_output_nn_hidden_dims} - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - l1000_vcap: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_vcap_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - l1000_mcf7: - task_level: graph - out_dim: 2934 - hidden_dims: ${dimensions.l1000_mcf7_hidden_dims} - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcba_1328: - task_level: graph - out_dim: 1328 - hidden_dims: ${dimensions.pcba_1328_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_g25: - task_level: graph - out_dim: 25 - hidden_dims: ${dimensions.pcqm4m_g25_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - 
normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_n4: - task_level: node - out_dim: 4 - hidden_dims: ${dimensions.pcqm4m_n4_hidden_dims} - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - diff --git a/expts/foundation_model/__init__.py b/expts/foundation_model/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/expts/foundation_model/config_LargeMix_mpnn_GPS++.yaml b/expts/foundation_model/config_LargeMix_mpnn_GPS++.yaml deleted file mode 100644 index ce6914154..000000000 --- a/expts/foundation_model/config_LargeMix_mpnn_GPS++.yaml +++ /dev/null @@ -1,462 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/neurips2023-large/" - epochs: 100 - name: LargeMix_mpnn_40M - wandb: - entity: multitask-gnn - name: LargeMix_mpnn_40M - project: neurips2023_graphcore_scaling_mpnn - - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 100 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 100 - # Data handling-related - batch_size_training: 16 - batch_size_inference: 16 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 16 - - ipu_config: - - deviceIterations(30) # IPU would require large batches to be ready for the model. - - replicationFactor(4) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [4,4,4,4] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - l1000_vcap: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt` - epoch_sampling_fraction: 1.0 - - l1000_mcf7: - df: null - df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: geneID-* # geneID-* means all columns starting with "geneID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcba_1328: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet - # or set path as the URL directly - smiles_col: "SMILES" - label_cols: assayID-* # assayID-* means all columns starting with "assayID-" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt` - epoch_sampling_fraction: 1.0 - - pcqm4m_g25: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: graph_* # graph_* means all columns starting with "graph_" - # sample_size: 2000 # use sample_size for test - task_level: graph - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - label_normalization: - normalize_val_test: True - method: "normal" - epoch_sampling_fraction: 1.0 - - pcqm4m_n4: - df: null - df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet - # or set path as the URL directly - smiles_col: "ordered_smiles" - label_cols: node_* # node_* means all columns starting with "node_" - # sample_size: 2000 # use sample_size for test - task_level: node - splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt` - seed: 42 - label_normalization: - normalize_val_test: True - 
method: "normal" - epoch_sampling_fraction: 1.0 - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: disk - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. - -#Task-specific -predictor: - metrics_on_progress_bar: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - metrics_on_training_set: - l1000_vcap: [] - l1000_mcf7: [] - pcba_1328: [] - pcqm4m_g25: [] - pcqm4m_n4: [] - loss_fun: - l1000_vcap: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: 0.5 - l1000_mcf7: - name: hybrid_ce_ipu - n_brackets: 3 - alpha: ${predictor.loss_fun.l1000_vcap.alpha} - pcba_1328: bce_logits_ipu - pcqm4m_g25: mae_ipu - pcqm4m_n4: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 100 - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - l1000_vcap: &classif_metrics - - name: auroc - metric: auroc - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - - name: avpr - metric: averageprecision - num_classes: 3 - task: multiclass - target_to_int: True - target_nan_mask: -1000 - ignore_index: -1000 - multitask_handling: mean-per-label - threshold_kwargs: null - l1000_mcf7: *classif_metrics - pcba_1328: - # use auroc and averageprecision (non_ipu version) so tha nans are handled correctly - - name: auroc - metric: auroc - task: binary - multitask_handling: mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - - name: avpr - metric: averageprecision - task: binary - multitask_handling: 
mean-per-label - target_nan_mask: ignore - threshold_kwargs: null - pcqm4m_g25: &pcqm_metrics - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2 - metric: r2_score_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - pcqm4m_n4: *pcqm_metrics - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/neurips2023-large/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${predictor.torch_scheduler_kwargs.max_num_epochs} - min_epochs: 1 - check_val_every_n_epoch: 20 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: 256 - hidden_dims: 1024 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: 128 - hidden_dims: 512 - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: 256 - hidden_dims: &gnn_dim 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - l1000_vcap: - task_level: graph - out_dim: 2934 - hidden_dims: 128 - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - l1000_mcf7: - task_level: graph - out_dim: 2934 - hidden_dims: 128 - depth: 2 - activation: none - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcba_1328: - task_level: graph - out_dim: 1328 - hidden_dims: 64 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_g25: - task_level: graph - out_dim: 25 - hidden_dims: 32 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pcqm4m_n4: - task_level: node - out_dim: 4 - hidden_dims: 32 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - - # task_heads: - # homolumo: - # task_level: graph - # out_dim: 1 - # hidden_dims: 256 - # depth: 2 - # activation: relu - # last_activation: none - # dropout: *dropout - # normalization: *normalization - # last_normalization: "none" - # residual_type: none diff --git a/expts/foundation_model/config_ogbpcq_mpnn_GPS++_newDATA.yaml b/expts/foundation_model/config_ogbpcq_mpnn_GPS++_newDATA.yaml deleted file mode 100644 index b97402680..000000000 --- 
a/expts/foundation_model/config_ogbpcq_mpnn_GPS++_newDATA.yaml +++ /dev/null @@ -1,295 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/PCQM4Mv2/" - epochs: 100 - name: ogb_pcqm4mv2_mpnn_NewData - wandb: - entity: multitask-gnn - name: neurips2023_scaling_mpnn_NewData - project: neurips2023_graphcore_scaling_mpnn - - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 100 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 100 - # Data handling-related - batch_size_training: 30 - batch_size_inference: 30 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 8 - - ipu_config: - - deviceIterations(60) # IPU would require large batches to be ready for the model. - - replicationFactor(4) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [4, 4, 4, 4] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - homolumo: - df: null - task_level: graph - df_path: /nethome/kerstink/QM9_finetuning/ogb-lsc/pcqm4mv2_conformers_28features/raw/data.csv - # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` - smiles_col: "smiles" - label_cols: ["homolumogap"] # graph_* # graph_* means all columns starting with "graph_" - # sample_size: 100000 # use sample_size for test - splits_path: /nethome/kerstink/QM9_finetuning/ogb-lsc/pcqm4mv2_conformers_28features/split_dict.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` - split_names: ["train", "valid", "test-dev"] - seed: ${constants.seed} - label_normalization: - normalize_val_test: true - method: "normal" - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: "disk" - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - # cache_data_path: . - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. 
- -#Task-specific -predictor: - metrics_on_progress_bar: - homolumo: [] - metrics_on_training_set: - homolumo: [] - loss_fun: - homolumo: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: ${constants.epochs} - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - homolumo: - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/ogb_pcqm4mv2/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${constants.epochs} - min_epochs: 1 - check_val_every_n_epoch: 5 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: 256 - hidden_dims: 1024 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: 128 - hidden_dims: 512 - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: 256 - hidden_dims: &gnn_dim 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - homolumo: - task_level: graph - out_dim: 1 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none diff --git a/expts/iclr2023_configs/__init__.py b/expts/iclr2023_configs/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/expts/iclr2023_configs/base_config/ogbpcqm4mv2.yaml b/expts/iclr2023_configs/base_config/ogbpcqm4mv2.yaml deleted file mode 100644 index 571788581..000000000 --- a/expts/iclr2023_configs/base_config/ogbpcqm4mv2.yaml +++ /dev/null @@ -1,288 +0,0 @@ -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/PCQM4Mv2/" - epochs: 100 - wandb: - entity: multitask-gnn - name: neurips2023_scaling_mpnn - project: neurips2023_graphcore_scaling_mpnn - - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 100 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 100 - # Data handling-related - batch_size_training: 30 - batch_size_inference: 30 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 8 - - ipu_config: - - 
deviceIterations(60) # IPU would require large batches to be ready for the model. - - replicationFactor(4) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [4, 4, 4, 4] - -# accelerator: -# type: cpu # cpu or ipu or gpu -# config_override: -# datamodule: -# args: -# batch_size_training: 64 -# batch_size_inference: 256 -# trainer: -# trainer: -# precision: 32 -# accumulate_grad_batches: 1 - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. - task_specific_args: # To be replaced by a new class "DatasetParams" - homolumo: - df: null - task_level: graph - df_path: graphium/data/PCQM4M/pcqm4mv2.csv - # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` - smiles_col: "cxsmiles" - label_cols: ["homo_lumo_gap"] # graph_* # graph_* means all columns starting with "graph_" - # sample_size: 100000 # use sample_size for test - splits_path: graphium/data/PCQM4M/split_dict_v2.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` - split_names: ["train", "valid", "test-dev"] - seed: ${constants.seed} - label_normalization: - normalize_val_test: true - method: "normal" - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: "disk" - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - # cache_data_path: . - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. 
- # Using persistent_workers false might make the start of each epoch very long. - - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: # Set as null to avoid a pre-nn network - out_dim: 64 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: null # Set as null to avoid a pre-nn network - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - - - gnn: # Set as null to avoid a post-nn network - in_dim: 64 # or otherwise the correct value - out_dim: &gnn_dim 768 - hidden_dims: *gnn_dim - depth: 4 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'none' - - - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - homolumo: - task_level: graph - out_dim: 1 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - -#Task-specific -predictor: - metrics_on_progress_bar: - homolumo: [] - metrics_on_training_set: - homolumo: [] - loss_fun: - homolumo: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: ${constants.epochs} - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - homolumo: - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/ogb_pcqm4mv2/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # 
save_top_k: 1 - save_last: True - trainer: - max_epochs: ${constants.epochs} - min_epochs: 1 - check_val_every_n_epoch: 5 diff --git a/expts/iclr2023_configs/base_config/ogbpcqm4mv2_GPS++.yaml b/expts/iclr2023_configs/base_config/ogbpcqm4mv2_GPS++.yaml deleted file mode 100644 index 09174a096..000000000 --- a/expts/iclr2023_configs/base_config/ogbpcqm4mv2_GPS++.yaml +++ /dev/null @@ -1,288 +0,0 @@ -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/PCQM4Mv2/" - epochs: 100 - wandb: - entity: multitask-gnn - name: neurips2023_scaling_mpnn - project: neurips2023_graphcore_scaling_mpnn - - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 100 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 100 - # Data handling-related - batch_size_training: 30 - batch_size_inference: 30 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 8 - - ipu_config: - - deviceIterations(60) # IPU would require large batches to be ready for the model. - - replicationFactor(4) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [4, 4, 4, 4] - -# accelerator: -# type: cpu # cpu or ipu or gpu -# config_override: -# datamodule: -# args: -# batch_size_training: 64 -# batch_size_inference: 256 -# trainer: -# trainer: -# precision: 32 -# accumulate_grad_batches: 1 - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - homolumo: - df: null - task_level: graph - df_path: graphium/data/PCQM4M/pcqm4mv2.csv - # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` - smiles_col: "cxsmiles" - label_cols: ["homo_lumo_gap"] # graph_* # graph_* means all columns starting with "graph_" - # sample_size: 100000 # use sample_size for test - splits_path: graphium/data/PCQM4M/split_dict_v2.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` - split_names: ["train", "valid", "test-dev"] - seed: ${constants.seed} - label_normalization: - normalize_val_test: true - method: "normal" - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: "disk" - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - # cache_data_path: . - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. - - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: # Set as null to avoid a pre-nn network - out_dim: 64 - hidden_dims: 1024 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: null # Set as null to avoid a pre-nn network - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - - - gnn: # Set as null to avoid a post-nn network - in_dim: 64 # or otherwise the correct value - out_dim: &gnn_dim 1024 - hidden_dims: *gnn_dim - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'none' - - - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - homolumo: - task_level: graph - out_dim: 1 - hidden_dims: 512 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - -#Task-specific -predictor: - metrics_on_progress_bar: - homolumo: [] - metrics_on_training_set: - homolumo: [] - loss_fun: - homolumo: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: ${constants.epochs} - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - homolumo: - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/ogb_pcqm4mv2/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${constants.epochs} - min_epochs: 1 - check_val_every_n_epoch: 5 diff --git a/expts/iclr2023_configs/config_ogbpcq_GCN_16layers.yaml b/expts/iclr2023_configs/config_ogbpcq_GCN_16layers.yaml deleted file mode 100644 index bffa2ee04..000000000 --- a/expts/iclr2023_configs/config_ogbpcq_GCN_16layers.yaml +++ /dev/null @@ -1,263 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. 
- -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/PCQM4Mv2/" - epochs: 100 - name: ogb_pcqm4mv2_gcn - wandb: - entity: multitask-gnn - name: neurips2023_scaling_gcn - project: neurips2023_graphcore_scaling_mpnn - - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 100 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 100 - # Data handling-related - batch_size_training: 30 - batch_size_inference: 30 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 8 - - ipu_config: - - deviceIterations(60) # IPU would require large batches to be ready for the model. - - replicationFactor(4) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [4, 4, 4, 4] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - homolumo: - df: null - task_level: graph - df_path: graphium/data/PCQM4M/pcqm4mv2.csv - # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` - smiles_col: "cxsmiles" - label_cols: ["homo_lumo_gap"] # graph_* # graph_* means all columns starting with "graph_" - # sample_size: 100000 # use sample_size for test - splits_path: graphium/data/PCQM4M/split_dict_v2.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` - split_names: ["train", "valid", "test-dev"] - seed: ${constants.seed} - label_normalization: - normalize_val_test: true - method: "normal" - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: "disk" - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - # cache_data_path: . - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. 
- -#Task-specific -predictor: - metrics_on_progress_bar: - homolumo: [] - metrics_on_training_set: - homolumo: [] - loss_fun: - homolumo: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: ${constants.epochs} - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - homolumo: - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/ogb_pcqm4mv2/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${constants.epochs} - min_epochs: 1 - check_val_every_n_epoch: 5 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: # Set as null to avoid a pre-nn network - out_dim: 64 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.18 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: null - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - in_dim: 64 # or otherwise the correct value - out_dim: &gnn_dim 768 - hidden_dims: *gnn_dim - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'none' - layer_type: 'pyg:gcn' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps - - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - homolumo: - task_level: graph - out_dim: 1 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none diff --git a/expts/iclr2023_configs/config_ogbpcq_GCN_4layers.yaml b/expts/iclr2023_configs/config_ogbpcq_GCN_4layers.yaml deleted file mode 100644 index ef46cda2a..000000000 --- a/expts/iclr2023_configs/config_ogbpcq_GCN_4layers.yaml +++ /dev/null @@ -1,263 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/PCQM4Mv2/" - epochs: 100 - name: ogb_pcqm4mv2_gcn - wandb: - entity: multitask-gnn - name: neurips2023_scaling_gcn - project: neurips2023_graphcore_scaling_mpnn - - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 100 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 100 - # Data handling-related - batch_size_training: 30 - batch_size_inference: 30 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 2 - - ipu_config: - - deviceIterations(60) # IPU would require large batches to be ready for the model. - - replicationFactor(16) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - # accelerator_kwargs: - # _accelerator: "ipu" - # gnn_layers_per_ipu: [4, 4, 4, 4] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - homolumo: - df: null - task_level: graph - df_path: graphium/data/PCQM4M/pcqm4mv2.csv - # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` - smiles_col: "cxsmiles" - label_cols: ["homo_lumo_gap"] # graph_* # graph_* means all columns starting with "graph_" - # sample_size: 100000 # use sample_size for test - splits_path: graphium/data/PCQM4M/split_dict_v2.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` - split_names: ["train", "valid", "test-dev"] - seed: ${constants.seed} - label_normalization: - normalize_val_test: true - method: "normal" - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: "disk" - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - # cache_data_path: . - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. 
- -#Task-specific -predictor: - metrics_on_progress_bar: - homolumo: [] - metrics_on_training_set: - homolumo: [] - loss_fun: - homolumo: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: ${constants.epochs} - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - homolumo: - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/ogb_pcqm4mv2/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${constants.epochs} - min_epochs: 1 - check_val_every_n_epoch: 5 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: # Set as null to avoid a pre-nn network - out_dim: 64 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.18 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: null - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - in_dim: 64 # or otherwise the correct value - out_dim: &gnn_dim 768 - hidden_dims: *gnn_dim - depth: 4 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'none' - layer_type: 'pyg:gcn' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps - - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - homolumo: - task_level: graph - out_dim: 1 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none diff --git a/expts/iclr2023_configs/config_ogbpcq_GINE_16layers.yaml b/expts/iclr2023_configs/config_ogbpcq_GINE_16layers.yaml deleted file mode 100644 index bde17fe0d..000000000 --- a/expts/iclr2023_configs/config_ogbpcq_GINE_16layers.yaml +++ /dev/null @@ -1,273 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/PCQM4Mv2/" - epochs: 100 - name: ogb_pcqm4mv2_gine - wandb: - entity: multitask-gnn - name: neurips2023_scaling_gine - project: neurips2023_graphcore_scaling_mpnn - - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 100 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 100 - # Data handling-related - batch_size_training: 30 - batch_size_inference: 30 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 8 - - ipu_config: - - deviceIterations(60) # IPU would require large batches to be ready for the model. - - replicationFactor(4) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [4, 4, 4, 4] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - homolumo: - df: null - task_level: graph - df_path: graphium/data/PCQM4M/pcqm4mv2.csv - # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` - smiles_col: "cxsmiles" - label_cols: ["homo_lumo_gap"] # graph_* # graph_* means all columns starting with "graph_" - # sample_size: 100000 # use sample_size for test - splits_path: graphium/data/PCQM4M/split_dict_v2.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` - split_names: ["train", "valid", "test-dev"] - seed: ${constants.seed} - label_normalization: - normalize_val_test: true - method: "normal" - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: "disk" - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - # cache_data_path: . - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. 
- -#Task-specific -predictor: - metrics_on_progress_bar: - homolumo: [] - metrics_on_training_set: - homolumo: [] - loss_fun: - homolumo: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: ${constants.epochs} - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - homolumo: - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/ogb_pcqm4mv2/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${constants.epochs} - min_epochs: 1 - check_val_every_n_epoch: 5 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: # Set as null to avoid a pre-nn network - out_dim: 64 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.18 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: # Set as null to avoid a pre-nn network - out_dim: 32 - hidden_dims: 128 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: *normalization - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - - gnn: # Set as null to avoid a post-nn network - out_dim: &gnn_dim 704 - hidden_dims: *gnn_dim - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'none' - layer_type: 'pyg:gine' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps - - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - - task_heads: - homolumo: - task_level: graph - out_dim: 1 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none diff --git a/expts/iclr2023_configs/config_ogbpcq_GINE_4layers.yaml b/expts/iclr2023_configs/config_ogbpcq_GINE_4layers.yaml deleted file mode 100644 index ff173adde..000000000 --- a/expts/iclr2023_configs/config_ogbpcq_GINE_4layers.yaml +++ /dev/null @@ -1,273 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/PCQM4Mv2/" - epochs: 100 - name: ogb_pcqm4mv2_gine - wandb: - entity: multitask-gnn - name: neurips2023_scaling_gine - project: neurips2023_graphcore_scaling_mpnn - - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 100 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 100 - # Data handling-related - batch_size_training: 30 - batch_size_inference: 30 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 2 - - ipu_config: - - deviceIterations(60) # IPU would require large batches to be ready for the model. - - replicationFactor(16) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - # accelerator_kwargs: - # _accelerator: "ipu" - # gnn_layers_per_ipu: [4, 4, 4, 4] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - homolumo: - df: null - task_level: graph - df_path: graphium/data/PCQM4M/pcqm4mv2.csv - # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` - smiles_col: "cxsmiles" - label_cols: ["homo_lumo_gap"] # graph_* # graph_* means all columns starting with "graph_" - # sample_size: 100000 # use sample_size for test - splits_path: graphium/data/PCQM4M/split_dict_v2.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` - split_names: ["train", "valid", "test-dev"] - seed: ${constants.seed} - label_normalization: - normalize_val_test: true - method: "normal" - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: "disk" - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - # cache_data_path: . - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. 
- -#Task-specific -predictor: - metrics_on_progress_bar: - homolumo: [] - metrics_on_training_set: - homolumo: [] - loss_fun: - homolumo: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: ${constants.epochs} - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - homolumo: - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/ogb_pcqm4mv2/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${constants.epochs} - min_epochs: 1 - check_val_every_n_epoch: 5 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: # Set as null to avoid a pre-nn network - out_dim: 64 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.18 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: # Set as null to avoid a pre-nn network - out_dim: 32 - hidden_dims: 128 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: *normalization - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - - gnn: # Set as null to avoid a post-nn network - out_dim: &gnn_dim 704 - hidden_dims: *gnn_dim - depth: 4 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'none' - layer_type: 'pyg:gine' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps - - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - - task_heads: - homolumo: - task_level: graph - out_dim: 1 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none diff --git a/expts/iclr2023_configs/config_ogbpcq_mpnn.yaml b/expts/iclr2023_configs/config_ogbpcq_mpnn.yaml deleted file mode 100644 index e57a403d7..000000000 --- a/expts/iclr2023_configs/config_ogbpcq_mpnn.yaml +++ /dev/null @@ -1,57 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -defaults: - - base_config: ogbpcqm4mv2 - - _self_ - -constants: - name: ogb_pcqm4mv2_mpnn - -architecture: - - pre_nn: - out_dim: 160 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: 64 - hidden_dims: 128 - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - gnn: # Set as null to avoid a post-nn network - in_dim: 160 # should be consistent with pre_nn.out_dim - out_dim: 256 - hidden_dims: &gnn_dim 160 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 4 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'none' - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: 160 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: 160 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: 64 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: 64 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null diff --git a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml b/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml deleted file mode 100644 index 61f645c44..000000000 --- a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++.yaml +++ /dev/null @@ -1,295 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. 
- -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/PCQM4Mv2/" - epochs: 100 - name: ogb_pcqm4mv2_mpnn_no1hot - wandb: - entity: multitask-gnn - name: neurips2023_scaling_mpnn_no1hot - project: neurips2023_graphcore_scaling_mpnn - - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 100 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 100 - # Data handling-related - batch_size_training: 30 - batch_size_inference: 30 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 8 - - ipu_config: - - deviceIterations(60) # IPU would require large batches to be ready for the model. - - replicationFactor(4) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [4, 4, 4, 4] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - homolumo: - df: null - task_level: graph - df_path: graphium/data/PCQM4M/pcqm4mv2.csv - # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` - smiles_col: "cxsmiles" - label_cols: ["homo_lumo_gap"] # graph_* # graph_* means all columns starting with "graph_" - # sample_size: 100000 # use sample_size for test - splits_path: graphium/data/PCQM4M/split_dict_v2.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` - split_names: ["train", "valid", "test-dev"] - seed: ${constants.seed} - label_normalization: - normalize_val_test: true - method: "normal" - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: "disk" - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - # atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [atomic-number, group, period, total-valence, degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - # cache_data_path: . - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. 
- -#Task-specific -predictor: - metrics_on_progress_bar: - homolumo: [] - metrics_on_training_set: - homolumo: [] - loss_fun: - homolumo: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: ${constants.epochs} - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - homolumo: - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/ogb_pcqm4mv2/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${constants.epochs} - min_epochs: 1 - check_val_every_n_epoch: 5 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: 256 - hidden_dims: 1024 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: 128 - hidden_dims: 512 - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: 256 - hidden_dims: &gnn_dim 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - homolumo: - task_level: graph - out_dim: 1 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none diff --git a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA.yaml b/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA.yaml deleted file mode 100644 index b97402680..000000000 --- a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA.yaml +++ /dev/null @@ -1,295 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. 
- -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/PCQM4Mv2/" - epochs: 100 - name: ogb_pcqm4mv2_mpnn_NewData - wandb: - entity: multitask-gnn - name: neurips2023_scaling_mpnn_NewData - project: neurips2023_graphcore_scaling_mpnn - - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 100 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 100 - # Data handling-related - batch_size_training: 30 - batch_size_inference: 30 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 8 - - ipu_config: - - deviceIterations(60) # IPU would require large batches to be ready for the model. - - replicationFactor(4) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(30) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - accelerator_kwargs: - _accelerator: "ipu" - gnn_layers_per_ipu: [4, 4, 4, 4] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - homolumo: - df: null - task_level: graph - df_path: /nethome/kerstink/QM9_finetuning/ogb-lsc/pcqm4mv2_conformers_28features/raw/data.csv - # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` - smiles_col: "smiles" - label_cols: ["homolumogap"] # graph_* # graph_* means all columns starting with "graph_" - # sample_size: 100000 # use sample_size for test - splits_path: /nethome/kerstink/QM9_finetuning/ogb-lsc/pcqm4mv2_conformers_28features/split_dict.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` - split_names: ["train", "valid", "test-dev"] - seed: ${constants.seed} - label_normalization: - normalize_val_test: true - method: "normal" - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: "disk" - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - # cache_data_path: . - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. 
- -#Task-specific -predictor: - metrics_on_progress_bar: - homolumo: [] - metrics_on_training_set: - homolumo: [] - loss_fun: - homolumo: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: ${constants.epochs} - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - homolumo: - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/ogb_pcqm4mv2/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${constants.epochs} - min_epochs: 1 - check_val_every_n_epoch: 5 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: 256 - hidden_dims: 1024 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: 128 - hidden_dims: 512 - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: 256 - hidden_dims: &gnn_dim 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 16 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - homolumo: - task_level: graph - out_dim: 1 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none diff --git a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA_4layers.yaml b/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA_4layers.yaml deleted file mode 100644 index 94535b7cc..000000000 --- a/expts/iclr2023_configs/config_ogbpcq_mpnn_GPS++_newDATA_4layers.yaml +++ /dev/null @@ -1,295 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. 
- -# @package _global_ - -constants: - seed: 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - datacache_path: "/localdata/PCQM4Mv2/" - epochs: 100 - name: ogb_pcqm4mv2_mpnn_NewData_4layers - wandb: - entity: multitask-gnn - name: neurips2023_scaling_mpnn_NewData_4layers - project: neurips2023_graphcore_scaling_mpnn - - -accelerator: - type: ipu # cpu or ipu or gpu - config_override: - datamodule: - args: - ipu_dataloader_training_opts: - mode: async - max_num_nodes_per_graph: 30 # train max nodes: 20, max_edges: 54 - max_num_edges_per_graph: 100 - ipu_dataloader_inference_opts: - mode: async - max_num_nodes_per_graph: 35 # valid max nodes: 51, max_edges: 118 - max_num_edges_per_graph: 100 - # Data handling-related - batch_size_training: 30 - batch_size_inference: 30 - predictor: - metrics_every_n_train_steps: 1000 - optim_kwargs: - loss_scaling: 1024 - trainer: - trainer: - precision: 16-true - accumulate_grad_batches: 2 - - ipu_config: - - deviceIterations(60) # IPU would require large batches to be ready for the model. - - replicationFactor(16) - # - enableProfiling("graph_analyser") # The folder where the profile will be stored - # - enableExecutableCaching("pop_compiler_cache") - - TensorLocations.numIOTiles(128) - - _Popart.set("defaultBufferingDepth", 96) - - Precision.enableStochasticRounding(True) - # - Precision.enableFloatingPointExceptions(True) - - ipu_inference_config: - # set device iteration and replication factor to 1 during inference - # gradient accumulation was set to 1 in the code - - deviceIterations(1) - - replicationFactor(1) - - Precision.enableStochasticRounding(False) - - # accelerator_kwargs: - # _accelerator: "ipu" - # gnn_layers_per_ipu: [4, 4, 4, 4] - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - homolumo: - df: null - task_level: graph - df_path: /nethome/kerstink/QM9_finetuning/ogb-lsc/pcqm4mv2_conformers_28features/raw/data.csv - # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv` - smiles_col: "smiles" - label_cols: ["homolumogap"] # graph_* # graph_* means all columns starting with "graph_" - # sample_size: 100000 # use sample_size for test - splits_path: /nethome/kerstink/QM9_finetuning/ogb-lsc/pcqm4mv2_conformers_28features/split_dict.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` - split_names: ["train", "valid", "test-dev"] - seed: ${constants.seed} - label_normalization: - normalize_val_test: true - method: "normal" - - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - dataloading_from: "disk" - processed_graph_data_path: ${constants.datacache_path} - featurization: - # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence), - # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring', - # 'num_chiral_centers (not included yet)'] - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring'] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: # encoder dropout 0.18 - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - # cache_data_path: . - num_workers: 32 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - # Using persistent_workers false might make the start of each epoch very long. 
- -#Task-specific -predictor: - metrics_on_progress_bar: - homolumo: [] - metrics_on_training_set: - homolumo: [] - loss_fun: - homolumo: mae_ipu - random_seed: ${constants.seed} - optim_kwargs: - lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs - # weight_decay: 1.e-7 - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: ${constants.epochs} - warmup_epochs: 10 - verbose: False - scheduler_kwargs: - # monitor: &monitor qm9/mae/train - # mode: min - # frequency: 1 - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - homolumo: - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - -trainer: - seed: ${constants.seed} - logger: - save_dir: logs/ogb_pcqm4mv2/ - name: ${constants.name} - project: ${constants.name} - model_checkpoint: - dirpath: models_checkpoints/${constants.name}/ - filename: ${constants.name} - # monitor: *monitor - # mode: *mode - # save_top_k: 1 - save_last: True - trainer: - max_epochs: ${constants.epochs} - min_epochs: 1 - check_val_every_n_epoch: 5 - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: - out_dim: 256 - hidden_dims: 1024 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: 128 - hidden_dims: 512 - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - gnn: # Set as null to avoid a post-nn network - # in_dim: 256 # should be consistent with pre_nn.out_dim - out_dim: 256 - hidden_dims: &gnn_dim 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 4 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'sum' - use_virtual_edges: true - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: 256 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: 128 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - droppath_rate_ffn: 0.3 - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - node: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - homolumo: - task_level: graph - out_dim: 1 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none diff --git a/expts/iclr2023_configs/config_ogbpcq_mpnn_JosefOG.yaml b/expts/iclr2023_configs/config_ogbpcq_mpnn_JosefOG.yaml deleted file mode 100644 index e57a403d7..000000000 --- a/expts/iclr2023_configs/config_ogbpcq_mpnn_JosefOG.yaml +++ /dev/null @@ -1,57 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. 
- -defaults: - - base_config: ogbpcqm4mv2 - - _self_ - -constants: - name: ogb_pcqm4mv2_mpnn - -architecture: - - pre_nn: - out_dim: 160 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: 64 - hidden_dims: 128 - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - gnn: # Set as null to avoid a post-nn network - in_dim: 160 # should be consistent with pre_nn.out_dim - out_dim: 256 - hidden_dims: &gnn_dim 160 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 4 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'none' - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: 160 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: 160 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: 64 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: 64 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null diff --git a/expts/iclr2023_configs/config_ogbpcq_mpnn_hydradims.yaml b/expts/iclr2023_configs/config_ogbpcq_mpnn_hydradims.yaml deleted file mode 100644 index 0dbea9917..000000000 --- a/expts/iclr2023_configs/config_ogbpcq_mpnn_hydradims.yaml +++ /dev/null @@ -1,101 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -defaults: - - base_config: ogbpcqm4mv2 - - _self_ - -constants: - name: ogb_pcqm4mv2_mpnn - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: # Set as null to avoid a pre-nn network - out_dim: 256 - hidden_dims: 1024 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: # Set as null to avoid a pre-nn network - out_dim: 128 - hidden_dims: 512 - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: *normalization - last_normalization: *normalization - residual_type: none - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. 
layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - - - gnn: # Set as null to avoid a post-nn network - out_dim: 256 - hidden_dims: 256 - depth: 4 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - layer_type: 'pyg:gps' - layer_kwargs: # Parameters for the model itself. You could define dropout_attn: 0.1 - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: 256 - out_dim: 256 - in_dim_edges: 128 - out_dim_edges: 128 - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - virtual_node: 'sum' - use_virtual_edges: true - - graph_output_nn: - graph: - pooling: [sum] - out_dim: 256 - hidden_dims: 256 - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - diff --git a/expts/iclr2023_configs/config_ogbpcq_mpnn_largerffn.yaml b/expts/iclr2023_configs/config_ogbpcq_mpnn_largerffn.yaml deleted file mode 100644 index 1a51857fe..000000000 --- a/expts/iclr2023_configs/config_ogbpcq_mpnn_largerffn.yaml +++ /dev/null @@ -1,58 +0,0 @@ -# Running the mpnn model with the largemix dataset on IPU. - -defaults: - - base_config: ogbpcqm4mv2 - - _self_ - -constants: - name: ogb_pcqm4mv2_mpnn - -architecture: - - pre_nn: - out_dim: 280 - hidden_dims: 512 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.1 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: - out_dim: 64 - hidden_dims: 128 - depth: 2 - activation: relu - last_activation: none - dropout: 0.18 - normalization: ${architecture.pre_nn.normalization} - last_normalization: ${architecture.pre_nn.normalization} - residual_type: none - - gnn: # Set as null to avoid a post-nn network - in_dim: 280 # should be consistent with pre_nn.out_dim - out_dim: 256 - hidden_dims: &gnn_dim 280 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer) - depth: 4 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - layer_type: 'pyg:gps' - layer_kwargs: - node_residual: false - mpnn_type: 'pyg:mpnnplus' - mpnn_kwargs: - in_dim: 280 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - out_dim: 280 # should consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer) - in_dim_edges: 64 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - out_dim_edges: 64 # should consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer) - attn_type: "none" # "full-attention", "none" - # biased_attention: false - attn_kwargs: null - virtual_node: 'sum' - use_virtual_edges: true diff --git a/graphium/features/featurizer.py b/graphium/features/featurizer.py index cb19e3b4b..d8efdb2ab 100644 --- a/graphium/features/featurizer.py +++ b/graphium/features/featurizer.py @@ -1062,10 +1062,7 @@ def mol_to_graph_dict( mol = Chem.AddHs(mol) else: mol = Chem.RemoveHs(mol) - 
# SAMUELM: Temp fix - max_num_bonds = 265 num_atoms = mol.GetNumAtoms() - num_bonds = mol.GetNumBonds() if (max_num_atoms is not None) and (num_atoms > max_num_atoms): raise ValueError(f"Maximum number of atoms greater than permitted {num_atoms}>{max_num_atoms}") ( From 120a447f5bab47f0eebd816d191f0f2b43192d6b Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander Date: Thu, 12 Oct 2023 18:22:00 +0000 Subject: [PATCH 55/58] Linting --- graphium/cli/train_finetune_test.py | 29 ++++++++++++++++++++++------- graphium/config/_loader.py | 2 +- graphium/trainer/predictor.py | 4 ++-- 3 files changed, 25 insertions(+), 10 deletions(-) diff --git a/graphium/cli/train_finetune_test.py b/graphium/cli/train_finetune_test.py index 4a2de6a04..d21fe3c3c 100644 --- a/graphium/cli/train_finetune_test.py +++ b/graphium/cli/train_finetune_test.py @@ -51,6 +51,7 @@ def cli(cfg: DictConfig) -> None: """ return run_training_finetuning_testing(cfg) + def get_replication_factor(cfg): try: ipu_config = cfg.get("accelerator", {}).get("ipu_config", []) @@ -63,15 +64,22 @@ def get_replication_factor(cfg): return int(item[start:end]) except Exception as e: print(f"An error occurred: {e}") - + # Return default value if replicationFactor is not found or an error occurred return 1 + def get_gradient_accumulation_factor(cfg): try: # Navigate through the nested dictionaries and get the gradient accumulation factor - grad_accumulation_factor = cfg.get("accelerator", {}).get("config_override", {}).get("trainer", {}).get("trainer", {}).get("accumulate_grad_batches", 1) - + grad_accumulation_factor = ( + cfg.get("accelerator", {}) + .get("config_override", {}) + .get("trainer", {}) + .get("trainer", {}) + .get("accumulate_grad_batches", 1) + ) + # Ensure that the extracted value is an integer return int(grad_accumulation_factor) except Exception as e: @@ -80,10 +88,17 @@ def get_gradient_accumulation_factor(cfg): # Return default value if an error occurred return 1 + def get_training_batch_size(cfg): try: # Navigate through the nested dictionaries and get the training batch size - batch_size_training = cfg.get("accelerator", {}).get("config_override", {}).get("datamodule", {}).get("args", {}).get("batch_size_training", 1) + batch_size_training = ( + cfg.get("accelerator", {}) + .get("config_override", {}) + .get("datamodule", {}) + .get("args", {}) + .get("batch_size_training", 1) + ) # Ensure that the extracted value is an integer return int(batch_size_training) @@ -107,7 +122,7 @@ def run_training_finetuning_testing(cfg: DictConfig) -> None: # Format the datetime as a string filename_datetime_suffix = now.strftime("%Y%m%d_%H%M%S") # Append the datetime string to the existing filename in the cfg dictionary - cfg['trainer']['model_checkpoint']['filename'] += f"_{filename_datetime_suffix}" + cfg["trainer"]["model_checkpoint"]["filename"] += f"_{filename_datetime_suffix}" dst_dir = cfg["constants"].get("results_dir") hydra_cfg = HydraConfig.get() @@ -129,9 +144,9 @@ def run_training_finetuning_testing(cfg: DictConfig) -> None: replicas = get_replication_factor(cfg) gradient_acc = get_gradient_accumulation_factor(cfg) micro_bs = get_training_batch_size(cfg) - + global_bs = replicas * gradient_acc * micro_bs - + # Disable wandb if the user is not logged in. 
wandb_cfg = cfg["constants"].get("wandb") if wandb_cfg is not None and wandb.login() is False: diff --git a/graphium/config/_loader.py b/graphium/config/_loader.py index fc005890c..cf2c80a3f 100644 --- a/graphium/config/_loader.py +++ b/graphium/config/_loader.py @@ -337,7 +337,7 @@ def load_predictor( featurization=featurization, task_norms=task_norms, replicas=replicas, - gradient_acc=gradient_acc, + gradient_acc=gradient_acc, global_bs=global_bs, **cfg_pred, ) diff --git a/graphium/trainer/predictor.py b/graphium/trainer/predictor.py index fe270bc44..dd6e77002 100644 --- a/graphium/trainer/predictor.py +++ b/graphium/trainer/predictor.py @@ -179,7 +179,7 @@ def __init__( # Wether save preds and targets for each training step. self.samples_seen = 0 - self.global_bs = global_bs + self.global_bs = global_bs def forward( self, inputs: Dict @@ -475,7 +475,7 @@ def on_train_batch_end(self, outputs, batch: Any, batch_idx: int) -> None: concatenated_metrics_logs = {} concatenated_metrics_logs["train/loss"] = outputs["loss"] concatenated_metrics_logs["epoch_count"] = self.current_epoch - # Incriment by the batch size + # Incriment by the batch size self.samples_seen += self.global_bs concatenated_metrics_logs["samples_seen"] = self.samples_seen From bef2b61f4b8dc960b816337e9706c73a462bd537 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander <68224909+s-maddrellmander@users.noreply.github.com> Date: Thu, 12 Oct 2023 19:36:08 +0100 Subject: [PATCH 56/58] Update graphium/trainer/predictor.py --- graphium/trainer/predictor.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/graphium/trainer/predictor.py b/graphium/trainer/predictor.py index dd6e77002..db029c345 100644 --- a/graphium/trainer/predictor.py +++ b/graphium/trainer/predictor.py @@ -456,9 +456,6 @@ def flag_step(self, batch: Dict[str, Tensor], step_name: str, to_cpu: bool) -> D step_dict[f"loss/{step_name}"] = loss.detach().cpu() step_dict["loss"] = loss step_dict["task_losses"] = task_losses - # samuelm - # self.samples_seen += 1 - # step_dict["samples_seen"] = self.samples_seen return step_dict def on_train_batch_start(self, batch: Any, batch_idx: int) -> Optional[int]: From 97558ffc7299c64f923f8c0e739a60b7c3450b48 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander <68224909+s-maddrellmander@users.noreply.github.com> Date: Thu, 12 Oct 2023 19:36:18 +0100 Subject: [PATCH 57/58] Update graphium/nn/pyg_layers/gps_pyg.py --- graphium/nn/pyg_layers/gps_pyg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphium/nn/pyg_layers/gps_pyg.py b/graphium/nn/pyg_layers/gps_pyg.py index 6982d6750..7af7107ac 100644 --- a/graphium/nn/pyg_layers/gps_pyg.py +++ b/graphium/nn/pyg_layers/gps_pyg.py @@ -147,7 +147,7 @@ def __init__( Keyword arguments to pass to the attention layer output_scale: - Float value that will be used to scale the activations, helps reudce growth of activations + Float value that will be used to scale the activations, helps reduce growth of activations as the model gets deeper. Default value of 1.0 leaves the layer unchanged. 
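For context on the helpers introduced in PATCH 55 above: the effective global batch size is assembled from three separate places in the Hydra config — the replicationFactor(...) entry of accelerator.ipu_config, accumulate_grad_batches under the trainer override, and batch_size_training under the datamodule override. The stand-alone Python sketch below redoes that arithmetic on a hand-written config dict; the dict, the regex-based parsing, and the printed numbers are illustrative stand-ins (the patch itself uses plain string slicing), with values borrowed from the example IPU configs earlier in this series.

import re

# Hand-written stand-in for the Hydra config; values mirror the example IPU configs above.
cfg = {
    "accelerator": {
        "ipu_config": ["deviceIterations(60)", "replicationFactor(4)"],
        "config_override": {
            "datamodule": {"args": {"batch_size_training": 30}},
            "trainer": {"trainer": {"accumulate_grad_batches": 8}},
        },
    },
}

def replication_factor(cfg: dict) -> int:
    # Parse `replicationFactor(N)` out of the ipu_config list; default to 1 if absent.
    for item in cfg.get("accelerator", {}).get("ipu_config", []):
        match = re.search(r"replicationFactor\((\d+)\)", item)
        if match:
            return int(match.group(1))
    return 1

replicas = replication_factor(cfg)
grad_acc = cfg["accelerator"]["config_override"]["trainer"]["trainer"]["accumulate_grad_batches"]
micro_bs = cfg["accelerator"]["config_override"]["datamodule"]["args"]["batch_size_training"]

# Same formula as run_training_finetuning_testing in the patch: replicas * grad accumulation * micro batch.
global_bs = replicas * grad_acc * micro_bs
print(global_bs)  # 4 * 8 * 30 = 960

Note that, as in the patch, deviceIterations does not enter this product.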
From 4002482b0821a2cba71f942510e9d94fd57346a5 Mon Sep 17 00:00:00 2001 From: Sam Maddrell-Mander <68224909+s-maddrellmander@users.noreply.github.com> Date: Thu, 12 Oct 2023 19:36:25 +0100 Subject: [PATCH 58/58] Update graphium/trainer/predictor.py --- graphium/trainer/predictor.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/graphium/trainer/predictor.py b/graphium/trainer/predictor.py index db029c345..588d7e3f2 100644 --- a/graphium/trainer/predictor.py +++ b/graphium/trainer/predictor.py @@ -384,9 +384,6 @@ def _general_step(self, batch: Dict[str, Tensor], step_name: str, to_cpu: bool) # print("loss ", self.global_step, self.current_epoch, loss) step_dict["task_losses"] = task_losses step_dict["gradient_norm"] = self.get_gradient_norm() - # samuelm - # self.samples_seen += 1 - # step_dict["samples_seen"] = self.samples_seen return step_dict def flag_step(self, batch: Dict[str, Tensor], step_name: str, to_cpu: bool) -> Dict[str, Any]:
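The last three patches also settle how samples_seen is tracked: the commented-out per-step increments in _general_step and flag_step are deleted, and the counter is instead advanced once per training batch in on_train_batch_end by the global batch size stored on the predictor. Below is a rough, self-contained sketch of that bookkeeping; the SampleCounter class, its calls, and the dummy loss values are invented for illustration and only mirror the logging flow of the patched Predictor.

class SampleCounter:
    def __init__(self, global_bs: int):
        # Set once from the config, as the patched Predictor.__init__ does with its global_bs argument.
        self.global_bs = global_bs
        self.samples_seen = 0

    def on_train_batch_end(self, epoch: int, train_loss: float) -> dict:
        logs = {"train/loss": train_loss, "epoch_count": epoch}
        # Increment by the effective batch size, mirroring the patched on_train_batch_end.
        self.samples_seen += self.global_bs
        logs["samples_seen"] = self.samples_seen
        return logs

counter = SampleCounter(global_bs=960)  # e.g. the value derived in the previous sketch
print(counter.on_train_batch_end(0, 0.42))                   # samples_seen == 960
print(counter.on_train_batch_end(0, 0.40)["samples_seen"])   # 1920

Keeping the increment in on_train_batch_end rather than in the step functions means the counter moves once per training batch by the full effective batch size, instead of by 1 per step as the removed commented-out code did.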