diff --git a/expts/hydra-configs/accelerator/ipu.yaml b/expts/hydra-configs/accelerator/ipu.yaml
index 43e4455ef..3e6fb4429 100644
--- a/expts/hydra-configs/accelerator/ipu.yaml
+++ b/expts/hydra-configs/accelerator/ipu.yaml
@@ -1,6 +1,8 @@
 type: ipu
 ipu_config:
-  - deviceIterations(30) # IPU would require large batches to be ready for the model.
+  - deviceIterations(60) # IPU would require large batches to be ready for the model.
+    # 60 for PCQM4mv2
+    # 30 for largemix
   - replicationFactor(16)
   # - enableProfiling("graph_analyser") # The folder where the profile will be stored
   # - enableExecutableCaching("pop_compiler_cache")
diff --git a/expts/hydra-configs/dataset/pcqm4m.yaml b/expts/hydra-configs/architecture/pcqm4m.yaml
similarity index 58%
rename from expts/hydra-configs/dataset/pcqm4m.yaml
rename to expts/hydra-configs/architecture/pcqm4m.yaml
index 391bb21de..494875765 100644
--- a/expts/hydra-configs/dataset/pcqm4m.yaml
+++ b/expts/hydra-configs/architecture/pcqm4m.yaml
@@ -1,70 +1,5 @@
 # @package _global_
 
-datamodule:
-  module_type: "MultitaskFromSmilesDataModule"
-  # module_type: "FakeDataModule"  # Option to use generated data
-  args: # Matches that in the test_multitask_datamodule.py case.
-    task_specific_args:   # To be replaced by a new class "DatasetParams"
-      homolumo:
-        df: null
-        task_level: "graph"
-        df_path: graphium/data/PCQM4M/pcqm4mv2.csv
-        # wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv
-        # or set path as https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv directly
-        smiles_col: "cxsmiles"
-        label_cols: ["homo_lumo_gap"]
-        # sample_size: 8000 # use sample_size for test
-        splits_path: graphium/data/PCQM4M/split_dict_v2.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt`
-        split_names: ["train", "valid", "test-dev"]
-        # graphium/data/PCQM4Mv2/split_dict.pt
-        # graphium/data/PCQM4Mv2/pcqm4m_split.csv
-        # split_val: 0.1
-        # split_test: 0.1
-        seed: ${constants.seed}
-        label_normalization:
-          method: "normal"
-
-    # Featurization
-    prepare_dict_or_graph: pyg:graph
-    featurization_n_jobs: 30
-    featurization_progress: True
-    featurization_backend: "loky"
-    processed_graph_data_path: "../datacache/PCQM4Mv2/"
-    featurization:
-      # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence),
-      # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring',
-      # 'num_chiral_centers (not included yet)']
-      atom_property_list_onehot: [atomic-number, group, period, total-valence]
-      atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring]
-      # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring']
-      edge_property_list: [bond-type-onehot, stereo, in-ring]
-      add_self_loop: False
-      explicit_H: False # if H is included
-      use_bonds_weights: False
-      pos_encoding_as_features: # encoder dropout 0.18
-        pos_types:
-          lap_eigvec:
-            pos_level: node
-            pos_type: laplacian_eigvec
-            num_pos: 8
-            normalization: "none" # nomrlization already applied on the eigen vectors
-            disconnected_comp: True # if eigen values/vector for disconnected graph are included
-          lap_eigval:
-            pos_level: node
-            pos_type: laplacian_eigval
-            num_pos: 8
-            normalization: "none" # nomrlization already applied on the eigen vectors
-            disconnected_comp: True # if eigen values/vector for disconnected graph are included
-          rw_pos: # use same name as pe_encoder
-            pos_level: node
-            pos_type: rw_return_probs
-            ksteps: 16
-
-    # cache_data_path: .
-    num_workers: 30 # -1 to use all
-    persistent_workers: False # if use persistent worker at the start of each epoch.
-    # Using persistent_workers false might make the start of each epoch very long.
-
 architecture:
   model_type: FullGraphMultiTaskNetwork
   mup_base_path: null
@@ -144,78 +79,46 @@ architecture:
     last_normalization: "none"
     residual_type: none
 
-  task_heads:
-    homolumo:
-      task_level: graph
-      out_dim: 1
-      hidden_dims: 256
-      depth: 2 # Not needed if we have hidden_dims
-      activation: relu
-      last_activation: none
-      dropout: *dropout
-      normalization: *normalization
-      last_normalization: "none"
-      residual_type: none
-
-#Task-specific
-predictor:
-  metrics_on_progress_bar:
-    homolumo: []
-  metrics_on_training_set:
-    homolumo: ["pearsonr"]
-  loss_fun:
-    homolumo: mae_ipu
-  random_seed: ${constants.seed}
-  optim_kwargs:
-    lr: 4.e-4 # warmup can be scheduled using torch_scheduler_kwargs
-    # weight_decay: 1.e-7
-  torch_scheduler_kwargs:
-    module_type: WarmUpLinearLR
-    max_num_epochs: &max_epochs 100
-    warmup_epochs: 10
-    verbose: False
-  scheduler_kwargs:
-    # monitor: &monitor homolumo/mae/train
-    # mode: min
-    # frequency: 1
-  target_nan_mask: null # null: no mask, 0: 0 mask, ignore: ignore nan values from loss
-  flag_kwargs:
-    n_steps: 0 # 1
-    alpha: 0.0 # 0.01
-
-# Task-specific
-metrics:
-  homolumo:
-    - name: mae
-      metric: mae_ipu
-      target_nan_mask: null
-      multitask_handling: mean-per-label
-      threshold_kwargs: null
-    - name: pearsonr
-      metric: pearsonr_ipu
-      threshold_kwargs: null
-      target_nan_mask: null
-      multitask_handling: mean-per-label
+datamodule:
+  module_type: "MultitaskFromSmilesDataModule"
+  # module_type: "FakeDataModule"  # Option to use generated data
+  args: # Matches that in the test_multitask_datamodule.py case.
+    # Featurization
+    prepare_dict_or_graph: pyg:graph
+    featurization_n_jobs: 30
+    featurization_progress: True
+    featurization_backend: "loky"
+    processed_graph_data_path: ${constants.datacache_path}
+    num_workers: 40 # -1 to use all
+    persistent_workers: False # whether to use persistent workers at the start of each epoch.
+    # Using persistent_workers false might make the start of each epoch very long.
+    featurization:
+      # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence),
+      # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring',
+      # 'num_chiral_centers (not included yet)']
+      atom_property_list_onehot: [atomic-number, group, period, total-valence]
+      atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring]
+      # OGB: ['possible_bond_type', 'possible_bond_stereo', 'possible_is_in_ring']
+      edge_property_list: [bond-type-onehot, stereo, in-ring]
+      add_self_loop: False
+      explicit_H: False # if H is included
+      use_bonds_weights: False
+      pos_encoding_as_features: # encoder dropout 0.18
+        pos_types:
+          lap_eigvec:
+            pos_level: node
+            pos_type: laplacian_eigvec
+            num_pos: 8
+            normalization: "none" # normalization already applied on the eigenvectors
+            disconnected_comp: True # if eigen values/vector for disconnected graph are included
+          lap_eigval:
+            pos_level: node
+            pos_type: laplacian_eigval
+            num_pos: 8
+            normalization: "none" # normalization already applied on the eigenvectors
+            disconnected_comp: True # if eigen values/vector for disconnected graph are included
+          rw_pos: # use same name as pe_encoder
+            pos_level: node
+            pos_type: rw_return_probs
+            ksteps: 16
 
-trainer:
-  seed: ${constants.seed}
-  logger:
-    save_dir: logs/PCQMv2
-    name: ${constants.name}
-    project: PCQMv2_mpnn
-  #early_stopping:
-  #  monitor: *monitor
-  #  min_delta: 0
-  #  patience: 10
-  #  mode: &mode min
-  model_checkpoint:
-    dirpath: models_checkpoints/PCMQ4Mv2/
-    filename: ${constants.name}
-    #monitor: *monitor
-    #mode: *mode
-    save_top_k: 1
-    every_n_epochs: 100
-  trainer:
-    max_epochs: *max_epochs
-    min_epochs: 1
-    check_val_every_n_epoch: 20
diff --git a/expts/hydra-configs/main.yaml b/expts/hydra-configs/main.yaml
index 24962eddc..d4b3beceb 100644
--- a/expts/hydra-configs/main.yaml
+++ b/expts/hydra-configs/main.yaml
@@ -13,4 +13,4 @@ defaults:
 
   # Specializations
   - training/accelerator: ${training}_${accelerator}
-  - training/model: ${training}_${model}
\ No newline at end of file
+  - training/model: ${training}_${model}
diff --git a/expts/hydra-configs/model/gpspp.yaml b/expts/hydra-configs/model/gpspp.yaml
index 0b231fcf1..7ec357fe1 100644
--- a/expts/hydra-configs/model/gpspp.yaml
+++ b/expts/hydra-configs/model/gpspp.yaml
@@ -1,5 +1,15 @@
 # @package _global_
 
+datamodule:
+  args:
+    batch_size_training: 32
+    featurization:
+      conformer_property_list: [positions_3d]
+
+trainer:
+  trainer:
+    accumulate_grad_batches: 2
+
 architecture:
   pe_encoders:
     encoders:
@@ -31,8 +41,3 @@ architecture:
       num_heads: 32
       droppath_rate_attn: 0.0
       droppath_rate_ffn: 0.0
-
-datamodule:
-  args: # Matches that in the test_multitask_datamodule.py case.
-    featurization:
-      conformer_property_list: [positions_3d]
diff --git a/expts/hydra-configs/model/mpnn.yaml b/expts/hydra-configs/model/mpnn.yaml
index 4a8a428e8..dce40c932 100644
--- a/expts/hydra-configs/model/mpnn.yaml
+++ b/expts/hydra-configs/model/mpnn.yaml
@@ -1,5 +1,13 @@
 # @package _global_
 
+datamodule:
+  args:
+    batch_size_training: 64
+
+trainer:
+  trainer:
+    accumulate_grad_batches: 1
+
 architecture:
   gnn:
     layer_type: 'pyg:gps'
diff --git a/expts/hydra-configs/tasks/pcqm4m.yaml b/expts/hydra-configs/tasks/pcqm4m.yaml
new file mode 100644
index 000000000..d92d381f7
--- /dev/null
+++ b/expts/hydra-configs/tasks/pcqm4m.yaml
@@ -0,0 +1,62 @@
+# @package _global_
+
+architecture:
+  task_heads:
+    homolumo:
+      task_level: graph
+      out_dim: 1
+      hidden_dims: 256
+      depth: 2 # Not needed if we have hidden_dims
+      activation: relu
+      last_activation: none
+      dropout: 0.18
+      normalization: layer_norm
+      last_normalization: "none"
+      residual_type: none
+
+# Task-specific
+predictor:
+  metrics_on_progress_bar:
+    homolumo: []
+  metrics_on_training_set:
+    homolumo: ["pearsonr"]
+  loss_fun:
+    homolumo: mae_ipu
+
+# Task-specific
+metrics:
+  homolumo:
+    - name: mae
+      metric: mae_ipu
+      target_nan_mask: null
+      multitask_handling: mean-per-label
+      threshold_kwargs: null
+    - name: pearsonr
+      metric: pearsonr_ipu
+      threshold_kwargs: null
+      target_nan_mask: null
+      multitask_handling: mean-per-label
+
+datamodule:
+  module_type: "MultitaskFromSmilesDataModule"
+  # module_type: "FakeDataModule"  # Option to use generated data
+  args: # Matches that in the test_multitask_datamodule.py case.
+    task_specific_args:   # To be replaced by a new class "DatasetParams"
+      homolumo:
+        df: null
+        task_level: "graph"
+        df_path: graphium/data/PCQM4M/pcqm4mv2.csv
+        # wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv
+        # or set path as https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv directly
+        smiles_col: "cxsmiles"
+        label_cols: ["homo_lumo_gap"]
+        # sample_size: 8000 # use sample_size for test
+        splits_path: graphium/data/PCQM4M/split_dict_v2.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt`
+        split_names: ["train", "valid", "test-dev"]
+        # graphium/data/PCQM4Mv2/split_dict.pt
+        # graphium/data/PCQM4Mv2/pcqm4m_split.csv
+        # split_val: 0.1
+        # split_test: 0.1
+        seed: ${constants.seed}
+        label_normalization:
+          method: "normal"
diff --git a/expts/hydra-configs/dataset/accelerator/pcqm4m_ipu.yaml b/expts/hydra-configs/training/accelerator/pcqm4m_ipu.yaml
similarity index 84%
rename from expts/hydra-configs/dataset/accelerator/pcqm4m_ipu.yaml
rename to expts/hydra-configs/training/accelerator/pcqm4m_ipu.yaml
index 6502f9414..a7e23f383 100644
--- a/expts/hydra-configs/dataset/accelerator/pcqm4m_ipu.yaml
+++ b/expts/hydra-configs/training/accelerator/pcqm4m_ipu.yaml
@@ -11,15 +11,13 @@ datamodule:
       max_num_nodes_per_graph: 30 # valid max nodes: 51, max_edges: 118
       max_num_edges_per_graph: 120
     # Data handling-related
-    batch_size_training: 32
     batch_size_inference: 16
 
 predictor:
-  metrics_every_n_train_steps: 1000
+  metrics_every_n_train_steps: 100
   optim_kwargs:
     loss_scaling: 1024
 
 trainer:
   trainer:
     precision: 16-true
-    accumulate_grad_batches: 2
diff --git a/expts/hydra-configs/experiment/pcqm4m_gpspp.yaml b/expts/hydra-configs/training/model/pcqm4m_gpspp.yaml
similarity index 89%
rename from expts/hydra-configs/experiment/pcqm4m_gpspp.yaml
rename to expts/hydra-configs/training/model/pcqm4m_gpspp.yaml
index a321e835d..e13c44aa0 100644
--- a/expts/hydra-configs/experiment/pcqm4m_gpspp.yaml
+++ b/expts/hydra-configs/training/model/pcqm4m_gpspp.yaml
@@ -7,6 +7,7 @@ constants:
   seed: 42
   max_epochs: 100
   raise_train_error: true # Whether the code should raise an error if it crashes during training
+  datacache_path: "/localdata/PCQM4Mv2/"
 
 trainer:
   model_checkpoint:
diff --git a/expts/hydra-configs/experiment/pcqm4m_mpnn.yaml b/expts/hydra-configs/training/model/pcqm4m_mpnn.yaml
similarity index 88%
rename from expts/hydra-configs/experiment/pcqm4m_mpnn.yaml
rename to expts/hydra-configs/training/model/pcqm4m_mpnn.yaml
index 08b1b1f3c..41b55eba1 100644
--- a/expts/hydra-configs/experiment/pcqm4m_mpnn.yaml
+++ b/expts/hydra-configs/training/model/pcqm4m_mpnn.yaml
@@ -7,6 +7,7 @@ constants:
   seed: 42
   max_epochs: 100
   raise_train_error: true # Whether the code should raise an error if it crashes during training
+  datacache_path: "/localdata/PCQM4Mv2/"
 
 trainer:
   model_checkpoint:
diff --git a/expts/hydra-configs/training/pcqm4m.yaml b/expts/hydra-configs/training/pcqm4m.yaml
new file mode 100644
index 000000000..910c78c67
--- /dev/null
+++ b/expts/hydra-configs/training/pcqm4m.yaml
@@ -0,0 +1,44 @@
+# @package _global_
+
+predictor:
+  random_seed: ${constants.seed}
+  optim_kwargs:
+    lr: 4.e-4 # warmup can be scheduled using torch_scheduler_kwargs
+    # weight_decay: 1.e-7
+  torch_scheduler_kwargs:
+    module_type: WarmUpLinearLR
+    max_num_epochs: &max_epochs 100
+    warmup_epochs: 10
+    verbose: False
+  scheduler_kwargs:
+    # monitor: &monitor homolumo/mae/train
+    # mode: min
+    # frequency: 1
+  target_nan_mask: null # null: no mask, 0: 0 mask, ignore: ignore nan values from loss
+  flag_kwargs:
+    n_steps: 0 # 1
+    alpha: 0.0 # 0.01
+
+
+trainer:
+  seed: ${constants.seed}
+  logger:
+    save_dir: logs/PCQMv2
+    name: ${constants.name}
+    project: PCQMv2_mpnn
+  #early_stopping:
+  #  monitor: *monitor
+  #  min_delta: 0
+  #  patience: 10
+  #  mode: &mode min
+  model_checkpoint:
+    dirpath: models_checkpoints/PCMQ4Mv2/
+    filename: ${constants.name}
+    #monitor: *monitor
+    #mode: *mode
+    save_top_k: 1
+    every_n_epochs: 100
+  trainer:
+    max_epochs: *max_epochs
+    min_epochs: 1
+    check_val_every_n_epoch: 20
diff --git a/expts/neurips2023_configs/base_config/large.yaml b/expts/neurips2023_configs/base_config/large.yaml
index da9760ab3..f7de28a6c 100644
--- a/expts/neurips2023_configs/base_config/large.yaml
+++ b/expts/neurips2023_configs/base_config/large.yaml
@@ -1,9 +1,10 @@
 # @package _global_
 
 constants:
-  seed: &seed 42
+  seed: 42
   raise_train_error: true # Whether the code should raise an error if it crashes during training
   entity: multitask-gnn
+  datacache_path: "/localdata/neurips2023-large/"
 
 accelerator:
   type: ipu # cpu or ipu or gpu
@@ -125,7 +126,7 @@ datamodule:
         # sample_size: 2000 # use sample_size for test
         task_level: node
         splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt`
-        seed: *seed
+        seed: ${constants.seed}
         label_normalization:
           normalize_val_test: True
           method: "normal"
@@ -136,7 +137,7 @@ datamodule:
     featurization_n_jobs: 30
     featurization_progress: True
     featurization_backend: "loky"
-    processed_graph_data_path: "../datacache/neurips2023-large/"
+    processed_graph_data_path: ${constants.datacache_path}
     featurization:
      # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence),
      # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring',
@@ -340,7 +341,7 @@ predictor:
     pcba_1328: bce_logits_ipu
     pcqm4m_g25: mae_ipu
     pcqm4m_n4: mae_ipu
-  random_seed: *seed
+  random_seed: ${constants.seed}
   optim_kwargs:
     lr: 1.e-4 # warmup can be scheduled using torch_scheduler_kwargs
     # weight_decay: 1.e-7
@@ -363,6 +364,9 @@ metrics:
       metric: auroc_ipu
       num_classes: 5
      task: multiclass
+      target_to_int: True
+      target_nan_mask: -1000
+      ignore_index: -1000
       multitask_handling: mean-per-label
       threshold_kwargs: null
     - name: avpr
@@ -376,15 +380,18 @@ metrics:
       threshold_kwargs: null
   l1000_mcf7: *classif_metrics
   pcba_1328:
+    # use auroc and averageprecision (non-IPU version) so that NaNs are handled correctly
     - name: auroc
-      metric: auroc_ipu
+      metric: auroc
       task: binary
       multitask_handling: mean-per-label
+      target_nan_mask: ignore
       threshold_kwargs: null
     - name: avpr
-      metric: average_precision_ipu
+      metric: averageprecision
       task: binary
       multitask_handling: mean-per-label
+      target_nan_mask: ignore
       threshold_kwargs: null
   pcqm4m_g25: &pcqm_metrics
     - name: mae
@@ -405,7 +412,7 @@ metrics:
   pcqm4m_n4: *pcqm_metrics
 
 trainer:
-  seed: *seed
+  seed: ${constants.seed}
   logger:
     save_dir: logs/neurips2023-large/
     name: ${constants.name}
diff --git a/expts/neurips2023_configs/config_large_gcn.yaml b/expts/neurips2023_configs/config_large_gcn.yaml
index 1dc397998..84f4dcaa6 100644
--- a/expts/neurips2023_configs/config_large_gcn.yaml
+++ b/expts/neurips2023_configs/config_large_gcn.yaml
@@ -9,4 +9,4 @@ constants:
 
 architecture:
   gnn:  # Set as null to avoid a post-nn network
-    layer_type: 'pyg:gcn' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps
\ No newline at end of file
+    layer_type: 'pyg:gcn' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps
diff --git a/expts/neurips2023_configs/config_large_mpnn.yaml b/expts/neurips2023_configs/config_large_mpnn.yaml
index b56f8dd2d..365927473 100644
--- a/expts/neurips2023_configs/config_large_mpnn.yaml
+++ b/expts/neurips2023_configs/config_large_mpnn.yaml
@@ -1,4 +1,4 @@
-# Testing the mpnn only model with the PCQMv2 dataset on IPU.
+# Running the mpnn model with the largemix dataset on IPU.
 defaults:
   - base_config: large
 
@@ -7,8 +7,20 @@ constants:
   name: neurips2023_large_data_mpnn
 
 architecture:
-  pre_nn_edges: # Set as null to avoid a pre-nn network
-    out_dim: 32
+
+  pre_nn:
+    out_dim: 160
+    hidden_dims: 256
+    depth: 2
+    activation: relu
+    last_activation: none
+    dropout: &dropout 0.1
+    normalization: &normalization layer_norm
+    last_normalization: *normalization
+    residual_type: none
+
+  pre_nn_edges:
+    out_dim: 64
     hidden_dims: 128
     depth: 2
     activation: relu
@@ -19,9 +31,26 @@ architecture:
     residual_type: none
 
   gnn:  # Set as null to avoid a post-nn network
-    out_dim: &gnn_dim 64
-    hidden_dims: *gnn_dim
-    layer_type: 'pyg:gps' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps
-    layer_kwargs: # Parameters for the model itself. You could define dropout_attn: 0.1
+    in_dim: 160 # should be consistent with pre_nn.out_dim
+    out_dim: 256
+    hidden_dims: &gnn_dim 160 # should be consistent with pre_nn.out_dim when multi-layer mpnn is used (ffn layer)
+    depth: 4
+    activation: gelu
+    last_activation: none
+    dropout: 0.1
+    normalization: "layer_norm"
+    last_normalization: *normalization
+    residual_type: simple
+    virtual_node: 'none'
+    layer_type: 'pyg:gps'
+    layer_kwargs:
+      node_residual: false
       mpnn_type: 'pyg:mpnnplus'
-      out_dim_edges: 32
+      mpnn_kwargs:
+        in_dim: 160 # should be consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer)
+        out_dim: 160 # should be consistent with pre_nn.out_dim when multi-layer mpnn is used (node_model layer)
+        in_dim_edges: 64 # should be consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer)
+        out_dim_edges: 64 # should be consistent with pre_nn_edges.out_dim when multi-layer mpnn is used (edge_model layer)
+      attn_type: "none" # "full-attention", "none"
+      # biased_attention: false
+      attn_kwargs: null
diff --git a/graphium/trainer/predictor.py b/graphium/trainer/predictor.py
index e98833b61..6fc139aab 100644
--- a/graphium/trainer/predictor.py
+++ b/graphium/trainer/predictor.py
@@ -464,6 +464,7 @@ def on_train_batch_end(self, outputs, batch: Any, batch_idx: int) -> None:
             concatenated_metrics_logs[f"train/loss/{task}"] = outputs["task_losses"][task]
 
         # get the mean loss value for individual tasks as they are a tensor of size --> gradient accumulation * replication * device_iter
+        # filter zeros out for the individual losses
         for key in concatenated_metrics_logs:
             if isinstance(concatenated_metrics_logs[key], torch.Tensor):
                 if concatenated_metrics_logs[key].numel() > 1:
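Note on the final predictor.py hunk: the new comment says the per-task loss arrives as a tensor of size gradient accumulation * replication * deviceIterations, and zero-filled slots should be dropped before the mean is logged. Below is a minimal sketch of that idea, not Graphium's actual implementation; the helper name and the fallback to the plain mean when every entry is zero are assumptions.

```python
import torch

def mean_nonzero_loss(losses: torch.Tensor) -> torch.Tensor:
    """Hypothetical helper: average per-step losses while ignoring zero entries.

    With gradient accumulation * replication * deviceIterations, the logged loss
    for a task is a vector; zero-filled slots would otherwise bias the mean.
    """
    nonzero = losses[losses != 0]
    # If every entry is zero, fall back to the plain mean to avoid returning NaN.
    return nonzero.mean() if nonzero.numel() > 0 else losses.mean()

# Example: only the three non-zero values contribute to the logged mean (~0.5).
print(mean_nonzero_loss(torch.tensor([0.0, 0.52, 0.48, 0.0, 0.50])))
```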