diff --git a/README.md b/README.md
index 290e84b1b..39764b620 100644
--- a/README.md
+++ b/README.md
@@ -85,21 +85,26 @@ If you are not familiar with [PyTorch](https://pytorch.org/docs) or [PyTorch-Lig
## Running an experiment
We have set up Graphium with `hydra` for managing config files. To run an experiment, go to the `expts/` folder. For example, to benchmark a GCN on the ToyMix dataset run
```bash
-python main_run_multitask.py dataset=toymix model=gcn
+graphium-train dataset=toymix model=gcn
```
To change parameters specific to this experiment, like switching from `fp16` to `fp32` precision, you can either override them directly in the CLI via
```bash
-python main_run_multitask.py dataset=toymix model=gcn trainer.trainer.precision=32
+graphium-train dataset=toymix model=gcn trainer.trainer.precision=32
```
or change them permanently in the dedicated experiment config under `expts/hydra-configs/toymix_gcn.yaml`.
Integrating `hydra` also allows you to quickly switch between accelerators. E.g., running
```bash
-python main_run_multitask.py dataset=toymix model=gcn accelerator=gpu
+graphium-train dataset=toymix model=gcn accelerator=gpu
```
automatically selects the correct configs to run the experiment on GPU.
+Finally, you can also run a fine-tuning loop:
+```bash
+graphium-train +finetuning=admet
+```
+
To use a config file you built from scratch you can run
```bash
-python main_run_multitask.py --config-path [PATH] --config-name [CONFIG]
+graphium-train --config-path [PATH] --config-name [CONFIG]
```
Thanks to the modular nature of `hydra` you can reuse many of our config settings for your own experiments with Graphium.
diff --git a/docs/cli_references.md b/docs/cli_references.md
index 52d72720f..b65bb2fba 100644
--- a/docs/cli_references.md
+++ b/docs/cli_references.md
@@ -5,4 +5,5 @@ This page provides documentation for our command line tools.
::: mkdocs-click
    :module: graphium.cli
    :command: main_cli
-    :command: data_cli
+    :style: table
+    :prog_name: graphium
diff --git a/docs/tutorials/model_training/running-multitask-ipu.ipynb b/docs/tutorials/model_training/running-multitask-ipu.ipynb
index 8da432e4d..05a972e9b 100644
--- a/docs/tutorials/model_training/running-multitask-ipu.ipynb
+++ b/docs/tutorials/model_training/running-multitask-ipu.ipynb
@@ -420,7 +420,14 @@
    "logger.info(metrics)\n",
    "\n",
    "predictor = load_predictor(\n",
-    "    cfg, model_class, model_kwargs, metrics, accelerator_type, datamodule.task_norms\n",
+    "    cfg,\n",
+    "    model_class,\n",
+    "    model_kwargs,\n",
+    "    metrics,\n",
+    "    datamodule.get_task_levels(),\n",
+    "    accelerator_type,\n",
+    "    datamodule.featurization,\n",
+    "    datamodule.task_norms\n",
    ")\n",
    "logger.info(predictor.model)\n",
    "logger.info(ModelSummary(predictor, max_depth=4))"
diff --git a/docs/tutorials/model_training/simple-molecular-model.ipynb b/docs/tutorials/model_training/simple-molecular-model.ipynb
index 717e1413e..26a45cfa0 100644
--- a/docs/tutorials/model_training/simple-molecular-model.ipynb
+++ b/docs/tutorials/model_training/simple-molecular-model.ipynb
@@ -10,30 +10,30 @@
    "\n",
    "The work flow of testing your code on the entire pipeline is as follows:\n",
    "\n",
-    "1. select a corresponding yaml file in the [expts/main_run_multitask.py](https://github.com/datamol-io/graphium/blob/master/expts/main_run_multitask.py) i.e. by `CONFIG_FILE = \"expts/configs/config_gps_10M_pcqm4m_mod.yaml\"`\n",
-    "2. modify the yaml config file\n",
-    "3. 
`python expts/main_run_multitask.py`\n", - "\n", - "There are multiple examples of YAML files located in the folder `graphium/expts/configs` that one can refer to when training a new model. The file `config_gps_10M_pcqm4m_mod.yaml` shows an example of running the GPS model on the pcqm4m dataset.\n", + "1. Select a subset of the [available configs](https://github.com/datamol-io/graphium/tree/main/expts/hydra-configs) as a starting point.\n", + "2. Create additional configs or modify the existing configs to suit your needs.\n", + "3. Train or fine-tune a model with the `graphium-train` CLI.\n", "\n", "## Creating the yaml file\n", "\n", - "The first step is to create a YAML file containing all the required configurations, with an example given at `graphium/expts/config_gps_10M_pcqm4m_mod.yaml`. We will go through each part of the configurations." + "The first step is to create a YAML file containing all the required configurations, with an example given at `graphium/expts/hydra-configs/main.yaml`. We will go through each part of the configurations. See also the README [here](https://github.com/datamol-io/graphium/tree/main/expts/hydra-configs)." ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "import yaml\n", - "import omegaconf" + "import omegaconf\n", + "\n", + "from hydra import compose, initialize" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -57,8 +57,8 @@ ], "source": [ "# First, let's read the yaml configuration file\n", - "with open(\"../../../expts/configs/config_gps_10M_pcqm4m_mod.yaml\", \"r\") as file:\n", - " yaml_config = yaml.load(file, Loader=yaml.FullLoader)\n", + "with initialize(version_base=None, config_path=\"../../../expts/hydra-configs\"):\n", + " yaml_config = compose(config_name=\"main\")\n", "\n", "print(\"Yaml file loaded\")" ] @@ -74,7 +74,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -82,11 +82,11 @@ "output_type": "stream", "text": [ "constants:\n", - " name: pcqm4mv2_mpnn_4layer\n", + " name: neurips2023_small_data_gcn\n", " seed: 42\n", + " max_epochs: 100\n", + " data_dir: expts/data/neurips2023/small-dataset\n", " raise_train_error: true\n", - " accelerator:\n", - " type: gpu\n", "\n" ] } @@ -108,7 +108,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -118,22 +118,14 @@ "datamodule:\n", " module_type: MultitaskFromSmilesDataModule\n", " args:\n", - " task_specific_args:\n", - " homolumo:\n", - " df: null\n", - " task_level: graph\n", - " df_path: ~/scratch/data/graphium/data/PCQM4M/pcqm4mv2-20k.csv\n", - " smiles_col: cxsmiles\n", - " label_cols:\n", - " - homo_lumo_gap\n", - " split_val: 0.1\n", - " split_test: 0.1\n", " prepare_dict_or_graph: pyg:graph\n", - " featurization_n_jobs: 30\n", + " featurization_n_jobs: 4\n", " featurization_progress: true\n", " featurization_backend: loky\n", + " processed_graph_data_path: ../datacache/neurips2023-small/\n", + " num_workers: 4\n", + " persistent_workers: false\n", " featurization:\n", - " mask_nan: 0\n", " atom_property_list_onehot:\n", " - atomic-number\n", " - group\n", @@ -149,55 +141,94 @@ " - bond-type-onehot\n", " - stereo\n", " - in-ring\n", - " conformer_property_list:\n", - " - positions_3d\n", " 
add_self_loop: false\n", " explicit_H: false\n", " use_bonds_weights: false\n", " pos_encoding_as_features:\n", " pos_types:\n", - " node_laplacian_eigvec:\n", - " pos_type: laplacian_eigvec\n", + " lap_eigvec:\n", " pos_level: node\n", + " pos_type: laplacian_eigvec\n", " num_pos: 8\n", " normalization: none\n", " disconnected_comp: true\n", - " node_laplacian_eigval:\n", - " pos_type: laplacian_eigval\n", + " lap_eigval:\n", " pos_level: node\n", + " pos_type: laplacian_eigval\n", " num_pos: 8\n", " normalization: none\n", " disconnected_comp: true\n", - " rw_return_probs:\n", - " pos_type: rw_return_probs\n", + " rw_pos:\n", " pos_level: node\n", - " ksteps:\n", - " - 4\n", - " - 8\n", - " nodepair_rw_transition_probs:\n", - " pos_type: rw_transition_probs\n", - " pos_level: edge\n", - " ksteps:\n", - " - 2\n", - " - 4\n", - " nodepair_rw_return_probs:\n", " pos_type: rw_return_probs\n", - " pos_level: nodepair\n", - " ksteps:\n", - " - 4\n", - " electrostatic:\n", - " pos_type: electrostatic\n", - " pos_level: node\n", - " edge_commute:\n", - " pos_type: commute\n", - " pos_level: edge\n", - " nodepair_graphormer:\n", - " pos_type: graphormer\n", - " pos_level: nodepair\n", - " batch_size_training: 64\n", - " batch_size_inference: 16\n", - " num_workers: 0\n", - " persistent_workers: false\n", + " ksteps: 16\n", + " task_specific_args:\n", + " qm9:\n", + " df: null\n", + " df_path: ${constants.data_dir}/qm9.csv.gz\n", + " smiles_col: smiles\n", + " label_cols:\n", + " - A\n", + " - B\n", + " - C\n", + " - mu\n", + " - alpha\n", + " - homo\n", + " - lumo\n", + " - gap\n", + " - r2\n", + " - zpve\n", + " - u0\n", + " - u298\n", + " - h298\n", + " - g298\n", + " - cv\n", + " - u0_atom\n", + " - u298_atom\n", + " - h298_atom\n", + " - g298_atom\n", + " splits_path: ${constants.data_dir}/qm9_random_splits.pt\n", + " seed: ${constants.seed}\n", + " task_level: graph\n", + " label_normalization:\n", + " normalize_val_test: true\n", + " method: normal\n", + " tox21:\n", + " df: null\n", + " df_path: ${constants.data_dir}/Tox21-7k-12-labels.csv.gz\n", + " smiles_col: smiles\n", + " label_cols:\n", + " - NR-AR\n", + " - NR-AR-LBD\n", + " - NR-AhR\n", + " - NR-Aromatase\n", + " - NR-ER\n", + " - NR-ER-LBD\n", + " - NR-PPAR-gamma\n", + " - SR-ARE\n", + " - SR-ATAD5\n", + " - SR-HSE\n", + " - SR-MMP\n", + " - SR-p53\n", + " splits_path: ${constants.data_dir}/Tox21_random_splits.pt\n", + " seed: ${constants.seed}\n", + " task_level: graph\n", + " zinc:\n", + " df: null\n", + " df_path: ${constants.data_dir}/ZINC12k.csv.gz\n", + " smiles_col: smiles\n", + " label_cols:\n", + " - SA\n", + " - logp\n", + " - score\n", + " splits_path: ${constants.data_dir}/ZINC12k_random_splits.pt\n", + " seed: ${constants.seed}\n", + " task_level: graph\n", + " label_normalization:\n", + " normalize_val_test: true\n", + " method: normal\n", + " batch_size_training: 200\n", + " batch_size_inference: 200\n", "\n" ] } @@ -226,7 +257,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -237,134 +268,106 @@ " model_type: FullGraphMultiTaskNetwork\n", " mup_base_path: null\n", " pre_nn:\n", - " out_dim: 32\n", - " hidden_dims: 64\n", + " out_dim: 64\n", + " hidden_dims: 256\n", " depth: 2\n", " activation: relu\n", " last_activation: none\n", - " dropout: 0.1\n", + " dropout: 0.18\n", " normalization: layer_norm\n", - " last_normalization: layer_norm\n", - " residual_type: none\n", - " pre_nn_edges:\n", - " out_dim: 16\n", - " hidden_dims: 32\n", - " depth: 2\n", 
- " activation: relu\n", - " last_activation: none\n", - " dropout: 0.1\n", - " normalization: layer_norm\n", - " last_normalization: layer_norm\n", + " last_normalization: ${architecture.pre_nn.normalization}\n", " residual_type: none\n", + " pre_nn_edges: null\n", " pe_encoders:\n", " out_dim: 32\n", - " edge_out_dim: 16\n", " pool: sum\n", " last_norm: None\n", " encoders:\n", - " emb_la_pos:\n", + " la_pos:\n", " encoder_type: laplacian_pe\n", " input_keys:\n", " - laplacian_eigvec\n", " - laplacian_eigval\n", " output_keys:\n", " - feat\n", - " hidden_dim: 32\n", + " hidden_dim: 64\n", + " out_dim: 32\n", " model_type: DeepSet\n", " num_layers: 2\n", " num_layers_post: 1\n", " dropout: 0.1\n", " first_normalization: none\n", - " emb_rwse:\n", + " rw_pos:\n", " encoder_type: mlp\n", " input_keys:\n", " - rw_return_probs\n", " output_keys:\n", " - feat\n", - " hidden_dim: 32\n", + " hidden_dim: 64\n", + " out_dim: 32\n", " num_layers: 2\n", " dropout: 0.1\n", " normalization: layer_norm\n", " first_normalization: layer_norm\n", - " emb_electrostatic:\n", - " encoder_type: mlp\n", - " input_keys:\n", - " - electrostatic\n", - " output_keys:\n", - " - feat\n", - " hidden_dim: 32\n", - " num_layers: 1\n", - " dropout: 0.1\n", - " normalization: layer_norm\n", - " first_normalization: layer_norm\n", - " emb_edge_rwse:\n", - " encoder_type: mlp\n", - " input_keys:\n", - " - edge_rw_transition_probs\n", - " output_keys:\n", - " - edge_feat\n", - " hidden_dim: 32\n", - " num_layers: 1\n", - " dropout: 0.1\n", - " normalization: layer_norm\n", - " emb_edge_pes:\n", - " encoder_type: cat_mlp\n", - " input_keys:\n", - " - edge_rw_transition_probs\n", - " - edge_commute\n", - " output_keys:\n", - " - edge_feat\n", - " hidden_dim: 32\n", - " num_layers: 1\n", - " dropout: 0.1\n", - " normalization: layer_norm\n", - " gaussian_pos:\n", - " encoder_type: gaussian_kernel\n", - " input_keys:\n", - " - positions_3d\n", - " output_keys:\n", - " - feat\n", - " - nodepair_gaussian_bias_3d\n", - " num_heads: 2\n", - " num_layers: 2\n", - " embed_dim: 32\n", - " use_input_keys_prefix: false\n", " gnn:\n", - " out_dim: 32\n", - " hidden_dims: 32\n", + " in_dim: 64\n", + " out_dim: 96\n", + " hidden_dims: 96\n", " depth: 4\n", " activation: gelu\n", " last_activation: none\n", - " dropout: 0.0\n", + " dropout: 0.1\n", " normalization: layer_norm\n", - " last_normalization: layer_norm\n", + " last_normalization: ${architecture.pre_nn.normalization}\n", " residual_type: simple\n", - " pooling:\n", - " - sum\n", " virtual_node: none\n", - " layer_type: pyg:gps\n", - " layer_kwargs:\n", - " node_residual: false\n", - " mpnn_type: pyg:mpnnplus\n", - " mpnn_kwargs:\n", - " in_dim: 32\n", - " out_dim: 32\n", - " in_dim_edges: 16\n", - " out_dim_edges: 16\n", - " attn_type: full-attention\n", - " attn_kwargs:\n", - " num_heads: 2\n", - " biased_attention_key: nodepair_gaussian_bias_3d\n", - " post_nn: null\n", + " layer_type: pyg:gcn\n", + " layer_kwargs: null\n", + " graph_output_nn:\n", + " graph:\n", + " pooling:\n", + " - sum\n", + " out_dim: 96\n", + " hidden_dims: 96\n", + " depth: 1\n", + " activation: relu\n", + " last_activation: none\n", + " dropout: ${architecture.pre_nn.dropout}\n", + " normalization: ${architecture.pre_nn.normalization}\n", + " last_normalization: none\n", + " residual_type: none\n", " task_heads:\n", - " homolumo:\n", - " out_dim: 1\n", - " hidden_dims: 256\n", + " qm9:\n", + " task_level: graph\n", + " out_dim: 19\n", + " hidden_dims: 128\n", " depth: 2\n", " activation: relu\n", " 
last_activation: none\n", - " dropout: 0.1\n", - " normalization: layer_norm\n", + " dropout: ${architecture.pre_nn.dropout}\n", + " normalization: ${architecture.pre_nn.normalization}\n", + " last_normalization: none\n", + " residual_type: none\n", + " tox21:\n", + " task_level: graph\n", + " out_dim: 12\n", + " hidden_dims: 64\n", + " depth: 2\n", + " activation: relu\n", + " last_activation: none\n", + " dropout: ${architecture.pre_nn.dropout}\n", + " normalization: ${architecture.pre_nn.normalization}\n", + " last_normalization: none\n", + " residual_type: none\n", + " zinc:\n", + " task_level: graph\n", + " out_dim: 3\n", + " hidden_dims: 32\n", + " depth: 2\n", + " activation: relu\n", + " last_activation: none\n", + " dropout: ${architecture.pre_nn.dropout}\n", + " normalization: ${architecture.pre_nn.normalization}\n", " last_normalization: none\n", " residual_type: none\n", "\n" @@ -386,7 +389,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -395,24 +398,28 @@ "text": [ "predictor:\n", " metrics_on_progress_bar:\n", - " homolumo:\n", + " qm9:\n", + " - mae\n", + " tox21:\n", + " - auroc\n", + " zinc:\n", " - mae\n", - " - pearsonr\n", " loss_fun:\n", - " homolumo: mse_ipu\n", - " random_seed: 42\n", + " qm9: mae_ipu\n", + " tox21: bce_logits_ipu\n", + " zinc: mae_ipu\n", + " random_seed: ${constants.seed}\n", " optim_kwargs:\n", - " lr: 0.0004\n", + " lr: 4.0e-05\n", " torch_scheduler_kwargs:\n", " module_type: WarmUpLinearLR\n", - " max_num_epochs: 5\n", + " max_num_epochs: ${constants.max_epochs}\n", " warmup_epochs: 10\n", " verbose: false\n", " scheduler_kwargs: null\n", " target_nan_mask: null\n", - " flag_kwargs:\n", - " n_steps: 0\n", - " alpha: 0.0\n", + " multitask_handling: flatten\n", + " metrics_every_n_train_steps: 300\n", "\n" ] } @@ -434,7 +441,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -442,7 +449,54 @@ "output_type": "stream", "text": [ "metrics:\n", - " homolumo:\n", + " qm9:\n", + " - name: mae\n", + " metric: mae_ipu\n", + " target_nan_mask: null\n", + " multitask_handling: flatten\n", + " threshold_kwargs: null\n", + " - name: pearsonr\n", + " metric: pearsonr_ipu\n", + " threshold_kwargs: null\n", + " target_nan_mask: null\n", + " multitask_handling: mean-per-label\n", + " - name: r2_score\n", + " metric: r2_score_ipu\n", + " target_nan_mask: null\n", + " multitask_handling: mean-per-label\n", + " threshold_kwargs: null\n", + " tox21:\n", + " - name: auroc\n", + " metric: auroc_ipu\n", + " task: binary\n", + " multitask_handling: mean-per-label\n", + " threshold_kwargs: null\n", + " - name: avpr\n", + " metric: average_precision_ipu\n", + " task: binary\n", + " multitask_handling: mean-per-label\n", + " threshold_kwargs: null\n", + " - name: f1 > 0.5\n", + " metric: f1\n", + " multitask_handling: mean-per-label\n", + " target_to_int: true\n", + " num_classes: 2\n", + " average: micro\n", + " threshold_kwargs:\n", + " operator: greater\n", + " threshold: 0.5\n", + " th_on_preds: true\n", + " th_on_target: true\n", + " - name: precision > 0.5\n", + " metric: precision\n", + " multitask_handling: mean-per-label\n", + " average: micro\n", + " threshold_kwargs:\n", + " operator: greater\n", + " threshold: 0.5\n", + " th_on_preds: true\n", + " th_on_target: true\n", + " zinc:\n", " - name: mae\n", " metric: mae_ipu\n", " target_nan_mask: null\n", @@ -453,6 +507,11 @@ " threshold_kwargs: null\n", " target_nan_mask: null\n", " 
multitask_handling: mean-per-label\n", + " - name: r2_score\n", + " metric: r2_score_ipu\n", + " target_nan_mask: null\n", + " multitask_handling: mean-per-label\n", + " threshold_kwargs: null\n", "\n" ] } @@ -472,7 +531,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -480,21 +539,17 @@ "output_type": "stream", "text": [ "trainer:\n", - " logger:\n", - " save_dir: logs/PCQMv2\n", - " name: pcqm4mv2_mpnn_4layer\n", - " project: PCQMv2_mpnn\n", + " seed: ${constants.seed}\n", " model_checkpoint:\n", - " dirpath: models_checkpoints/PCMQv2/\n", - " filename: pcqm4mv2_mpnn_4layer\n", - " save_top_k: 1\n", - " every_n_epochs: 100\n", + " filename: ${constants.name}\n", + " save_last: true\n", + " dirpath: models_checkpoints/neurips2023-small-gcn/\n", " trainer:\n", " precision: 32\n", - " max_epochs: 5\n", + " max_epochs: ${constants.max_epochs}\n", " min_epochs: 1\n", - " accumulate_grad_batches: 2\n", " check_val_every_n_epoch: 20\n", + " accumulate_grad_batches: 1\n", "\n" ] } @@ -511,16 +566,13 @@ "\n", "Now that we defined all the configuration files, we want to train the model. The steps are fairly easy using the config loaders, and are given below.\n", "\n", - "First make sure the dataset file is downloaded. \n", - "Using `config_gps_10M_pcqm4m.yaml` as an example, if the file at `df_path` in the config is downloaded.\n", - "In this case, we need to download `pcqm4mv2-20k.csv` into the specified directory `graphium/data/PCQM4M/pcqm4mv2-20k.csv`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "$`python expts/main_run_multitask.py`" + "First make sure the dataset file is downloaded. Using `config_gps_10M_pcqm4m.yaml` as an example, make sure the file specified by `df_path` in the config is available.\n", + "In this case, we need to download `pcqm4mv2-20k.csv` into the specified directory `graphium/data/PCQM4M/pcqm4mv2-20k.csv`.\n", + "\n", + "After that, we can simply run a training through the CLI:\n", + "```bash\n", + "graphium-train\n", + "```" ] } ], @@ -543,7 +595,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.10.12" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/env.yml b/env.yml index 7fc668692..e49d071a4 100644 --- a/env.yml +++ b/env.yml @@ -28,7 +28,7 @@ dependencies: - gcsfs >=2021.6 # ML packages - - cudatoolkit # works also with CPU-only system. + - cuda-version # works also with CPU-only system. - pytorch >=1.12 - lightning >=2.0 - torchmetrics >=0.7.0,<0.11 diff --git a/expts/__init__.py b/expts/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/expts/configs/config_tdc_admet_demo.yaml b/expts/configs/config_tdc_admet_demo.yaml deleted file mode 100644 index aac4d0e50..000000000 --- a/expts/configs/config_tdc_admet_demo.yaml +++ /dev/null @@ -1,315 +0,0 @@ -# Testing the gcn model with the PCQMv2 dataset on IPU. 
-constants: - name: &name tdc_admet_demo - seed: &seed 42 - raise_train_error: true # Whether the code should raise an error if it crashes during training - -accelerator: - type: gpu # cpu or ipu or gpu - -datamodule: - module_type: "ADMETBenchmarkDataModule" - args: - # TDC specific - tdc_benchmark_names: null - tdc_train_val_seed: *seed - # Featurization - prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True - featurization_backend: "loky" - processed_graph_data_path: "../datacache/tdc-admet-demo/" - featurization: - atom_property_list_onehot: [atomic-number, group, period, total-valence] - atom_property_list_float: [degree, formal-charge, radical-electron, aromatic, in-ring] - edge_property_list: [bond-type-onehot, stereo, in-ring] - add_self_loop: False - explicit_H: False # if H is included - use_bonds_weights: False - pos_encoding_as_features: - pos_types: - lap_eigvec: - pos_level: node - pos_type: laplacian_eigvec - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - lap_eigval: - pos_level: node - pos_type: laplacian_eigval - num_pos: 8 - normalization: "none" # nomrlization already applied on the eigen vectors - disconnected_comp: True # if eigen values/vector for disconnected graph are included - rw_pos: # use same name as pe_encoder - pos_level: node - pos_type: rw_return_probs - ksteps: 16 - - num_workers: -1 # -1 to use all - persistent_workers: False # if use persistent worker at the start of each epoch. - - -architecture: - model_type: FullGraphMultiTaskNetwork - mup_base_path: null - pre_nn: # Set as null to avoid a pre-nn network - out_dim: 64 - hidden_dims: 256 - depth: 2 - activation: relu - last_activation: none - dropout: &dropout 0.18 - normalization: &normalization layer_norm - last_normalization: *normalization - residual_type: none - - pre_nn_edges: null # Set as null to avoid a pre-nn network - - pe_encoders: - out_dim: 32 - pool: "sum" #"mean" "max" - last_norm: None #"batch_norm", "layer_norm" - encoders: #la_pos | rw_pos - la_pos: # Set as null to avoid a pre-nn network - encoder_type: "laplacian_pe" - input_keys: ["laplacian_eigvec", "laplacian_eigval"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - model_type: 'DeepSet' #'Transformer' or 'DeepSet' - num_layers: 2 - num_layers_post: 1 # Num. layers to apply after pooling - dropout: 0.1 - first_normalization: "none" #"batch_norm" or "layer_norm" - rw_pos: - encoder_type: "mlp" - input_keys: ["rw_return_probs"] - output_keys: ["feat"] - hidden_dim: 64 - out_dim: 32 - num_layers: 2 - dropout: 0.1 - normalization: "layer_norm" #"batch_norm" or "layer_norm" - first_normalization: "layer_norm" #"batch_norm" or "layer_norm" - - - - gnn: # Set as null to avoid a post-nn network - in_dim: 64 # or otherwise the correct value - out_dim: &gnn_dim 96 - hidden_dims: *gnn_dim - depth: 4 - activation: gelu - last_activation: none - dropout: 0.1 - normalization: "layer_norm" - last_normalization: *normalization - residual_type: simple - virtual_node: 'none' - layer_type: 'pyg:gcn' #pyg:gine #'pyg:gps' # pyg:gated-gcn, pyg:gine,pyg:gps - layer_kwargs: null # Parameters for the model itself. 
You could define dropout_attn: 0.1 - - - graph_output_nn: - graph: - pooling: [sum] - out_dim: *gnn_dim - hidden_dims: *gnn_dim - depth: 1 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - - task_heads: - caco2_wang: ®ression_head - task_level: graph - out_dim: 1 - hidden_dims: 64 - depth: 2 - activation: relu - last_activation: none - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - hia_hou: &classification_head - task_level: graph - out_dim: 1 - hidden_dims: 64 - depth: 2 - activation: relu - last_activation: sigmoid - dropout: *dropout - normalization: *normalization - last_normalization: "none" - residual_type: none - pgp_broccatelli: *classification_head - bioavailability_ma: *classification_head - lipophilicity_astrazeneca: *regression_head - solubility_aqsoldb: *regression_head - bbb_martins: *classification_head - ppbr_az: *regression_head - vdss_lombardo: *regression_head - cyp2d6_veith: *classification_head - cyp3a4_veith: *classification_head - cyp2c9_veith: *classification_head - cyp2d6_substrate_carbonmangels: *classification_head - cyp3a4_substrate_carbonmangels: *classification_head - cyp2c9_substrate_carbonmangels: *classification_head - half_life_obach: *regression_head - clearance_microsome_az: *regression_head - clearance_hepatocyte_az: *regression_head - herg: *classification_head - ames: *classification_head - dili: *classification_head - ld50_zhu: *regression_head - -#Task-specific -predictor: - metrics_on_progress_bar: - # All below metrics are directly copied from the TDC website. - # For more information, see https://tdcommons.ai/benchmark/admet_group/overview/ - caco2_wang: ["mae"] - hia_hou: ["auroc"] - pgp_broccatelli: ["auroc"] - bioavailability_ma: ["auroc"] - lipophilicity_astrazeneca: ["mae"] - solubility_aqsoldb: ["mae"] - bbb_martins: ["auroc"] - ppbr_az: ["mae"] - vdss_lombardo: ["spearman"] - cyp2d6_veith: ["auprc"] - cyp3a4_veith: ["auprc"] - cyp2c9_veith: ["auprc"] - cyp2d6_substrate_carbonmangels: ["auprc"] - cyp3a4_substrate_carbonmangels: ["auprc"] - cyp2c9_substrate_carbonmangels: ["auprc"] - half_life_obach: ["spearman"] - clearance_microsome_az: ["spearman"] - clearance_hepatocyte_az: ["spearman"] - herg: ["mae"] - ames: ["auroc"] - dili: ["auroc"] - ld50_zhu: ["auroc"] - loss_fun: - caco2_wang: mae - hia_hou: bce - pgp_broccatelli: bce - bioavailability_ma: bce - lipophilicity_astrazeneca: mae - solubility_aqsoldb: mae - bbb_martins: bce - ppbr_az: mae - vdss_lombardo: mae - cyp2d6_veith: bce - cyp3a4_veith: bce - cyp2c9_veith: bce - cyp2d6_substrate_carbonmangels: bce - cyp3a4_substrate_carbonmangels: bce - cyp2c9_substrate_carbonmangels: bce - half_life_obach: mae - clearance_microsome_az: mae - clearance_hepatocyte_az: mae - herg: bce - ames: bce - dili: bce - ld50_zhu: mae - random_seed: *seed - optim_kwargs: - lr: 4.e-5 # warmup can be scheduled using torch_scheduler_kwargs - torch_scheduler_kwargs: - module_type: WarmUpLinearLR - max_num_epochs: &max_epochs 10 - warmup_epochs: 10 - verbose: False - target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label - multitask_handling: flatten # flatten, mean-per-label - -# Task-specific -metrics: - caco2_wang: ®ression_metrics - - name: mae - metric: mae - target_nan_mask: null - multitask_handling: flatten - threshold_kwargs: null - - name: spearman - metric: spearmanr - threshold_kwargs: null - target_nan_mask: null 
- multitask_handling: mean-per-label - - name: pearson - metric: pearsonr - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2_score - metric: r2 - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - hia_hou: &classification_metrics - - name: auroc - metric: auroc - task: binary - multitask_handling: mean-per-label - threshold_kwargs: null - - name: auprc - metric: average_precision - task: binary - multitask_handling: mean-per-label - threshold_kwargs: null - - name: accuracy - metric: accuracy - multitask_handling: mean-per-label - target_to_int: True - average: micro - threshold_kwargs: &threshold_05 - operator: greater - threshold: 0.5 - th_on_preds: True - th_on_target: True - - name: mcc - metric: mcc - num_classes: 2 - multitask_handling: mean-per-label - target_to_int: True - average: micro - threshold_kwargs: *threshold_05 - pgp_broccatelli: *classification_metrics - bioavailability_ma: *classification_metrics - lipophilicity_astrazeneca: *regression_metrics - solubility_aqsoldb: *regression_metrics - bbb_martins: *classification_metrics - ppbr_az: *regression_metrics - vdss_lombardo: *regression_metrics - cyp2d6_veith: *classification_metrics - cyp3a4_veith: *classification_metrics - cyp2c9_veith: *classification_metrics - cyp2d6_substrate_carbonmangels: *classification_metrics - cyp3a4_substrate_carbonmangels: *classification_metrics - cyp2c9_substrate_carbonmangels: *classification_metrics - half_life_obach: *regression_metrics - clearance_microsome_az: *regression_metrics - clearance_hepatocyte_az: *regression_metrics - herg: *classification_metrics - ames: *classification_metrics - dili: *classification_metrics - ld50_zhu: *regression_metrics -trainer: - seed: *seed - logger: - save_dir: logs/tdc-admet-demo/ - name: *name - project: *name - model_checkpoint: - dirpath: models_checkpoints/tdc-admet-demo/ - filename: *name - save_last: True - trainer: - max_epochs: *max_epochs - min_epochs: 1 - check_val_every_n_epoch: 20 diff --git a/expts/hydra-configs/README.md b/expts/hydra-configs/README.md index 5e189c304..f695ae20c 100644 --- a/expts/hydra-configs/README.md +++ b/expts/hydra-configs/README.md @@ -38,18 +38,32 @@ trainer: We can now utilize `hydra` to e.g., run a sweep over our models on the ToyMix dataset via ```bash -python main_run_multitask.py -m model=gcn,gin +graphium-train -m model=gcn,gin ``` where the ToyMix dataset is pre-configured in `main.yaml`. Read on to find out how to define new datasets and architectures for pre-training and fine-tuning. ## Pre-training / Fine-tuning -From a configuration point-of-view, fine-tuning requires us to load a pre-trained model and attach new task heads. However, in a highly configurable library such as ours changing the task heads also requires changes to the logged metrics, loss functions and the source of the fine-tuning data. To allow a quick switch between pre-training and fine-tuning, by default, we configure models and the corresponding tasks in a separate manner. More specifically, +Say you trained a model with the following command: +```bash +graphium-train --config-name "main" +``` + +Fine-tuning this model on downstream tasks is then as simple as: +```bash +graphium-train --config-name "main" +finetuning=... +``` + +From a configuration point-of-view, fine-tuning requires us to load a pre-trained model and override part of the training configuration to fine-tune it on downstream tasks. 
To allow a quick switch between pre-training and fine-tuning, by default, we configure models and the corresponding tasks in a separate manner. More specifically,
- under `architecture/` we store architecture related configurations such as the definition of the GNN/Transformer layers or positional/structural encoders
- under `tasks/` we store configurations specific to one task set, such as the multi-task dataset ToyMix
+  - under `tasks/task_heads` we specify the task-specific heads to add on top of the base architecture.
+  - under `tasks/loss_metrics_datamodule` we specify the data-module to use and the task-specific loss functions and metrics
- under `training/` we store configurations specific to training models which could be different for each combination of `architecture` and `tasks`
+- under `finetuning/` we store configurations with the overrides applied when fine-tuning a pre-trained model

Since architecture and tasks are logically separated, it now becomes very easy to e.g., use an existing architecture backbone on a new set of tasks or a new dataset altogether. Additionally, separating training allows us to specify different training parameters for e.g., pre-training and fine-tuning of the same architecture and task set.
+
We will now detail how you can add new architectures, tasks and training configurations.

### Adding an architecture
@@ -88,7 +102,7 @@ datamodule:
```
You can then select your new architecture during training, e.g., by running
```bash
-python main_run_multitask.py architecture=my_architecture
+graphium-train architecture=my_architecture
```

### Adding tasks
@@ -125,7 +139,7 @@ predictor:
```
You can then select your new dataset during training, e.g., by running
```bash
-python main_run_multitask.py tasks=my_tasks
+graphium-train tasks=my_tasks
```

### Adding training configs
diff --git a/expts/hydra-configs/__init__.py b/expts/hydra-configs/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/expts/hydra-configs/finetuning/admet.yaml b/expts/hydra-configs/finetuning/admet.yaml
new file mode 100644
index 000000000..80fb20e35
--- /dev/null
+++ b/expts/hydra-configs/finetuning/admet.yaml
@@ -0,0 +1,91 @@
+# @package _global_
+
+# == Fine-tuning configs in Graphium ==
+#
+# A fine-tuning config is an addendum to a (pre-)training config.
+# Since many things (e.g. the architecture) will stay constant between (pre-)training and fine-tuning,
+# this config should be as minimal as possible to avoid unnecessary duplication. It only specifies
+# what to override with regard to the config used for (pre-)training.
+#
+# Given the following training command:
+# >>> graphium-train --cfg /path/to/train.yaml
+#
+# Fine-tuning now is as easy as:
+# >>> graphium-train --cfg /path/to/train.yaml +finetuning=admet
+#
+# NOTE: This config can be used for each of the benchmarks in the TDC ADMET benchmark suite.
+# The only thing that needs to be changed is the `constants.task` key.
+
+
+## == Overrides ==
+
+defaults:
+  # This file contains all metrics and loss function info for all ADMET tasks.
+  # This config is filtered at runtime based on the `constants.task` key.
+  - override /tasks/loss_metrics_datamodule: admet
+
+constants:
+
+  # For now, we assume a model is always fine-tuned on a single task at a time.
+  # You can override this value with any of the benchmark names in the TDC benchmark suite.
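+  # As a purely illustrative sketch (the benchmark name is an arbitrary choice; any name from the
+  # list below works the same way), the task can also be overridden directly from the CLI, e.g.:
+  # >>> graphium-train +finetuning=admet constants.task=caco2_wang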
+ # See also https://tdcommons.ai/benchmark/admet_group/overview/ + task: &task lipophilicity_astrazeneca + + name: finetuning_${constants.task}_gcn + wandb: + name: ${constants.name} + project: *task + entity: multitask-gnn + save_dir: logs/${constants.task} + seed: 42 + max_epochs: 10 + data_dir: expts/data/admet/${constants.task} + raise_train_error: true + +predictor: + optim_kwargs: + lr: 4.e-5 + +# == Fine-tuning config == + +finetuning: + + # For now, we assume a model is always fine-tuned on a single task at a time. + # You can override this value with any of the benchmark names in the TDC benchmark suite. + # See also https://tdcommons.ai/benchmark/admet_group/overview/ + task: ${constants.task} + level: graph + + # Pretrained model + pretrained_model_name: dummy-pretrained-model + finetuning_module: task_heads # gnn + sub_module_from_pretrained: zinc # optional + new_sub_module: lipophilicity_astrazeneca # optional + + # keep_modules_after_finetuning_module: # optional + # graph_output_nn/graph: {} + # task_heads/zinc: + # new_sub_module: lipophilicity_astrazeneca + # out_dim: 1 + + + # Changes to finetuning_module + drop_depth: 1 + new_out_dim: 8 + added_depth: 2 + + # Training + unfreeze_pretrained_depth: 0 + epoch_unfreeze_all: none + + # Optional finetuning head appended to model after finetuning_module + finetuning_head: + task: ${constants.task} + previous_module: task_heads + incoming_level: graph + model_type: mlp + in_dim: 8 + out_dim: 1 + hidden_dims: 8 + depth: 2 + last_layer_is_readout: true diff --git a/expts/hydra-configs/main.yaml b/expts/hydra-configs/main.yaml index d4b3beceb..a57dd22ca 100644 --- a/expts/hydra-configs/main.yaml +++ b/expts/hydra-configs/main.yaml @@ -1,7 +1,7 @@ defaults: # Accelerators - - accelerator: ipu + - accelerator: cpu # Pre-training/fine-tuning - architecture: toymix diff --git a/expts/hydra-configs/tasks/admet.yaml b/expts/hydra-configs/tasks/admet.yaml new file mode 100644 index 000000000..30dec61e0 --- /dev/null +++ b/expts/hydra-configs/tasks/admet.yaml @@ -0,0 +1,7 @@ +# NOTE: We cannot have a single config, since for fine-tuning we will +# only want to override the loss_metrics_datamodule, whereas for training we will +# want to override both. + +defaults: + - task_heads: admet + - loss_metrics_datamodule: admet \ No newline at end of file diff --git a/expts/hydra-configs/tasks/loss_metrics_datamodule/admet.yaml b/expts/hydra-configs/tasks/loss_metrics_datamodule/admet.yaml new file mode 100644 index 000000000..87136b683 --- /dev/null +++ b/expts/hydra-configs/tasks/loss_metrics_datamodule/admet.yaml @@ -0,0 +1,141 @@ +# @package _global_ + +#Task-specific +predictor: + metrics_on_progress_bar: + # All below metrics are directly copied from the TDC website. 
+ # For more information, see https://tdcommons.ai/benchmark/admet_group/overview/ + caco2_wang: ["mae"] + hia_hou: ["auroc"] + pgp_broccatelli: ["auroc"] + bioavailability_ma: ["auroc"] + lipophilicity_astrazeneca: ["mae"] + solubility_aqsoldb: ["mae"] + bbb_martins: ["auroc"] + ppbr_az: ["mae"] + vdss_lombardo: ["spearman"] + cyp2d6_veith: ["auprc"] + cyp3a4_veith: ["auprc"] + cyp2c9_veith: ["auprc"] + cyp2d6_substrate_carbonmangels: ["auprc"] + cyp3a4_substrate_carbonmangels: ["auprc"] + cyp2c9_substrate_carbonmangels: ["auprc"] + half_life_obach: ["spearman"] + clearance_microsome_az: ["spearman"] + clearance_hepatocyte_az: ["spearman"] + herg: ["mae"] + ames: ["auroc"] + dili: ["auroc"] + ld50_zhu: ["auroc"] + loss_fun: + caco2_wang: mae + hia_hou: bce + pgp_broccatelli: bce + bioavailability_ma: bce + lipophilicity_astrazeneca: mae + solubility_aqsoldb: mae + bbb_martins: bce + ppbr_az: mae + vdss_lombardo: mae + cyp2d6_veith: bce + cyp3a4_veith: bce + cyp2c9_veith: bce + cyp2d6_substrate_carbonmangels: bce + cyp3a4_substrate_carbonmangels: bce + cyp2c9_substrate_carbonmangels: bce + half_life_obach: mae + clearance_microsome_az: mae + clearance_hepatocyte_az: mae + herg: bce + ames: bce + dili: bce + ld50_zhu: mae + random_seed: ${constants.seed} + optim_kwargs: + lr: 4.e-5 # warmup can be scheduled using torch_scheduler_kwargs + torch_scheduler_kwargs: + module_type: WarmUpLinearLR + max_num_epochs: &max_epochs 10 + warmup_epochs: 10 + verbose: False + target_nan_mask: null # null: no mask, 0: 0 mask, ignore-flatten, ignore-mean-per-label + multitask_handling: flatten # flatten, mean-per-label + +# Task-specific +metrics: + caco2_wang: ®ression_metrics + - name: mae + metric: mae + target_nan_mask: null + multitask_handling: flatten + threshold_kwargs: null + - name: spearman + metric: spearmanr + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: pearson + metric: pearsonr + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2_score + metric: r2 + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + hia_hou: &classification_metrics + - name: auroc + metric: auroc + task: binary + multitask_handling: mean-per-label + threshold_kwargs: null + - name: auprc + metric: averageprecision + task: binary + multitask_handling: mean-per-label + threshold_kwargs: null + - name: accuracy + metric: accuracy + multitask_handling: mean-per-label + target_to_int: True + average: micro + threshold_kwargs: &threshold_05 + operator: greater + threshold: 0.5 + th_on_preds: True + th_on_target: True + - name: mcc + metric: mcc + num_classes: 2 + multitask_handling: mean-per-label + target_to_int: True + average: micro + threshold_kwargs: *threshold_05 + pgp_broccatelli: *classification_metrics + bioavailability_ma: *classification_metrics + lipophilicity_astrazeneca: *regression_metrics + solubility_aqsoldb: *regression_metrics + bbb_martins: *classification_metrics + ppbr_az: *regression_metrics + vdss_lombardo: *regression_metrics + cyp2d6_veith: *classification_metrics + cyp3a4_veith: *classification_metrics + cyp2c9_veith: *classification_metrics + cyp2d6_substrate_carbonmangels: *classification_metrics + cyp3a4_substrate_carbonmangels: *classification_metrics + cyp2c9_substrate_carbonmangels: *classification_metrics + half_life_obach: *regression_metrics + clearance_microsome_az: *regression_metrics + clearance_hepatocyte_az: *regression_metrics + herg: *classification_metrics + 
ames: *classification_metrics + dili: *classification_metrics + ld50_zhu: *regression_metrics + +datamodule: + module_type: "ADMETBenchmarkDataModule" + args: + # TDC specific + tdc_benchmark_names: null + tdc_train_val_seed: ${constants.seed} \ No newline at end of file diff --git a/expts/hydra-configs/tasks/loss_metrics_datamodule/pcqm4m.yaml b/expts/hydra-configs/tasks/loss_metrics_datamodule/pcqm4m.yaml new file mode 100644 index 000000000..d5b302dd1 --- /dev/null +++ b/expts/hydra-configs/tasks/loss_metrics_datamodule/pcqm4m.yaml @@ -0,0 +1,48 @@ +# @package _global_ + +#Task-specific +predictor: + metrics_on_progress_bar: + homolumo: [] + metrics_on_training_set: + homolumo: ["pearsonr"] + loss_fun: + homolumo: mae_ipu + +# Task-specific +metrics: + homolumo: + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + +datamodule: + module_type: "MultitaskFromSmilesDataModule" + # module_type: "FakeDataModule" # Option to use generated data + args: # Matches that in the test_multitask_datamodule.py case. + task_specific_args: # To be replaced by a new class "DatasetParams" + homolumo: + df: null + task_level: "graph" + df_path: graphium/data/PCQM4M/pcqm4mv2.csv + # wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv + # or set path as https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv directly + smiles_col: "cxsmiles" + label_cols: ["homo_lumo_gap"] + # sample_size: 8000 # use sample_size for test + splits_path: graphium/data/PCQM4M/split_dict_v2.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` + split_names: ["train", "valid", "test-dev"] + # graphium/data/PCQM4Mv2/split_dict.pt + # graphium/data/PCQM4Mv2/pcqm4m_split.csv + # split_val: 0.1 + # split_test: 0.1 + seed: ${constants.seed} + label_normalization: + method: "normal" diff --git a/expts/hydra-configs/tasks/loss_metrics_datamodule/toymix.yaml b/expts/hydra-configs/tasks/loss_metrics_datamodule/toymix.yaml new file mode 100644 index 000000000..9ac744a52 --- /dev/null +++ b/expts/hydra-configs/tasks/loss_metrics_datamodule/toymix.yaml @@ -0,0 +1,102 @@ +# @package _global_ + +predictor: + metrics_on_progress_bar: + qm9: ["mae"] + tox21: ["auroc"] + zinc: ["mae"] + loss_fun: + qm9: mae_ipu + tox21: bce_logits_ipu + zinc: mae_ipu + +metrics: + qm9: &qm9_metrics + - name: mae + metric: mae_ipu + target_nan_mask: null + multitask_handling: flatten + threshold_kwargs: null + - name: pearsonr + metric: pearsonr_ipu + threshold_kwargs: null + target_nan_mask: null + multitask_handling: mean-per-label + - name: r2_score + metric: r2_score_ipu + target_nan_mask: null + multitask_handling: mean-per-label + threshold_kwargs: null + tox21: + - name: auroc + metric: auroc_ipu + task: binary + multitask_handling: mean-per-label + threshold_kwargs: null + - name: avpr + metric: average_precision_ipu + task: binary + multitask_handling: mean-per-label + threshold_kwargs: null + - name: f1 > 0.5 + metric: f1 + multitask_handling: mean-per-label + target_to_int: True + num_classes: 2 + average: micro + threshold_kwargs: &threshold_05 + operator: greater + threshold: 0.5 + th_on_preds: True + th_on_target: True + - name: precision > 0.5 + metric: precision + multitask_handling: mean-per-label + average: micro + 
threshold_kwargs: *threshold_05 + zinc: *qm9_metrics + +datamodule: + args: + task_specific_args: + qm9: + df: null + df_path: ${constants.data_dir}/qm9.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/qm9.csv.gz + # or set path as the URL directly + smiles_col: "smiles" + label_cols: ["A", "B", "C", "mu", "alpha", "homo", "lumo", "gap", "r2", "zpve", "u0", "u298", "h298", "g298", "cv", "u0_atom", "u298_atom", "h298_atom", "g298_atom"] + # sample_size: 2000 # use sample_size for test + splits_path: ${constants.data_dir}/qm9_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/qm9_random_splits.pt` + seed: ${constants.seed} #*seed + task_level: graph + label_normalization: + normalize_val_test: True + method: "normal" + + tox21: + df: null + df_path: ${constants.data_dir}/Tox21-7k-12-labels.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/Tox21-7k-12-labels.csv.gz + # or set path as the URL directly + smiles_col: "smiles" + label_cols: ["NR-AR", "NR-AR-LBD", "NR-AhR", "NR-Aromatase", "NR-ER", "NR-ER-LBD", "NR-PPAR-gamma", "SR-ARE", "SR-ATAD5", "SR-HSE", "SR-MMP", "SR-p53"] + # sample_size: 2000 # use sample_size for test + splits_path: ${constants.data_dir}/Tox21_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/Tox21_random_splits.pt` + seed: ${constants.seed} + task_level: graph + + zinc: + df: null + df_path: ${constants.data_dir}/ZINC12k.csv.gz + # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/ZINC12k.csv.gz + # or set path as the URL directly + smiles_col: "smiles" + label_cols: ["SA", "logp", "score"] + # sample_size: 2000 # use sample_size for test + splits_path: ${constants.data_dir}/ZINC12k_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/ZINC12k_random_splits.pt` + seed: ${constants.seed} + task_level: graph + label_normalization: + normalize_val_test: True + method: "normal" \ No newline at end of file diff --git a/expts/hydra-configs/tasks/pcqm4m.yaml b/expts/hydra-configs/tasks/pcqm4m.yaml index d92d381f7..4bd477dcc 100644 --- a/expts/hydra-configs/tasks/pcqm4m.yaml +++ b/expts/hydra-configs/tasks/pcqm4m.yaml @@ -1,62 +1,7 @@ -# @package _global_ +# NOTE: We cannot have a single config, since for fine-tuning we will +# only want to override the loss_metrics_datamodule, whereas for training we will +# want to override both. -architecture: - task_heads: - homolumo: - task_level: graph - out_dim: 1 - hidden_dims: 256 - depth: 2 # Not needed if we have hidden_dims - activation: relu - last_activation: none - dropout: 0.18 - normalization: layer_norm - last_normalization: "none" - residual_type: none - -#Task-specific -predictor: - metrics_on_progress_bar: - homolumo: [] - metrics_on_training_set: - homolumo: ["pearsonr"] - loss_fun: - homolumo: mae_ipu - -# Task-specific -metrics: - homolumo: - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - -datamodule: - module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data - args: # Matches that in the test_multitask_datamodule.py case. 
- task_specific_args: # To be replaced by a new class "DatasetParams" - homolumo: - df: null - task_level: "graph" - df_path: graphium/data/PCQM4M/pcqm4mv2.csv - # wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv - # or set path as https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv directly - smiles_col: "cxsmiles" - label_cols: ["homo_lumo_gap"] - # sample_size: 8000 # use sample_size for test - splits_path: graphium/data/PCQM4M/split_dict_v2.pt # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt` - split_names: ["train", "valid", "test-dev"] - # graphium/data/PCQM4Mv2/split_dict.pt - # graphium/data/PCQM4Mv2/pcqm4m_split.csv - # split_val: 0.1 - # split_test: 0.1 - seed: ${constants.seed} - label_normalization: - method: "normal" +defaults: + - task_heads: pcqm4m + - loss_metrics_datamodule: pcqm4m \ No newline at end of file diff --git a/expts/hydra-configs/tasks/task_heads/admet.yaml b/expts/hydra-configs/tasks/task_heads/admet.yaml new file mode 100644 index 000000000..2e697b15d --- /dev/null +++ b/expts/hydra-configs/tasks/task_heads/admet.yaml @@ -0,0 +1,47 @@ +# @package _global_ + +architecture: + task_heads: + caco2_wang: ®ression_head + task_level: graph + out_dim: 1 + hidden_dims: 64 + depth: 2 + activation: relu + last_activation: none + dropout: &dropout 0.5 + normalization: &normalization "layer_norm" + last_normalization: "none" + residual_type: none + hia_hou: &classification_head + task_level: graph + out_dim: 1 + hidden_dims: 64 + depth: 2 + activation: relu + last_activation: sigmoid + dropout: *dropout + normalization: *normalization + last_normalization: "none" + residual_type: none + pgp_broccatelli: *classification_head + bioavailability_ma: *classification_head + lipophilicity_astrazeneca: *regression_head + solubility_aqsoldb: *regression_head + bbb_martins: *classification_head + ppbr_az: *regression_head + vdss_lombardo: *regression_head + cyp2d6_veith: *classification_head + cyp3a4_veith: *classification_head + cyp2c9_veith: *classification_head + cyp2d6_substrate_carbonmangels: *classification_head + cyp3a4_substrate_carbonmangels: *classification_head + cyp2c9_substrate_carbonmangels: *classification_head + half_life_obach: *regression_head + clearance_microsome_az: *regression_head + clearance_hepatocyte_az: *regression_head + herg: *classification_head + ames: *classification_head + dili: *classification_head + ld50_zhu: *regression_head + \ No newline at end of file diff --git a/expts/hydra-configs/tasks/task_heads/pcqm4m.yaml b/expts/hydra-configs/tasks/task_heads/pcqm4m.yaml new file mode 100644 index 000000000..b45ee9e62 --- /dev/null +++ b/expts/hydra-configs/tasks/task_heads/pcqm4m.yaml @@ -0,0 +1,15 @@ +# @package _global_ + +architecture: + task_heads: + homolumo: + task_level: graph + out_dim: 1 + hidden_dims: 256 + depth: 2 # Not needed if we have hidden_dims + activation: relu + last_activation: none + dropout: 0.18 + normalization: layer_norm + last_normalization: "none" + residual_type: none diff --git a/expts/hydra-configs/tasks/task_heads/toymix.yaml b/expts/hydra-configs/tasks/task_heads/toymix.yaml new file mode 100644 index 000000000..c1df2522e --- /dev/null +++ b/expts/hydra-configs/tasks/task_heads/toymix.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +architecture: + task_heads: + qm9: + task_level: graph + out_dim: 19 + hidden_dims: 128 + depth: 2 + activation: relu + last_activation: none + dropout: 
${architecture.pre_nn.dropout} + normalization: ${architecture.pre_nn.normalization} + last_normalization: "none" + residual_type: none + tox21: + task_level: graph + out_dim: 12 + hidden_dims: 64 + depth: 2 + activation: relu + last_activation: none + dropout: ${architecture.pre_nn.dropout} + normalization: ${architecture.pre_nn.normalization} + last_normalization: "none" + residual_type: none + zinc: + task_level: graph + out_dim: 3 + hidden_dims: 32 + depth: 2 + activation: relu + last_activation: none + dropout: ${architecture.pre_nn.dropout} + normalization: ${architecture.pre_nn.normalization} + last_normalization: "none" + residual_type: none diff --git a/expts/hydra-configs/tasks/toymix.yaml b/expts/hydra-configs/tasks/toymix.yaml index e120c13a8..16d582982 100644 --- a/expts/hydra-configs/tasks/toymix.yaml +++ b/expts/hydra-configs/tasks/toymix.yaml @@ -1,138 +1,7 @@ -# @package _global_ +# NOTE: We cannot have a single config, since for fine-tuning we will +# only want to override the loss_metrics_datamodule, whereas for training we will +# want to override both. -architecture: - task_heads: - qm9: - task_level: graph - out_dim: 19 - hidden_dims: 128 - depth: 2 - activation: relu - last_activation: none - dropout: ${architecture.pre_nn.dropout} - normalization: ${architecture.pre_nn.normalization} - last_normalization: "none" - residual_type: none - tox21: - task_level: graph - out_dim: 12 - hidden_dims: 64 - depth: 2 - activation: relu - last_activation: none - dropout: ${architecture.pre_nn.dropout} - normalization: ${architecture.pre_nn.normalization} - last_normalization: "none" - residual_type: none - zinc: - task_level: graph - out_dim: 3 - hidden_dims: 32 - depth: 2 - activation: relu - last_activation: none - dropout: ${architecture.pre_nn.dropout} - normalization: ${architecture.pre_nn.normalization} - last_normalization: "none" - residual_type: none - -predictor: - metrics_on_progress_bar: - qm9: ["mae"] - tox21: ["auroc"] - zinc: ["mae"] - loss_fun: - qm9: mae_ipu - tox21: bce_logits_ipu - zinc: mae_ipu - -metrics: - qm9: &qm9_metrics - - name: mae - metric: mae_ipu - target_nan_mask: null - multitask_handling: flatten - threshold_kwargs: null - - name: pearsonr - metric: pearsonr_ipu - threshold_kwargs: null - target_nan_mask: null - multitask_handling: mean-per-label - - name: r2_score - metric: r2_score_ipu - target_nan_mask: null - multitask_handling: mean-per-label - threshold_kwargs: null - tox21: - - name: auroc - metric: auroc_ipu - task: binary - multitask_handling: mean-per-label - threshold_kwargs: null - - name: avpr - metric: average_precision_ipu - task: binary - multitask_handling: mean-per-label - threshold_kwargs: null - - name: f1 > 0.5 - metric: f1 - multitask_handling: mean-per-label - target_to_int: True - num_classes: 2 - average: micro - threshold_kwargs: &threshold_05 - operator: greater - threshold: 0.5 - th_on_preds: True - th_on_target: True - - name: precision > 0.5 - metric: precision - multitask_handling: mean-per-label - average: micro - threshold_kwargs: *threshold_05 - zinc: *qm9_metrics - -datamodule: - args: - task_specific_args: - qm9: - df: null - df_path: ${constants.data_dir}/qm9.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/qm9.csv.gz - # or set path as the URL directly - smiles_col: "smiles" - label_cols: ["A", "B", "C", "mu", "alpha", "homo", "lumo", "gap", "r2", "zpve", "u0", "u298", "h298", "g298", "cv", "u0_atom", "u298_atom", "h298_atom", "g298_atom"] - # sample_size: 
2000 # use sample_size for test - splits_path: ${constants.data_dir}/qm9_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/qm9_random_splits.pt` - seed: ${constants.seed} #*seed - task_level: graph - label_normalization: - normalize_val_test: True - method: "normal" - - tox21: - df: null - df_path: ${constants.data_dir}/Tox21-7k-12-labels.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/Tox21-7k-12-labels.csv.gz - # or set path as the URL directly - smiles_col: "smiles" - label_cols: ["NR-AR", "NR-AR-LBD", "NR-AhR", "NR-Aromatase", "NR-ER", "NR-ER-LBD", "NR-PPAR-gamma", "SR-ARE", "SR-ATAD5", "SR-HSE", "SR-MMP", "SR-p53"] - # sample_size: 2000 # use sample_size for test - splits_path: ${constants.data_dir}/Tox21_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/Tox21_random_splits.pt` - seed: ${constants.seed} - task_level: graph - - zinc: - df: null - df_path: ${constants.data_dir}/ZINC12k.csv.gz - # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/ZINC12k.csv.gz - # or set path as the URL directly - smiles_col: "smiles" - label_cols: ["SA", "logp", "score"] - # sample_size: 2000 # use sample_size for test - splits_path: ${constants.data_dir}/ZINC12k_random_splits.pt # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/ZINC12k_random_splits.pt` - seed: ${constants.seed} - task_level: graph - label_normalization: - normalize_val_test: True - method: "normal" \ No newline at end of file +defaults: + - task_heads: toymix + - loss_metrics_datamodule: toymix \ No newline at end of file diff --git a/expts/hydra-configs/training/accelerator/toymix_cpu.yaml b/expts/hydra-configs/training/accelerator/toymix_cpu.yaml index eb12c8935..9022eeb84 100644 --- a/expts/hydra-configs/training/accelerator/toymix_cpu.yaml +++ b/expts/hydra-configs/training/accelerator/toymix_cpu.yaml @@ -11,10 +11,10 @@ predictor: optim_kwargs: {} metrics_every_n_train_steps: 300 torch_scheduler_kwargs: - max_num_epochs: &max_epochs 300 + max_num_epochs: ${constants.max_epochs} trainer: trainer: precision: 32 accumulate_grad_batches: 1 - max_epochs: *max_epochs \ No newline at end of file + max_epochs: ${constants.max_epochs} \ No newline at end of file diff --git a/expts/hydra-configs/training/accelerator/toymix_gpu.yaml b/expts/hydra-configs/training/accelerator/toymix_gpu.yaml index 3712373c3..c2c8e4066 100644 --- a/expts/hydra-configs/training/accelerator/toymix_gpu.yaml +++ b/expts/hydra-configs/training/accelerator/toymix_gpu.yaml @@ -14,9 +14,9 @@ predictor: optim_kwargs: {} metrics_every_n_train_steps: 300 torch_scheduler_kwargs: - max_num_epochs: &max_epochs 300 + max_num_epochs: ${constants.max_epochs} trainer: trainer: accumulate_grad_batches: 1 - max_epochs: *max_epochs \ No newline at end of file + max_epochs: ${constants.max_epochs} \ No newline at end of file diff --git a/expts/hydra-configs/training/model/pcqm4m_gpspp.yaml b/expts/hydra-configs/training/model/pcqm4m_gpspp.yaml index e13c44aa0..7fb1e1ee5 100644 --- a/expts/hydra-configs/training/model/pcqm4m_gpspp.yaml +++ b/expts/hydra-configs/training/model/pcqm4m_gpspp.yaml @@ -3,7 +3,6 @@ # GPS++ model with the PCQMv2 dataset. 
 constants:
   name: pcqm4mv2_gpspp_4layer
-  entity: "multitask-gnn"
   seed: 42
   max_epochs: 100
   raise_train_error: true # Whether the code should raise an error if it crashes during training
diff --git a/expts/hydra-configs/training/model/pcqm4m_mpnn.yaml b/expts/hydra-configs/training/model/pcqm4m_mpnn.yaml
index 41b55eba1..ca643fe39 100644
--- a/expts/hydra-configs/training/model/pcqm4m_mpnn.yaml
+++ b/expts/hydra-configs/training/model/pcqm4m_mpnn.yaml
@@ -3,7 +3,6 @@
 # MPNN model with the PCQMv2 dataset.
 constants:
   name: pcqm4mv2_mpnn_4layer
-  entity: "multitask-gnn"
   seed: 42
   max_epochs: 100
   raise_train_error: true # Whether the code should raise an error if it crashes during training
diff --git a/expts/hydra-configs/training/model/toymix_gcn.yaml b/expts/hydra-configs/training/model/toymix_gcn.yaml
index f422e37fc..48eabe003 100644
--- a/expts/hydra-configs/training/model/toymix_gcn.yaml
+++ b/expts/hydra-configs/training/model/toymix_gcn.yaml
@@ -2,7 +2,6 @@
 constants:
   name: neurips2023_small_data_gcn
-  entity: "multitask-gnn"
   seed: 42
   max_epochs: 100
   data_dir: expts/data/neurips2023/small-dataset
diff --git a/expts/hydra-configs/training/model/toymix_gin.yaml b/expts/hydra-configs/training/model/toymix_gin.yaml
index 605671c68..ed2885efb 100644
--- a/expts/hydra-configs/training/model/toymix_gin.yaml
+++ b/expts/hydra-configs/training/model/toymix_gin.yaml
@@ -2,7 +2,6 @@
 constants:
   name: neurips2023_small_data_gin
-  entity: "multitask-gnn"
   seed: 42
   data_dir: expts/data/neurips2023/small-dataset
   raise_train_error: true
diff --git a/expts/hydra-configs/training/pcqm4m.yaml b/expts/hydra-configs/training/pcqm4m.yaml
index 910c78c67..871a2a5f1 100644
--- a/expts/hydra-configs/training/pcqm4m.yaml
+++ b/expts/hydra-configs/training/pcqm4m.yaml
@@ -7,7 +7,7 @@ predictor:
     # weight_decay: 1.e-7
   torch_scheduler_kwargs:
     module_type: WarmUpLinearLR
-    max_num_epochs: &max_epochs 100
+    max_num_epochs: ${constants.max_epochs}
     warmup_epochs: 10
     verbose: False
   scheduler_kwargs:
@@ -22,10 +22,6 @@ predictor:
 trainer:
   seed: ${constants.seed}
-  logger:
-    save_dir: logs/PCQMv2
-    name: ${constants.name}
-    project: PCQMv2_mpnn
   #early_stopping:
   #  monitor: *monitor
   #  min_delta: 0
@@ -39,6 +35,6 @@ trainer:
     save_top_k: 1
     every_n_epochs: 100
   trainer:
-    max_epochs: *max_epochs
+    max_epochs: ${constants.max_epochs}
     min_epochs: 1
     check_val_every_n_epoch: 20
diff --git a/expts/hydra-configs/training/toymix.yaml b/expts/hydra-configs/training/toymix.yaml
index 05d7c4715..4afcbd56a 100644
--- a/expts/hydra-configs/training/toymix.yaml
+++ b/expts/hydra-configs/training/toymix.yaml
@@ -7,7 +7,7 @@ predictor:
     # weight_decay: 1.e-7
   torch_scheduler_kwargs:
     module_type: WarmUpLinearLR
-    max_num_epochs: &max_epochs 100
+    max_num_epochs: ${constants.max_epochs}
     warmup_epochs: 10
     verbose: False
   scheduler_kwargs: null
@@ -16,15 +16,11 @@ predictor:
 trainer:
   seed: ${constants.seed}
-  logger:
-    save_dir: logs/neurips2023-small/
-    name: ${constants.name}
-    project: ${constants.name}
   model_checkpoint:
     filename: ${constants.name}
     save_last: True
   trainer:
     precision: 16
-    max_epochs: *max_epochs
+    max_epochs: ${constants.max_epochs}
     min_epochs: 1
     check_val_every_n_epoch: 20
\ No newline at end of file
diff --git a/expts/main_run_multitask.py b/expts/main_run_multitask.py
index c14670377..c68663a08 100644
--- a/expts/main_run_multitask.py
+++ b/expts/main_run_multitask.py
@@ -32,67 +32,10 @@
 @hydra.main(version_base=None, config_path="hydra-configs", config_name="main")
 def main(cfg: DictConfig) -> None:
-    cfg = OmegaConf.to_container(cfg, resolve=True)
-
-    run_name: str = "main"
-    add_date_time: bool = True
-
-    st = timeit.default_timer()
-
-    date_time_suffix = ""
-    if add_date_time:
-        date_time_suffix = datetime.now().strftime("%d.%m.%Y_%H.%M.%S")
-
-    wandb.init(entity=cfg["constants"]["entity"], project=cfg["constants"]["name"], config=cfg)
-
-    # Initialize the accelerator
-    cfg, accelerator_type = load_accelerator(cfg)
-
-    # Load and initialize the dataset
-    datamodule = load_datamodule(cfg, accelerator_type)
-
-    # Initialize the network
-    model_class, model_kwargs = load_architecture(
-        cfg,
-        in_dims=datamodule.in_dims,
+    raise DeprecationWarning(
+        "This script is deprecated. Use `python graphium/cli/train_finetune.py` (or `graphium-train`) instead!"
+    )
-    datamodule.prepare_data()
-
-    metrics = load_metrics(cfg)
-    logger.info(metrics)
-
-    predictor = load_predictor(
-        cfg, model_class, model_kwargs, metrics, accelerator_type, datamodule.task_norms
-    )
-
-    logger.info(predictor.model)
-    logger.info(ModelSummary(predictor, max_depth=4))
-
-    trainer = load_trainer(cfg, run_name, accelerator_type, date_time_suffix)
-    save_params_to_wandb(trainer.logger, cfg, predictor, datamodule)
-
-    # Determine the max num nodes and edges in training and validation
-    predictor.set_max_nodes_edges_per_graph(datamodule, stages=["train", "val"])
-
-    # Run the model training
-    with SafeRun(name="TRAINING", raise_error=cfg["constants"]["raise_train_error"], verbose=True):
-        trainer.fit(model=predictor, datamodule=datamodule)
-
-    # Determine the max num nodes and edges in testing
-    predictor.set_max_nodes_edges_per_graph(datamodule, stages=["test"])
-
-    # Run the model testing
-    with SafeRun(name="TESTING", raise_error=cfg["constants"]["raise_train_error"], verbose=True):
-        trainer.test(model=predictor, datamodule=datamodule) # , ckpt_path=ckpt_path)
-
-    logger.info("--------------------------------------------")
-    logger.info("total computation used", timeit.default_timer() - st)
-    logger.info("--------------------------------------------")
-    wandb.finish()
-
-    return trainer.callback_metrics
-
 if __name__ == "__main__":
     main()
diff --git a/graphium/cli/README.md b/graphium/cli/README.md
deleted file mode 100644
index eb744ac7e..000000000
--- a/graphium/cli/README.md
+++ /dev/null
@@ -1,9 +0,0 @@
-