Skip to content

Commit

Permalink
Merge branch 'graphium_3.0' into torchmetrics
Browse files Browse the repository at this point in the history
  • Loading branch information
DomInvivo authored Jul 13, 2024
2 parents 10a1017 + 7f933b7 commit 8aa0f2b
Show file tree
Hide file tree
Showing 107 changed files with 7,079 additions and 4,926 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.8", "3.9", "3.10"]
python-version: ["3.9", "3.10", "3.11"]
pytorch-version: ["2.0"]

runs-on: "ubuntu-latest"
Expand Down Expand Up @@ -52,6 +52,9 @@ jobs:
- name: Install test dependencies
run: micromamba install -c conda-forge pytdc # Required to run the `test_finetuning.py`

- name: Install C++ library
run: cd graphium/graphium_cpp && git clone https://github.com/pybind/pybind11.git && export PYTHONPATH=$PYTHONPATH:./pybind11 && python -m pip install . && cd ../..

- name: Run tests
run: pytest -m 'not ipu'

Expand Down
1 change: 1 addition & 0 deletions LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,7 @@
Copyright 2023 Valence Labs
Copyright 2023 Recursion Pharmaceuticals
Copyright 2023 Graphcore Limited
Copyright 2024 NVIDIA CORPORATION & AFFILIATES

Various Academic groups have also contributed to this software under
the given license. These include, but are not limited to, the following
Expand Down
29 changes: 0 additions & 29 deletions docs/api/graphium.features.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,37 +5,8 @@ Feature extraction and manipulation
=== "Contents"

* [Featurizer](#featurizer)
* [Positional Encoding](#positional-encoding)
* [Properties](#properties)
* [Spectral PE](#spectral-pe)
* [Random Walk PE](#random-walk-pe)
* [NMP](#nmp)

## Featurizer
------------
::: graphium.features.featurizer


## Positional Encoding
------------
::: graphium.features.positional_encoding


## Properties
------------
::: graphium.features.properties


## Spectral PE
------------
::: graphium.features.spectral


## Random Walk PE
------------
::: graphium.features.rw


## NMP
------------
::: graphium.features.nmp
3 changes: 2 additions & 1 deletion env.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ dependencies:
- gcsfs >=2021.6

# ML packages
- cuda-version # works also with CPU-only system.
- cuda-version == 11.2 # works also with CPU-only system.
- pytorch >=1.12
- lightning >=2.0
- torchmetrics
Expand All @@ -43,6 +43,7 @@ dependencies:
# chemistry
- rdkit
- datamol >=0.10
- boost # needed by rdkit

# Optional deps
- sympy
Expand Down
6 changes: 0 additions & 6 deletions expts/configs/config_gps_10M_pcqm4m.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@ accelerator:

datamodule:
module_type: "MultitaskFromSmilesDataModule"
# module_type: "FakeDataModule" # Option to use generated data
args: # Matches that in the test_multitask_datamodule.py case.
task_specific_args: # To be replaced by a new class "DatasetParams"
homolumo:
Expand All @@ -76,10 +75,6 @@ datamodule:
split_test: 0.1

# Featurization
prepare_dict_or_graph: pyg:graph
featurization_n_jobs: 30
featurization_progress: True
featurization_backend: "loky"
featurization:
# OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence),
# 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring',
Expand Down Expand Up @@ -115,7 +110,6 @@ datamodule:
num_workers: 0 # -1 to use all
persistent_workers: False # if use persistent worker at the start of each epoch.
# Using persistent_workers false might make the start of each epoch very long.
featurization_backend: "loky"


architecture:
Expand Down
6 changes: 0 additions & 6 deletions expts/configs/config_gps_10M_pcqm4m_mod.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ constants:

datamodule:
module_type: "MultitaskFromSmilesDataModule"
# module_type: "FakeDataModule" # Option to use generated data
args: # Matches that in the test_multitask_datamodule.py case.
task_specific_args: # To be replaced by a new class "DatasetParams"
homolumo:
Expand All @@ -25,10 +24,6 @@ datamodule:
split_test: 0.1

# Featurization
prepare_dict_or_graph: pyg:graph
featurization_n_jobs: 30
featurization_progress: True
featurization_backend: "loky"
featurization:
# OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence),
# 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring',
Expand Down Expand Up @@ -84,7 +79,6 @@ datamodule:
num_workers: 0 # -1 to use all
persistent_workers: False # if use persistent worker at the start of each epoch.
# Using persistent_workers false might make the start of each epoch very long.
featurization_backend: "loky"

# ipu_dataloader_training_opts:
# mode: async
Expand Down
7 changes: 0 additions & 7 deletions expts/configs/config_mpnn_10M_b3lyp.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@ accelerator:

datamodule:
module_type: "MultitaskFromSmilesDataModule"
# module_type: "FakeDataModule" # Option to use generated data
args: # Matches that in the test_multitask_datamodule.py case.
task_specific_args: # To be replaced by a new class "DatasetParams"
betagap:
Expand Down Expand Up @@ -88,12 +87,7 @@ datamodule:
split_test: 0.1

# Featurization
prepare_dict_or_graph: pyg:graph
featurization_n_jobs: 30
featurization_progress: True
featurization_backend: "loky"
processed_graph_data_path: "../datacache/b3lyp/"
dataloading_from: ram
featurization:
# OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence),
# 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring',
Expand Down Expand Up @@ -127,7 +121,6 @@ datamodule:
num_workers: 0 # -1 to use all
persistent_workers: False # if use persistent worker at the start of each epoch.
# Using persistent_workers false might make the start of each epoch very long.
featurization_backend: "loky"


architecture:
Expand Down
7 changes: 0 additions & 7 deletions expts/configs/config_mpnn_pcqm4m.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ constants:

datamodule:
module_type: "MultitaskFromSmilesDataModule"
# module_type: "FakeDataModule" # Option to use generated data
args: # Matches that in the test_multitask_datamodule.py case.
task_specific_args: # To be replaced by a new class "DatasetParams"
homolumo:
Expand All @@ -26,12 +25,7 @@ datamodule:
split_names: ["train", "valid", "test-dev"]

# Featurization
prepare_dict_or_graph: pyg:graph
featurization_n_jobs: 20
featurization_progress: True
featurization_backend: "loky"
processed_graph_data_path: "graphium/data/PCQM4Mv2/"
dataloading_from: ram
featurization:
# OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence),
# 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring',
Expand Down Expand Up @@ -61,7 +55,6 @@ datamodule:
num_workers: 40 # -1 to use all
persistent_workers: False # if use persistent worker at the start of each epoch.
# Using persistent_workers false might make the start of each epoch very long.
featurization_backend: "loky"

# ipu_dataloader_training_opts:
# mode: async
Expand Down
5 changes: 0 additions & 5 deletions expts/hydra-configs/architecture/largemix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,12 +83,7 @@ architecture:
datamodule:
module_type: "MultitaskFromSmilesDataModule"
args:
prepare_dict_or_graph: pyg:graph
featurization_n_jobs: 20
featurization_progress: True
featurization_backend: "loky"
processed_graph_data_path: ${constants.datacache_path}
dataloading_from: "disk"
num_workers: 20 # -1 to use all
persistent_workers: True
featurization:
Expand Down
5 changes: 0 additions & 5 deletions expts/hydra-configs/architecture/pcqm4m.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,8 @@ architecture:

datamodule:
module_type: "MultitaskFromSmilesDataModule"
# module_type: "FakeDataModule" # Option to use generated data
args: # Matches that in the test_multitask_datamodule.py case.
# Featurization
prepare_dict_or_graph: pyg:graph
featurization_n_jobs: 30
featurization_progress: True
featurization_backend: "loky"
processed_graph_data_path: ${constants.datacache_path}
num_workers: 40 # -1 to use all
persistent_workers: False # if use persistent worker at the start of each epoch.
Expand Down
5 changes: 0 additions & 5 deletions expts/hydra-configs/architecture/toymix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -74,12 +74,7 @@ architecture:
datamodule:
module_type: "MultitaskFromSmilesDataModule"
args:
prepare_dict_or_graph: pyg:graph
featurization_n_jobs: 30
featurization_progress: True
featurization_backend: "loky"
processed_graph_data_path: ${constants.datacache_path}
dataloading_from: ram
num_workers: 30 # -1 to use all
persistent_workers: False
featurization:
Expand Down
1 change: 0 additions & 1 deletion expts/hydra-configs/finetuning/admet_baseline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ constants:
datamodule:
args:
batch_size_training: 32
dataloading_from: ram
persistent_workers: true
num_workers: 4

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ metrics:

datamodule:
module_type: "MultitaskFromSmilesDataModule"
# module_type: "FakeDataModule" # Option to use generated data
args: # Matches that in the test_multitask_datamodule.py case.
task_specific_args: # To be replaced by a new class "DatasetParams"
homolumo:
Expand Down
1 change: 0 additions & 1 deletion expts/hydra-configs/training/accelerator/largemix_cpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ datamodule:
args:
batch_size_training: 200
batch_size_inference: 200
featurization_n_jobs: 20
num_workers: 20

predictor:
Expand Down
1 change: 0 additions & 1 deletion expts/hydra-configs/training/accelerator/largemix_gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ datamodule:
args:
batch_size_training: 2048
batch_size_inference: 2048
featurization_n_jobs: 6
num_workers: 6

predictor:
Expand Down
1 change: 0 additions & 1 deletion expts/hydra-configs/training/accelerator/toymix_cpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ datamodule:
args:
batch_size_training: 200
batch_size_inference: 200
featurization_n_jobs: 4
num_workers: 4

predictor:
Expand Down
1 change: 0 additions & 1 deletion expts/hydra-configs/training/accelerator/toymix_gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ datamodule:
args:
batch_size_training: 200
batch_size_inference: 200
featurization_n_jobs: 4
num_workers: 4

predictor:
Expand Down
6 changes: 0 additions & 6 deletions expts/neurips2023_configs/base_config/large.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ accelerator:

datamodule:
module_type: "MultitaskFromSmilesDataModule"
# module_type: "FakeDataModule" # Option to use generated data
args: # Matches that in the test_multitask_datamodule.py case.
task_specific_args: # To be replaced by a new class "DatasetParams"
l1000_vcap:
Expand Down Expand Up @@ -133,11 +132,6 @@ datamodule:
epoch_sampling_fraction: 1.0

# Featurization
prepare_dict_or_graph: pyg:graph
featurization_n_jobs: 30
featurization_progress: True
featurization_backend: "loky"
dataloading_from: disk
processed_graph_data_path: ${constants.datacache_path}
featurization:
# OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence),
Expand Down
6 changes: 0 additions & 6 deletions expts/neurips2023_configs/base_config/large_pcba.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ accelerator:

datamodule:
module_type: "MultitaskFromSmilesDataModule"
# module_type: "FakeDataModule" # Option to use generated data
args: # Matches that in the test_multitask_datamodule.py case.
task_specific_args: # To be replaced by a new class "DatasetParams"

Expand Down Expand Up @@ -132,11 +131,6 @@ datamodule:
#epoch_sampling_fraction: 1.0

# Featurization
prepare_dict_or_graph: pyg:graph
featurization_n_jobs: 30
featurization_progress: True
featurization_backend: "loky"
dataloading_from: disk
processed_graph_data_path: ${constants.datacache_path}
featurization:
# OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence),
Expand Down
6 changes: 0 additions & 6 deletions expts/neurips2023_configs/base_config/large_pcqm_g25.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ accelerator:

datamodule:
module_type: "MultitaskFromSmilesDataModule"
# module_type: "FakeDataModule" # Option to use generated data
args: # Matches that in the test_multitask_datamodule.py case.
task_specific_args: # To be replaced by a new class "DatasetParams"

Expand Down Expand Up @@ -132,11 +131,6 @@ datamodule:
# epoch_sampling_fraction: 1.0

# Featurization
prepare_dict_or_graph: pyg:graph
featurization_n_jobs: 30
featurization_progress: True
featurization_backend: "loky"
dataloading_from: disk
processed_graph_data_path: ${constants.datacache_path}
featurization:
# OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence),
Expand Down
6 changes: 0 additions & 6 deletions expts/neurips2023_configs/base_config/large_pcqm_n4.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ accelerator:

datamodule:
module_type: "MultitaskFromSmilesDataModule"
# module_type: "FakeDataModule" # Option to use generated data
args: # Matches that in the test_multitask_datamodule.py case.
task_specific_args: # To be replaced by a new class "DatasetParams"

Expand Down Expand Up @@ -132,11 +131,6 @@ datamodule:
epoch_sampling_fraction: 1.0

# Featurization
prepare_dict_or_graph: pyg:graph
featurization_n_jobs: 30
featurization_progress: True
featurization_backend: "loky"
dataloading_from: disk
processed_graph_data_path: ${constants.datacache_path}
featurization:
# OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence),
Expand Down
5 changes: 0 additions & 5 deletions expts/neurips2023_configs/base_config/small.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ accelerator:

datamodule:
module_type: "MultitaskFromSmilesDataModule"
# module_type: "FakeDataModule" # Option to use generated data
args: # Matches that in the test_multitask_datamodule.py case.
task_specific_args: # To be replaced by a new class "DatasetParams"
qm9:
Expand Down Expand Up @@ -97,10 +96,6 @@ datamodule:
method: "normal"

# Featurization
prepare_dict_or_graph: pyg:graph
featurization_n_jobs: 30
featurization_progress: True
featurization_backend: "loky"
processed_graph_data_path: "../datacache/neurips2023-small/"
featurization:
# OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence),
Expand Down
Loading

0 comments on commit 8aa0f2b

Please sign in to comment.