Merge branch 'fix_shuffle' into 'dev'

Make shuffling optional. Closes #60. See merge request cdd/QSPRpred!98
CDDLeiden · Jul 6, 2023 · 84423be · 84423be
2 parents 0477788 + c44ba21
commit 84423be
Show file tree

Hide file tree

Showing 4 changed files with 58 additions and 42 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,7 @@ From v2.0.0 to v2.0.1
 
 - Required Python version in pyproject.toml updated to 3.10, as older versions of Python don't support the type hinting used in the code.
 - Corrected type hinting for `QSPRModel.handleInvalidsInPredictions`, which resulted in an error when importing the package in google colab.
+- The `predictMols` method returned random predictions in v2.0.0 due to unpatched shuffling code. This has now been fixed.
 
 ## Changes
 

diff --git a/qsprpred/data/data.py b/qsprpred/data/data.py
@@ -9,7 +9,7 @@
 import warnings
 from collections.abc import Callable
 from multiprocessing import Pool
-from typing import Literal
+from typing import Literal, Optional
 
 import numpy as np
 import pandas as pd
@@ -61,8 +61,8 @@ class ParallelApplyWrapper:
         def __init__(
             self,
             func: Callable,
-            func_args: list = None,
-            func_kwargs: dict = None,
+            func_args: Optional[list] = None,
+            func_kwargs: Optional[dict] = None,
             axis: int = 0,
             raw: bool = False,
             result_type: str = "expand"
@@ -108,10 +108,10 @@ def __call__(self, data: pd.DataFrame):
     def __init__(
         self,
         name: str,
-        df: pd.DataFrame = None,
+        df: Optional[pd.DataFrame] = None,
         store_dir: int = ".",
         overwrite: bool = False,
-        index_cols: list[str] = None,
+        index_cols: Optional[list[str]] = None,
         n_jobs: int = 1,
         chunk_size: int = 1000,
         id_prefix: str = "QSPRID",
@@ -257,12 +257,12 @@ def getSubset(self, prefix: str):
     def apply(
         self,
         func: Callable,
-        func_args: list = None,
-        func_kwargs: dict = None,
+        func_args: Optional[list] = None,
+        func_kwargs: Optional[dict] = None,
         axis: int = 0,
         raw: bool = False,
         result_type: str = "expand",
-        subset: list = None,
+        subset: Optional[list] = None,
     ):
         """Apply a function to the data frame.
 
@@ -305,12 +305,12 @@ def apply(
     def papply(
         self,
         func: Callable,
-        func_args: list = None,
-        func_kwargs: dict = None,
+        func_args: Optional[list] = None,
+        func_kwargs: Optional[dict] = None,
         axis: int = 0,
         raw: bool = False,
         result_type: str = "expand",
-        subset: list = None,
+        subset: Optional[list] = None,
         n_cpus: int = 1,
         chunk_size: int = 1000,
     ):
@@ -358,7 +358,7 @@ def papply(
 
         return pd.concat(results, axis=0)
 
-    def transform(self, targets: list, transformer: Callable, addAs: list = None):
+    def transform(self, targets: list, transformer: Callable, addAs: Optional[list] = None):
         """Transform the data frame (or its part) using a list of transformers.
 
         Each transformer is a function that takes the data frame (or a subset of it as
@@ -450,10 +450,10 @@ def __init__(
         self,
         calculator,
         name_prefix: str,
-        df: pd.DataFrame = None,
+        df: Optional[pd.DataFrame] = None,
         store_dir: str = ".",
         overwrite: bool = False,
-        key_cols: list = None,
+        key_cols: Optional[list] = None,
         n_jobs: int = 1,
         chunk_size: int = 1000
     ):
@@ -510,15 +510,15 @@ class MoleculeTable(PandasDataSet, MoleculeDataSet):
     def __init__(
         self,
         name: str,
-        df: pd.DataFrame = None,
+        df: Optional[pd.DataFrame] = None,
         smiles_col: str = "SMILES",
         add_rdkit: bool = False,
         store_dir: str = ".",
         overwrite: bool = False,
         n_jobs: int = 1,
         chunk_size: int = 50,
         drop_invalids: bool = True,
-        index_cols: list[str] = None,
+        index_cols: Optional[list[str]] = None,
     ):
         """Initialize a `MoleculeTable` object.
 
@@ -881,7 +881,7 @@ def getDescriptors(self):
         ret.drop(columns=join_cols, inplace=True)
         return ret
 
-    def getDescriptorNames(self, prefix: str = None):
+    def getDescriptorNames(self, prefix: Optional[str] = None):
         """Get the names of the descriptors in the data frame.
 
         Args:
@@ -1149,10 +1149,10 @@ def __init__(
         name: str,
         task: Literal[TargetTasks.REGRESSION, TargetTasks.SINGLECLASS,
                       TargetTasks.MULTICLASS],
-        original_name: str = None,
-        th: list[float] | str = None,
-        n_classes: int = None,
-        transformer: Callable = None,
+        original_name: Optional[str] = None,
+        th: Optional[list[float] | str] = None,
+        n_classes: Optional[int] = None,
+        transformer: Optional[Callable] = None,
     ):
         """Initialize a TargetProperty object.
 
@@ -1382,7 +1382,7 @@ def __init__(
         self,
         name: str,
         target_props: list[TargetProperty | dict],
-        df: pd.DataFrame = None,
+        df: Optional[pd.DataFrame] = None,
         smiles_col: str = "SMILES",
         add_rdkit: bool = False,
         store_dir: str = ".",
@@ -1391,8 +1391,8 @@ def __init__(
         chunk_size: int = 50,
         drop_invalids: bool = True,
         drop_empty: bool = True,
-        target_imputer: Callable = None,
-        index_cols: list[str] = None,
+        target_imputer: Optional[Callable] = None,
+        index_cols: Optional[list[str]] = None,
     ):
         """Construct QSPRdata, also apply transformations of output property if
         specified.
@@ -1501,7 +1501,7 @@ def setTargetProperties(
         self,
         target_props: list[TargetProperty],
         drop_empty: bool = True,
-        target_imputer: Callable = None,
+        target_imputer: Optional[Callable] = None,
     ):
         """Set list of target properties and apply transformations if specified.
 
@@ -1623,7 +1623,7 @@ def makeRegression(self, target_property: TargetProperty | str):
         self.restoreTrainingData()
 
     def makeClassification(
-        self, target_property: TargetProperty | str, th: list[float] = None
+        self, target_property: TargetProperty | str, th: Optional[list[float]] = None
     ):
         """Switch to classification task using the given threshold values.
 
@@ -1930,11 +1930,17 @@ def loadDataToSplits(self):
             self.X_ind = self.X.drop(self.X.index)
             self.y_ind = self.y.drop(self.y.index)
 
-    def loadDescriptorsToSplits(self):
+    def loadDescriptorsToSplits(self,
+                                shuffle: bool = True,
+                                random_state: Optional[int] = None):
         """Load all available descriptors into the train and test splits.
 
         If no descriptors are available, an exception will be raised.
 
+        Args:
+            shuffle (bool): whether to shuffle the training and test sets
+            random_state (int): random state for shuffling
+
         Raises:
             ValueError: if no descriptors are available
         """
@@ -1955,29 +1961,33 @@ def loadDescriptorsToSplits(self):
             self.y_ind = pd.DataFrame(columns=[self.targetPropertyNames])
 
         # shuffle the training and test sets
-        self.X = self.X.sample(frac=1)
-        self.X_ind = self.X_ind.sample(frac=1)
-        self.y = self.y.loc[self.X.index, :]
-        self.y_ind = self.y_ind.loc[self.X_ind.index, :]
+        if shuffle:
+            self.X = self.X.sample(frac=1, random_state=random_state)
+            self.X_ind = self.X_ind.sample(frac=1, random_state=random_state)
+            self.y = self.y.loc[self.X.index, :]
+            self.y_ind = self.y_ind.loc[self.X_ind.index, :]
 
-    def featurizeSplits(self):
+    def featurizeSplits(self, shuffle: bool = True, random_state: Optional[int] = None):
         """If the data set has descriptors, load them into the train and test splits.
 
         If no descriptors are available, remove all features from
         the splits. They will become zero length along the feature axis (columns), but
         will retain their original length along the sample axis (rows). This is useful
         for the case where the data set has no descriptors, but the user wants to retain
         train and test splits.
+
+        shuffle (bool): whether to shuffle the training and test sets
+        random_state (int): random state for shuffling
         """
         if self.featureNames:
-            self.loadDescriptorsToSplits()
+            self.loadDescriptorsToSplits(shuffle=shuffle, random_state=random_state)
             self.X = self.X[self.featureNames]
             self.X_ind = self.X_ind[self.featureNames]
         else:
             self.X = self.X.drop(self.X.columns, axis=1)
             self.X_ind = self.X_ind.drop(self.X_ind.columns, axis=1)
 
-    def fillMissing(self, fill_value: float, columns: list[str] = None):
+    def fillMissing(self, fill_value: float, columns: Optional[list[str]] = None):
         """Fill missing values in the data set with a given value.
 
         Args:
@@ -2054,14 +2064,16 @@ def addFeatures(
     def prepareDataset(
         self,
         smiles_standardizer: str | Callable | None = "chembl",
-        datafilters: list = None,
+        datafilters: Optional[list] = None,
         split=None,
         fold=None,
-        feature_calculators: list = None,
-        feature_filters: list = None,
-        feature_standardizer: SKLearnStandardizer = None,
+        feature_calculators: Optional[list] = None,
+        feature_filters: Optional[list] = None,
+        feature_standardizer: Optional[SKLearnStandardizer] = None,
         feature_fill_value: float = np.nan,
-        recalculate_features: bool = False
+        recalculate_features: bool = False,
+        shuffle: bool = True,
+        random_state: Optional[int] = None
     ):
         """Prepare the dataset for use in QSPR model.
 
@@ -2082,6 +2094,8 @@ def prepareDataset(
                 present in the file
             feature_fill_value (float): value to fill missing values with.
                 Defaults to `numpy.nan`
+            shuffle (bool): whether to shuffle the training and test sets
+            random_state (int): random state for shuffling
         """
         # apply sanitization and standardization
         if smiles_standardizer is not None:
@@ -2110,7 +2124,7 @@ def prepareDataset(
 
         # featurize splits
         if self.hasDescriptors:
-            self.featurizeSplits()
+            self.featurizeSplits(shuffle=shuffle, random_state=random_state)
         else:
             logger.warning(
                 "Attempting to featurize splits without descriptors. "
@@ -2179,7 +2193,7 @@ def checkFeatures(self):
         elif self.X.shape[0] == 0:
             raise ValueError("X has no rows.")
 
-    def createFolds(self, split: DataSplit = None):
+    def createFolds(self, split: DataSplit | None = None):
         """Create folds for cross validation from the current feature matrix.
 
         If you specify a split to use, it will be used to generate the folds.

diff --git a/qsprpred/extra/models/pcm.py b/qsprpred/extra/models/pcm.py
@@ -75,6 +75,7 @@ def createPredictionDatasetFromMols(
             feature_calculators=self.featureCalculators,
             feature_standardizer=self.featureStandardizer,
             feature_fill_value=fill_value,
+            shuffle=False,
         )
         return dataset, failed_mask
 

diff --git a/qsprpred/models/interfaces.py b/qsprpred/models/interfaces.py
@@ -568,6 +568,7 @@ def createPredictionDatasetFromMols(
             feature_calculators=self.featureCalculators,
             feature_standardizer=self.featureStandardizer,
             feature_fill_value=fill_value,
+            shuffle=False
         )
         return dataset, failed_mask
 
@@ -582,7 +583,6 @@ def predictDataset(self, dataset: QSPRDataset, use_probas: bool = False):
         Returns:
             np.ndarray: an array of predictions
         """
-
         if self.task.isRegression() or not use_probas:
             predictions = self.predict(dataset)
             # always return 2D array