Skip to content

Commit

Permalink
Merge branch 'fix_shuffle' into 'dev'
Browse files Browse the repository at this point in the history
Make shuffling optional

Closes #60

See merge request cdd/QSPRpred!98
  • Loading branch information
martin-sicho committed Jul 6, 2023
2 parents 0477788 + c44ba21 commit 84423be
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 42 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ From v2.0.0 to v2.0.1

- Required Python version in pyproject.toml updated to 3.10, as older versions of Python don't support the type hinting used in the code.
- Corrected type hinting for `QSPRModel.handleInvalidsInPredictions`, which resulted in an error when importing the package in google colab.
- The `predictMols` method returned random predictions in v2.0.0 due to unpatched shuffling code. This has now been fixed.

## Changes

Expand Down
96 changes: 55 additions & 41 deletions qsprpred/data/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import warnings
from collections.abc import Callable
from multiprocessing import Pool
from typing import Literal
from typing import Literal, Optional

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -61,8 +61,8 @@ class ParallelApplyWrapper:
def __init__(
self,
func: Callable,
func_args: list = None,
func_kwargs: dict = None,
func_args: Optional[list] = None,
func_kwargs: Optional[dict] = None,
axis: int = 0,
raw: bool = False,
result_type: str = "expand"
Expand Down Expand Up @@ -108,10 +108,10 @@ def __call__(self, data: pd.DataFrame):
def __init__(
self,
name: str,
df: pd.DataFrame = None,
df: Optional[pd.DataFrame] = None,
store_dir: int = ".",
overwrite: bool = False,
index_cols: list[str] = None,
index_cols: Optional[list[str]] = None,
n_jobs: int = 1,
chunk_size: int = 1000,
id_prefix: str = "QSPRID",
Expand Down Expand Up @@ -257,12 +257,12 @@ def getSubset(self, prefix: str):
def apply(
self,
func: Callable,
func_args: list = None,
func_kwargs: dict = None,
func_args: Optional[list] = None,
func_kwargs: Optional[dict] = None,
axis: int = 0,
raw: bool = False,
result_type: str = "expand",
subset: list = None,
subset: Optional[list] = None,
):
"""Apply a function to the data frame.
Expand Down Expand Up @@ -305,12 +305,12 @@ def apply(
def papply(
self,
func: Callable,
func_args: list = None,
func_kwargs: dict = None,
func_args: Optional[list] = None,
func_kwargs: Optional[dict] = None,
axis: int = 0,
raw: bool = False,
result_type: str = "expand",
subset: list = None,
subset: Optional[list] = None,
n_cpus: int = 1,
chunk_size: int = 1000,
):
Expand Down Expand Up @@ -358,7 +358,7 @@ def papply(

return pd.concat(results, axis=0)

def transform(self, targets: list, transformer: Callable, addAs: list = None):
def transform(self, targets: list, transformer: Callable, addAs: Optional[list] = None):
"""Transform the data frame (or its part) using a list of transformers.
Each transformer is a function that takes the data frame (or a subset of it as
Expand Down Expand Up @@ -450,10 +450,10 @@ def __init__(
self,
calculator,
name_prefix: str,
df: pd.DataFrame = None,
df: Optional[pd.DataFrame] = None,
store_dir: str = ".",
overwrite: bool = False,
key_cols: list = None,
key_cols: Optional[list] = None,
n_jobs: int = 1,
chunk_size: int = 1000
):
Expand Down Expand Up @@ -510,15 +510,15 @@ class MoleculeTable(PandasDataSet, MoleculeDataSet):
def __init__(
self,
name: str,
df: pd.DataFrame = None,
df: Optional[pd.DataFrame] = None,
smiles_col: str = "SMILES",
add_rdkit: bool = False,
store_dir: str = ".",
overwrite: bool = False,
n_jobs: int = 1,
chunk_size: int = 50,
drop_invalids: bool = True,
index_cols: list[str] = None,
index_cols: Optional[list[str]] = None,
):
"""Initialize a `MoleculeTable` object.
Expand Down Expand Up @@ -881,7 +881,7 @@ def getDescriptors(self):
ret.drop(columns=join_cols, inplace=True)
return ret

def getDescriptorNames(self, prefix: str = None):
def getDescriptorNames(self, prefix: Optional[str] = None):
"""Get the names of the descriptors in the data frame.
Args:
Expand Down Expand Up @@ -1149,10 +1149,10 @@ def __init__(
name: str,
task: Literal[TargetTasks.REGRESSION, TargetTasks.SINGLECLASS,
TargetTasks.MULTICLASS],
original_name: str = None,
th: list[float] | str = None,
n_classes: int = None,
transformer: Callable = None,
original_name: Optional[str] = None,
th: Optional[list[float] | str] = None,
n_classes: Optional[int] = None,
transformer: Optional[Callable] = None,
):
"""Initialize a TargetProperty object.
Expand Down Expand Up @@ -1382,7 +1382,7 @@ def __init__(
self,
name: str,
target_props: list[TargetProperty | dict],
df: pd.DataFrame = None,
df: Optional[pd.DataFrame] = None,
smiles_col: str = "SMILES",
add_rdkit: bool = False,
store_dir: str = ".",
Expand All @@ -1391,8 +1391,8 @@ def __init__(
chunk_size: int = 50,
drop_invalids: bool = True,
drop_empty: bool = True,
target_imputer: Callable = None,
index_cols: list[str] = None,
target_imputer: Optional[Callable] = None,
index_cols: Optional[list[str]] = None,
):
"""Construct QSPRdata, also apply transformations of output property if
specified.
Expand Down Expand Up @@ -1501,7 +1501,7 @@ def setTargetProperties(
self,
target_props: list[TargetProperty],
drop_empty: bool = True,
target_imputer: Callable = None,
target_imputer: Optional[Callable] = None,
):
"""Set list of target properties and apply transformations if specified.
Expand Down Expand Up @@ -1623,7 +1623,7 @@ def makeRegression(self, target_property: TargetProperty | str):
self.restoreTrainingData()

def makeClassification(
self, target_property: TargetProperty | str, th: list[float] = None
self, target_property: TargetProperty | str, th: Optional[list[float]] = None
):
"""Switch to classification task using the given threshold values.
Expand Down Expand Up @@ -1930,11 +1930,17 @@ def loadDataToSplits(self):
self.X_ind = self.X.drop(self.X.index)
self.y_ind = self.y.drop(self.y.index)

def loadDescriptorsToSplits(self):
def loadDescriptorsToSplits(self,
shuffle: bool = True,
random_state: Optional[int] = None):
"""Load all available descriptors into the train and test splits.
If no descriptors are available, an exception will be raised.
Args:
shuffle (bool): whether to shuffle the training and test sets
random_state (int): random state for shuffling
Raises:
ValueError: if no descriptors are available
"""
Expand All @@ -1955,29 +1961,33 @@ def loadDescriptorsToSplits(self):
self.y_ind = pd.DataFrame(columns=[self.targetPropertyNames])

# shuffle the training and test sets
self.X = self.X.sample(frac=1)
self.X_ind = self.X_ind.sample(frac=1)
self.y = self.y.loc[self.X.index, :]
self.y_ind = self.y_ind.loc[self.X_ind.index, :]
if shuffle:
self.X = self.X.sample(frac=1, random_state=random_state)
self.X_ind = self.X_ind.sample(frac=1, random_state=random_state)
self.y = self.y.loc[self.X.index, :]
self.y_ind = self.y_ind.loc[self.X_ind.index, :]

def featurizeSplits(self):
def featurizeSplits(self, shuffle: bool = True, random_state: Optional[int] = None):
"""If the data set has descriptors, load them into the train and test splits.
If no descriptors are available, remove all features from
the splits. They will become zero length along the feature axis (columns), but
will retain their original length along the sample axis (rows). This is useful
for the case where the data set has no descriptors, but the user wants to retain
train and test splits.
shuffle (bool): whether to shuffle the training and test sets
random_state (int): random state for shuffling
"""
if self.featureNames:
self.loadDescriptorsToSplits()
self.loadDescriptorsToSplits(shuffle=shuffle, random_state=random_state)
self.X = self.X[self.featureNames]
self.X_ind = self.X_ind[self.featureNames]
else:
self.X = self.X.drop(self.X.columns, axis=1)
self.X_ind = self.X_ind.drop(self.X_ind.columns, axis=1)

def fillMissing(self, fill_value: float, columns: list[str] = None):
def fillMissing(self, fill_value: float, columns: Optional[list[str]] = None):
"""Fill missing values in the data set with a given value.
Args:
Expand Down Expand Up @@ -2054,14 +2064,16 @@ def addFeatures(
def prepareDataset(
self,
smiles_standardizer: str | Callable | None = "chembl",
datafilters: list = None,
datafilters: Optional[list] = None,
split=None,
fold=None,
feature_calculators: list = None,
feature_filters: list = None,
feature_standardizer: SKLearnStandardizer = None,
feature_calculators: Optional[list] = None,
feature_filters: Optional[list] = None,
feature_standardizer: Optional[SKLearnStandardizer] = None,
feature_fill_value: float = np.nan,
recalculate_features: bool = False
recalculate_features: bool = False,
shuffle: bool = True,
random_state: Optional[int] = None
):
"""Prepare the dataset for use in QSPR model.
Expand All @@ -2082,6 +2094,8 @@ def prepareDataset(
present in the file
feature_fill_value (float): value to fill missing values with.
Defaults to `numpy.nan`
shuffle (bool): whether to shuffle the training and test sets
random_state (int): random state for shuffling
"""
# apply sanitization and standardization
if smiles_standardizer is not None:
Expand Down Expand Up @@ -2110,7 +2124,7 @@ def prepareDataset(

# featurize splits
if self.hasDescriptors:
self.featurizeSplits()
self.featurizeSplits(shuffle=shuffle, random_state=random_state)
else:
logger.warning(
"Attempting to featurize splits without descriptors. "
Expand Down Expand Up @@ -2179,7 +2193,7 @@ def checkFeatures(self):
elif self.X.shape[0] == 0:
raise ValueError("X has no rows.")

def createFolds(self, split: DataSplit = None):
def createFolds(self, split: DataSplit | None = None):
"""Create folds for cross validation from the current feature matrix.
If you specify a split to use, it will be used to generate the folds.
Expand Down
1 change: 1 addition & 0 deletions qsprpred/extra/models/pcm.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ def createPredictionDatasetFromMols(
feature_calculators=self.featureCalculators,
feature_standardizer=self.featureStandardizer,
feature_fill_value=fill_value,
shuffle=False,
)
return dataset, failed_mask

Expand Down
2 changes: 1 addition & 1 deletion qsprpred/models/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -568,6 +568,7 @@ def createPredictionDatasetFromMols(
feature_calculators=self.featureCalculators,
feature_standardizer=self.featureStandardizer,
feature_fill_value=fill_value,
shuffle=False
)
return dataset, failed_mask

Expand All @@ -582,7 +583,6 @@ def predictDataset(self, dataset: QSPRDataset, use_probas: bool = False):
Returns:
np.ndarray: an array of predictions
"""

if self.task.isRegression() or not use_probas:
predictions = self.predict(dataset)
# always return 2D array
Expand Down

0 comments on commit 84423be

Please sign in to comment.