From 1f11085205eeb9e47244c06a5bdbc787b92cda6c Mon Sep 17 00:00:00 2001 From: maagdenbehwvanden1 Date: Tue, 20 Feb 2024 16:50:36 +0100 Subject: [PATCH 1/4] fix target transformation not applied --- qsprpred/data/tables/base.py | 14 ++++++--- qsprpred/data/tables/mol.py | 23 --------------- qsprpred/data/tables/pandas.py | 52 +++++++++++++++++++++------------- qsprpred/data/tables/qspr.py | 23 ++++++++------- qsprpred/data/tables/tests.py | 23 +++++++++++++++ 5 files changed, 79 insertions(+), 56 deletions(-) diff --git a/qsprpred/data/tables/base.py b/qsprpred/data/tables/base.py index f9e9fd63..a604aefb 100644 --- a/qsprpred/data/tables/base.py +++ b/qsprpred/data/tables/base.py @@ -64,6 +64,16 @@ def removeProperty(self, name: str): name (str): The name of the property. """ + @abstractmethod + def transformProperties(self, names, transformers): + """Transform property values using a transformer function. + + Args: + names (list[str]): list of column names to transform. + transformers (Callable): Function that transforms the data in target columns + to a new representation. + """ + @abstractmethod def getSubset(self, prefix: str): """Get a subset of the dataset. @@ -90,10 +100,6 @@ def apply( func_kwargs (dict, optional): The keyword arguments of the function. """ - @abstractmethod - def transform(self, targets, transformers): - pass - @abstractmethod def filter(self, table_filters: list[Callable]): """Filter the dataset. diff --git a/qsprpred/data/tables/mol.py b/qsprpred/data/tables/mol.py index f6d30b76..8304c442 100644 --- a/qsprpred/data/tables/mol.py +++ b/qsprpred/data/tables/mol.py @@ -679,29 +679,6 @@ def dropDescriptors( self.descriptors[idx].clearFiles() self.descriptors.pop(idx) - def imputeProperties(self, names: list[str], imputer: Callable): - """Impute missing property values. - - Args: - names (list): - List of property names to impute. - imputer (Callable): - imputer object implementing the `fit_transform` - method from scikit-learn API. 
- """ - assert hasattr(imputer, "fit_transform"), ( - "Imputer object must implement the `fit_transform` " - "method from scikit-learn API." - ) - assert all( - name in self.df.columns for name in names - ), "Not all target properties in dataframe columns." - names_old = [f"{name}_before_impute" for name in names] - self.df[names_old] = self.df[names] - self.df[names] = imputer.fit_transform(self.df[names]) - logger.debug(f"Imputed missing values for properties: {names}") - logger.debug(f"Old values saved in: {names_old}") - def dropEmptySmiles(self): """Drop rows with empty SMILES from the data set.""" self.df.dropna(subset=[self.smilesCol], inplace=True) diff --git a/qsprpred/data/tables/pandas.py b/qsprpred/data/tables/pandas.py index 0e6721c4..af63e5ec 100644 --- a/qsprpred/data/tables/pandas.py +++ b/qsprpred/data/tables/pandas.py @@ -432,31 +432,45 @@ def apply( logger.debug(f"Result for chunk returned: {result!r}") yield result - def transform( - self, targets: list[str], transformer: Callable, add_as: list[str] | None = None - ): - """Transform the data frame (or its part) using a list of transformers. - - Each transformer is a function that takes the data frame (or a subset of it as - defined by the `targets` argument) and returns a transformed data frame. The - transformed data frame can then be added to the original data frame if `add_as` - is set to a `list` of new column names. If `add_as` is not `None`, the result of - the application of transformers must have the same number of rows as the - original data frame. + def transformProperties(self, names: list[str], transformer: Callable): + """Transform property values using a transformer function. Args: targets (list[str]): list of column names to transform. transformer (Callable): Function that transforms the data in target columns to a new representation. 
- add_as (list): If `True`, the transformed data is added to the original data - frame and the - names in this list are used as column names for the new data. """ - ret = self.df[targets] - ret = transformer(ret) - if add_as: - self.df[add_as] = ret - return ret + assert all( + name in self.df.columns for name in names + ), "Not all properties in dataframe columns for transformation." + names_old = [f"{name}_before_transform" for name in names] + self.df[names_old] = self.df[names] + self.df[names] = transformer(self.df[names]) + logger.debug(f"Transformed properties in: {names}") + logger.debug(f"Old values saved in: {names_old}") + + def imputeProperties(self, names: list[str], imputer: Callable): + """Impute missing property values. + + Args: + names (list): + List of property names to impute. + imputer (Callable): + imputer object implementing the `fit_transform` + method from scikit-learn API. + """ + assert hasattr(imputer, "fit_transform"), ( + "Imputer object must implement the `fit_transform` " + "method from scikit-learn API." + ) + assert all( + name in self.df.columns for name in names + ), "Not all properties in dataframe columns for imputation." + names_old = [f"{name}_before_impute" for name in names] + self.df[names_old] = self.df[names] + self.df[names] = imputer.fit_transform(self.df[names]) + logger.debug(f"Imputed missing values for properties: {names}") + logger.debug(f"Old values saved in: {names_old}") def filter(self, table_filters: list[Callable]): """Filter the data frame using a list of filters. 
diff --git a/qsprpred/data/tables/qspr.py b/qsprpred/data/tables/qspr.py index f1b82311..39a5ffb4 100644 --- a/qsprpred/data/tables/qspr.py +++ b/qsprpred/data/tables/qspr.py @@ -1059,12 +1059,18 @@ def dropEmptyProperties(self, names: list[str]): super().dropEmptyProperties(names) self.restoreTrainingData() - def transform( - self, targets: list[str], transformer: Callable, add_as: list[str] | None = None - ): - super().transform(targets, transformer, add_as) - if add_as is None and (set(targets) & set(self.targetPropertyNames)): - self.restoreTrainingData() + def transformProperties(self, targets: list[str], transformer: Callable): + """Transform the target properties using the given transformer. + + Args: + targets (list[str]): list of target properties names to transform + transformer (Callable): transformer function + + The original values are kept in `<name>_before_transform` columns and + `restoreTrainingData` is called after the transformation. 
+ """ + super().transformProperties(targets, transformer) + self.restoreTrainingData() def imputeProperties(self, names: list[str], imputer: Callable): super().imputeProperties(names, imputer) @@ -1099,10 +1105,7 @@ def setTargetProperty(self, prop: TargetProperty | dict, drop_empty: bool = True self.imputeProperties([prop.name], prop.imputer) # transform the property if prop.transformer is not None: - self.transform( - [prop.name], - prop.transformer, - ) + self.transformProperties([prop.name], prop.transformer) # drop rows with missing smiles/no target property for any of # the target properties if drop_empty: diff --git a/qsprpred/data/tables/tests.py b/qsprpred/data/tables/tests.py index 184e6637..dff53479 100644 --- a/qsprpred/data/tables/tests.py +++ b/qsprpred/data/tables/tests.py @@ -688,6 +688,29 @@ def testImputation(self): self.assertEqual(self.dataset.df["z"].isna().sum(), 0) +class TestTargetTransformation(DataSetsPathMixIn, QSPRTestCase): + """Tests the transformation of target properties.""" + + def setUp(self): + super().setUp() + self.setUpPaths() + + def prop_transform(self, x): + return np.log10(x) + + def testTransformation(self): + dataset = self.createLargeTestDataSet( + target_props=[ + { + "name": "CL", + "task": TargetTasks.REGRESSION, + "transformer": prop_transform, + }, + ] + ) + self.assertTrue(all(dataset.df["CL"] == np.log10(dataset.df["CL_before_transform"]))) + + class TestApply(DataSetsPathMixIn, QSPRTestCase): """Tests the apply method of the data set.""" From 3e9a11d4cdebaf0ccb3ee590547b231990f855fe Mon Sep 17 00:00:00 2001 From: maagdenbehwvanden1 Date: Mon, 26 Feb 2024 12:57:34 +0100 Subject: [PATCH 2/4] move property-related functions to pandas --- qsprpred/data/tables/mol.py | 47 ---------------------------------- qsprpred/data/tables/pandas.py | 42 ++++++++++++++++++++++-------- 2 files changed, 31 insertions(+), 58 deletions(-) diff --git a/qsprpred/data/tables/mol.py b/qsprpred/data/tables/mol.py index 8304c442..0b2fec21 
100644 --- a/qsprpred/data/tables/mol.py +++ b/qsprpred/data/tables/mol.py @@ -683,10 +683,6 @@ def dropEmptySmiles(self): """Drop rows with empty SMILES from the data set.""" self.df.dropna(subset=[self.smilesCol], inplace=True) - def dropEmptyProperties(self, names: list[str]): - """Drop rows with empty target property value from the data set.""" - self.df.dropna(subset=names, how="all", inplace=True) - def attachDescriptors( self, calculator: DescriptorSet, @@ -851,49 +847,6 @@ def smiles(self) -> Generator[str, None, None]: """ return iter(self.df[self.smilesCol].values) - def getProperties(self): - """Get names of all properties/variables saved in the data frame (all columns). - - Returns: - list: list of property names. - """ - return self.df.columns.tolist() - - def hasProperty(self, name): - """Check whether a property is present in the data frame. - - Args: - name (str): Name of the property. - - Returns: - bool: Whether the property is present. - """ - return name in self.df.columns - - def addProperty(self, name: str, data: list): - """Add a property to the data frame. - - Args: - name (str): Name of the property. - data (list): list of property values. - """ - if isinstance(data, pd.Series): - if not np.array_equal(data.index.txt, self.df.index.txt): - logger.info( - f"Adding property '{name}' to data set might be introducing 'nan' " - "values due to index with pandas series. Make sure the index of " - "the data frame and the series match or convert series to list." - ) - self.df[name] = data - - def removeProperty(self, name): - """Remove a property from the data frame. - - Args: - name (str): Name of the property to delete. 
- """ - del self.df[name] - def addScaffolds( self, scaffolds: list[Scaffold], diff --git a/qsprpred/data/tables/pandas.py b/qsprpred/data/tables/pandas.py index af63e5ec..4be7474c 100644 --- a/qsprpred/data/tables/pandas.py +++ b/qsprpred/data/tables/pandas.py @@ -263,6 +263,17 @@ def _isInStore(self, name): f"_{name}.{self.storeFormat}" ) + def hasProperty(self, name): + """Check whether a property is present in the data frame. + + Args: + name (str): Name of the property. + + Returns: + bool: Whether the property is present. + """ + return name in self.df.columns + def getProperty(self, name: str) -> pd.Series: """Get property values from the data set. @@ -274,22 +285,23 @@ def getProperty(self, name: str) -> pd.Series: """ return self.df[name] - def getProperties(self): - """Get the properties of the data set. + def getProperties(self) -> list[str]: + """Get names of all properties/variables saved in the data frame (all columns). - Returns: list of properties of the data set. + Returns: + list: list of property names. """ - return self.df.columns + return self.df.columns.tolist() def addProperty(self, name: str, data: list): - """Add a property to the data set. + """Add a property to the data frame. Args: name (str): Name of the property. - data (list): List of values for the property. + data (list): list of property values. """ if isinstance(data, pd.Series): - if not self.df.index.equals(data.index): + if not np.array_equal(data.index.txt, self.df.index.txt): logger.info( f"Adding property '{name}' to data set might be introducing 'nan' " "values due to index with pandas series. Make sure the index of " @@ -297,13 +309,21 @@ def addProperty(self, name: str, data: list): ) self.df[name] = data - def removeProperty(self, name: str): - """Remove a property from the data set. + def removeProperty(self, name): + """Remove a property from the data frame. + + Args: + name (str): Name of the property to delete. 
+ """ + del self.df[name] + + def dropEmptyProperties(self, names: list[str]): + """Drop rows with empty target property value from the data set. Args: - name (str): Name of the property to remove. + names (list[str]): list of property names to check for empty values. """ - self.df.drop(columns=[name], inplace=True) + self.df.dropna(subset=names, how="all", inplace=True) def getSubset(self, prefix: str): """Get a subset of the data set by providing a prefix for the column names or a From 6ac69ee480011485bb61b81646bba4cdcc4cc916 Mon Sep 17 00:00:00 2001 From: maagdenbehwvanden1 Date: Mon, 26 Feb 2024 14:35:16 +0100 Subject: [PATCH 3/4] fix bug in add property --- qsprpred/data/tables/pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qsprpred/data/tables/pandas.py b/qsprpred/data/tables/pandas.py index 4be7474c..1681bc81 100644 --- a/qsprpred/data/tables/pandas.py +++ b/qsprpred/data/tables/pandas.py @@ -301,7 +301,7 @@ def addProperty(self, name: str, data: list): data (list): list of property values. """ if isinstance(data, pd.Series): - if not np.array_equal(data.index.txt, self.df.index.txt): + if not self.df.index.equals(data.index): logger.info( f"Adding property '{name}' to data set might be introducing 'nan' " "values due to index with pandas series. Make sure the index of " From 4f481e001d0422d12d44088021d30e180aee134c Mon Sep 17 00:00:00 2001 From: maagdenbehwvanden1 Date: Mon, 26 Feb 2024 14:44:30 +0100 Subject: [PATCH 4/4] update changelog --- CHANGELOG.md | 142 +++------------------------------------------------ 1 file changed, 7 insertions(+), 135 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0e50367f..383a4b00 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,146 +1,18 @@ # Change Log -From v2.1.1 to v3.0.0 +From v3.0.0 to v3.0.1 ## Fixes - -- Fixed random seeds to give reproducible results. 
Each dataset is initialized with a - single random state (either from the constructor or a random number generator) which - is used in all subsequent random operations. Each model is initialized with a single - random state as well: it uses the random state from the dataset, unless it's overriden - in the constructor. When a dataset is saved to a file so is its random state, which is - used by the dataset when the dataset is reloaded. -- fixed error with serialization of the `DNNModel.params` attribute, when no parameters - are set. -- Fix bug with saving predictions from classification model - when `ModelAssessor.useProba` set to `False`. -- Add missing implementation of `QSPRDataset.removeProperty` -- Improved behavior of the Papyrus data source (does not attempt to connect to the - internet if the data set already exists). -- It is now possible to define new descriptor sets outside the package without errors. -- Basic consistency of models is also checked in the unit test suite, including in - the `qsprpred.extra` package. -- Fixed a problem with feature standardizer being retrained on prediction data when a - prediction from SMILES was invoked. This affected all versions of the package higher - or equal to `v2.1.0`. -- Fixes to the `fromMolTable` method in various data set implementations, in particular - in copying of the feature standardizer and other settings. -- Fixed not working `cluster` split and `--imputation` from `data_CLI.py`. -- Fixed a problem with `ProteinDescriptorSet.getDescriptors` returning descriptors in - wrong order with `Pandas