diff --git a/Makefile b/Makefile index b31a1db..773ca67 100644 --- a/Makefile +++ b/Makefile @@ -18,24 +18,5 @@ test-unit: pytest tests @echo 'unit tests OK' -lint: - pylint cobra - @echo 'lint OK' - -lint-minimal: - pylint -E cobra - @echo 'lint minimal OK' - -typecheck: - mypy cobra - @echo 'typecheck OK' - -codestyle: - pycodestyle cobra - @echo 'codestyle OK' - -docstyle: - pydocstyle cobra - @echo 'docstyle OK' - -code-qa: typecheck codestyle docstyle lint-minimal +black-check: + black --diff cobra diff --git a/cobra/evaluation/__init__.py b/cobra/evaluation/__init__.py index 1f8f487..22bbfda 100644 --- a/cobra/evaluation/__init__.py +++ b/cobra/evaluation/__init__.py @@ -4,19 +4,20 @@ from .plotting_utils import plot_performance_curves from .plotting_utils import plot_variable_importance - from .plotting_utils import plot_univariate_predictor_quality from .plotting_utils import plot_correlation_matrix # from .evaluator import Evaluator from .evaluator import ClassificationEvaluator, RegressionEvaluator -__all__ = ["generate_pig_tables", - "compute_pig_table", - "plot_incidence", - "plot_performance_curves", - "plot_variable_importance", - "plot_univariate_predictor_quality", - "plot_correlation_matrix", - "ClassificationEvaluator", - "RegressionEvaluator"] +__all__ = [ + "generate_pig_tables", + "compute_pig_table", + "plot_incidence", + "plot_performance_curves", + "plot_variable_importance", + "plot_univariate_predictor_quality", + "plot_correlation_matrix", + "ClassificationEvaluator", + "RegressionEvaluator", +] diff --git a/cobra/evaluation/evaluator.py b/cobra/evaluation/evaluator.py index b694a33..9bd7f22 100644 --- a/cobra/evaluation/evaluator.py +++ b/cobra/evaluation/evaluator.py @@ -1,4 +1,3 @@ - import numpy as np import pandas as pd @@ -25,7 +24,8 @@ from sklearn.metrics import mean_squared_error from sklearn.metrics import r2_score -class ClassificationEvaluator(): + +class ClassificationEvaluator: """Evaluator class encapsulating classification model metrics and plotting functionality. @@ -56,10 +56,9 @@ class ClassificationEvaluator(): (by default 10, so deciles). """ - def __init__(self, - probability_cutoff: float=None, - lift_at: float=0.05, - n_bins: int = 10): + def __init__( + self, probability_cutoff: float = None, lift_at: float = 0.05, n_bins: int = 10 + ): self.y_true = None self.y_pred = None @@ -90,20 +89,18 @@ def fit(self, y_true: np.ndarray, y_pred: np.ndarray): # if probability_cutoff is not set, take the optimal cut-off if not self.probability_cutoff: - self.probability_cutoff = (ClassificationEvaluator. 
- _compute_optimal_cutoff(fpr, tpr, - thresholds)) + self.probability_cutoff = ClassificationEvaluator._compute_optimal_cutoff( + fpr, tpr, thresholds + ) # Transform probabilities to binary array using cut-off - y_pred_b = np.array([0 if pred <= self.probability_cutoff else 1 - for pred in y_pred]) + y_pred_b = np.array( + [0 if pred <= self.probability_cutoff else 1 for pred in y_pred] + ) # Compute the various evaluation metrics self.scalar_metrics = ClassificationEvaluator._compute_scalar_metrics( - y_true, - y_pred, - y_pred_b, - self.lift_at + y_true, y_pred, y_pred_b, self.lift_at ) self.y_true = y_true @@ -111,14 +108,17 @@ def fit(self, y_true: np.ndarray, y_pred: np.ndarray): self.roc_curve = {"fpr": fpr, "tpr": tpr, "thresholds": thresholds} self.confusion_matrix = confusion_matrix(y_true, y_pred_b) - self.lift_curve = ClassificationEvaluator._compute_lift_per_bin(y_true, y_pred, self.n_bins) - self.cumulative_gains = ClassificationEvaluator._compute_cumulative_gains(y_true, y_pred) + self.lift_curve = ClassificationEvaluator._compute_lift_per_bin( + y_true, y_pred, self.n_bins + ) + self.cumulative_gains = ClassificationEvaluator._compute_cumulative_gains( + y_true, y_pred + ) @staticmethod - def _compute_scalar_metrics(y_true: np.ndarray, - y_pred: np.ndarray, - y_pred_b: np.ndarray, - lift_at: float) -> pd.Series: + def _compute_scalar_metrics( + y_true: np.ndarray, y_pred: np.ndarray, y_pred_b: np.ndarray, lift_at: float + ) -> pd.Series: """Convenient function to compute various scalar performance measures and return them in a pd.Series. @@ -145,20 +145,24 @@ def _compute_scalar_metrics(y_true: np.ndarray, Matthews correlation coefficient Lift at given percentage """ - return pd.Series({ - "accuracy": accuracy_score(y_true, y_pred_b), - "AUC": roc_auc_score(y_true, y_pred), - "precision": precision_score(y_true, y_pred_b), - "recall": recall_score(y_true, y_pred_b), - "F1": f1_score(y_true, y_pred_b, average=None)[1], - "matthews_corrcoef": matthews_corrcoef(y_true, y_pred_b), - "lift at {}".format(lift_at): np.round(ClassificationEvaluator - ._compute_lift(y_true=y_true, - y_pred=y_pred, - lift_at=lift_at), 2) - }) - - def plot_roc_curve(self, path: str=None, dim: tuple=(12, 8)): + return pd.Series( + { + "accuracy": accuracy_score(y_true, y_pred_b), + "AUC": roc_auc_score(y_true, y_pred), + "precision": precision_score(y_true, y_pred_b), + "recall": recall_score(y_true, y_pred_b), + "F1": f1_score(y_true, y_pred_b, average=None)[1], + "matthews_corrcoef": matthews_corrcoef(y_true, y_pred_b), + "lift at {}".format(lift_at): np.round( + ClassificationEvaluator._compute_lift( + y_true=y_true, y_pred=y_pred, lift_at=lift_at + ), + 2, + ), + } + ) + + def plot_roc_curve(self, path: str = None, dim: tuple = (12, 8)): """Plot ROC curve of the model. Parameters @@ -170,8 +174,10 @@ def plot_roc_curve(self, path: str=None, dim: tuple=(12, 8)): """ if self.roc_curve is None: - msg = ("This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method.") + msg = ( + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." 
+ ) raise NotFittedError(msg.format(self.__class__.__name__)) @@ -181,13 +187,22 @@ def plot_roc_curve(self, path: str=None, dim: tuple=(12, 8)): fig, ax = plt.subplots(figsize=dim) - ax.plot(self.roc_curve["fpr"], - self.roc_curve["tpr"], - color="cornflowerblue", linewidth=3, - label="ROC curve (area = {s:.3})".format(s=auc)) - - ax.plot([0, 1], [0, 1], color="darkorange", linewidth=3, - linestyle="--", label="random selection") + ax.plot( + self.roc_curve["fpr"], + self.roc_curve["tpr"], + color="cornflowerblue", + linewidth=3, + label="ROC curve (area = {s:.3})".format(s=auc), + ) + + ax.plot( + [0, 1], + [0, 1], + color="darkorange", + linewidth=3, + linestyle="--", + label="random selection", + ) ax.set_xlabel("False positive rate", fontsize=15) ax.set_ylabel("True positive rate", fontsize=15) ax.legend(loc="lower right") @@ -200,8 +215,9 @@ def plot_roc_curve(self, path: str=None, dim: tuple=(12, 8)): plt.show() - def plot_confusion_matrix(self, path: str=None, dim: tuple=(12, 8), - labels: list=["0", "1"]): + def plot_confusion_matrix( + self, path: str = None, dim: tuple = (12, 8), labels: list = ["0", "1"] + ): """Plot the confusion matrix. Parameters @@ -215,26 +231,32 @@ def plot_confusion_matrix(self, path: str=None, dim: tuple=(12, 8), """ if self.confusion_matrix is None: - msg = ("This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method.") + msg = ( + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." + ) raise NotFittedError(msg.format(self.__class__.__name__)) fig, ax = plt.subplots(figsize=dim) - ax = sns.heatmap(self.confusion_matrix, - annot=self.confusion_matrix.astype(str), - fmt="s", cmap="Blues", - xticklabels=labels, yticklabels=labels) + ax = sns.heatmap( + self.confusion_matrix, + annot=self.confusion_matrix.astype(str), + fmt="s", + cmap="Blues", + xticklabels=labels, + yticklabels=labels, + ) ax.set_title("Confusion matrix", fontsize=20) - plt.ylabel('True labels', fontsize=15) - plt.xlabel('Predicted labels', fontsize=15) + plt.ylabel("True labels", fontsize=15) + plt.xlabel("Predicted labels", fontsize=15) if path: plt.savefig(path, format="png", dpi=300, bbox_inches="tight") plt.show() - def plot_cumulative_response_curve(self, path: str=None, dim: tuple=(12, 8)): + def plot_cumulative_response_curve(self, path: str = None, dim: tuple = (12, 8)): """Plot cumulative response curve. Parameters @@ -246,27 +268,35 @@ def plot_cumulative_response_curve(self, path: str=None, dim: tuple=(12, 8)): """ if self.lift_curve is None: - msg = ("This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method.") + msg = ( + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." 
+ ) raise NotFittedError(msg.format(self.__class__.__name__)) x_labels, lifts, inc_rate = self.lift_curve - lifts = np.array(lifts)*inc_rate*100 + lifts = np.array(lifts) * inc_rate * 100 with plt.style.context("seaborn-ticks"): fig, ax = plt.subplots(figsize=dim) - plt.bar(x_labels[::-1], lifts, align="center", - color="cornflowerblue") + plt.bar(x_labels[::-1], lifts, align="center", color="cornflowerblue") plt.ylabel("Response (%)", fontsize=15) plt.xlabel("Decile", fontsize=15) ax.set_xticks(x_labels) ax.set_xticklabels(x_labels) - plt.axhline(y=inc_rate*100, color="darkorange", linestyle="--", - xmin=0.05, xmax=0.95, linewidth=3, label="incidence") + plt.axhline( + y=inc_rate * 100, + color="darkorange", + linestyle="--", + xmin=0.05, + xmax=0.95, + linewidth=3, + label="incidence", + ) # Legend ax.legend(loc="upper right") @@ -285,7 +315,7 @@ def plot_cumulative_response_curve(self, path: str=None, dim: tuple=(12, 8)): plt.show() - def plot_lift_curve(self, path: str=None, dim: tuple=(12, 8)): + def plot_lift_curve(self, path: str = None, dim: tuple = (12, 8)): """Plot lift per decile. Parameters @@ -297,8 +327,10 @@ def plot_lift_curve(self, path: str=None, dim: tuple=(12, 8)): """ if self.lift_curve is None: - msg = ("This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method.") + msg = ( + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." + ) raise NotFittedError(msg.format(self.__class__.__name__)) @@ -307,15 +339,21 @@ def plot_lift_curve(self, path: str=None, dim: tuple=(12, 8)): with plt.style.context("seaborn-ticks"): fig, ax = plt.subplots(figsize=dim) - plt.bar(x_labels[::-1], lifts, align="center", - color="cornflowerblue") + plt.bar(x_labels[::-1], lifts, align="center", color="cornflowerblue") plt.ylabel("Lift", fontsize=15) plt.xlabel("Decile", fontsize=15) ax.set_xticks(x_labels) ax.set_xticklabels(x_labels) - plt.axhline(y=1, color="darkorange", linestyle="--", - xmin=0.05, xmax=0.95, linewidth=3, label="baseline") + plt.axhline( + y=1, + color="darkorange", + linestyle="--", + xmin=0.05, + xmax=0.95, + linewidth=3, + label="baseline", + ) # Legend ax.legend(loc="upper right") @@ -334,7 +372,7 @@ def plot_lift_curve(self, path: str=None, dim: tuple=(12, 8)): plt.show() - def plot_cumulative_gains(self, path: str=None, dim: tuple=(12, 8)): + def plot_cumulative_gains(self, path: str = None, dim: tuple = (12, 8)): """Plot cumulative gains per decile. 
Parameters @@ -348,11 +386,21 @@ def plot_cumulative_gains(self, path: str=None, dim: tuple=(12, 8)): with plt.style.context("seaborn-whitegrid"): fig, ax = plt.subplots(figsize=dim) - ax.plot(self.cumulative_gains[0]*100, self.cumulative_gains[1]*100, - color="cornflowerblue", linewidth=3, - label="cumulative gains") - ax.plot(ax.get_xlim(), ax.get_ylim(), linewidth=3, - ls="--", color="darkorange", label="random selection") + ax.plot( + self.cumulative_gains[0] * 100, + self.cumulative_gains[1] * 100, + color="cornflowerblue", + linewidth=3, + label="cumulative gains", + ) + ax.plot( + ax.get_xlim(), + ax.get_ylim(), + linewidth=3, + ls="--", + color="darkorange", + label="random selection", + ) ax.set_title("Cumulative Gains curve", fontsize=20) @@ -379,8 +427,7 @@ def plot_cumulative_gains(self, path: str=None, dim: tuple=(12, 8)): plt.show() @staticmethod - def _find_optimal_cutoff(y_true: np.ndarray, - y_pred: np.ndarray) -> float: + def _find_optimal_cutoff(y_true: np.ndarray, y_pred: np.ndarray) -> float: """Find the optimal probability cut off point for a classification model. Wrapper around _compute_optimal_cutoff. @@ -396,12 +443,14 @@ def _find_optimal_cutoff(y_true: np.ndarray, float Optimal cut-off probability for the model. """ - return ClassificationEvaluator._compute_optimal_cutoff(roc_curve(y_true=y_true, - y_score=y_pred)) + return ClassificationEvaluator._compute_optimal_cutoff( + roc_curve(y_true=y_true, y_score=y_pred) + ) @staticmethod - def _compute_optimal_cutoff(fpr: np.ndarray, tpr: np.ndarray, - thresholds: np.ndarray) -> float: + def _compute_optimal_cutoff( + fpr: np.ndarray, tpr: np.ndarray, thresholds: np.ndarray + ) -> float: """Find the optimal probability cut-off point for a classification model. @@ -422,7 +471,7 @@ def _compute_optimal_cutoff(fpr: np.ndarray, tpr: np.ndarray, float Optimal probability cut-off point. """ - temp = np.absolute(tpr - (1-fpr)) + temp = np.absolute(tpr - (1 - fpr)) # index for optimal value is the one for which temp is minimal optimal_index = np.where(temp == min(temp))[0] @@ -430,8 +479,7 @@ def _compute_optimal_cutoff(fpr: np.ndarray, tpr: np.ndarray, return thresholds[optimal_index][0] @staticmethod - def _compute_cumulative_gains(y_true: np.ndarray, - y_pred: np.ndarray) -> tuple: + def _compute_cumulative_gains(y_true: np.ndarray, y_pred: np.ndarray) -> tuple: """Compute cumulative gains of the model, returns percentages and gains cumulative gains curves. @@ -453,7 +501,7 @@ def _compute_cumulative_gains(y_true: np.ndarray, """ # make y_true a boolean vector - y_true = (y_true == 1) + y_true = y_true == 1 sorted_indices = np.argsort(y_pred)[::-1] y_true = y_true[sorted_indices] @@ -470,9 +518,9 @@ def _compute_cumulative_gains(y_true: np.ndarray, return percentages, gains @staticmethod - def _compute_lift_per_bin(y_true: np.ndarray, - y_pred: np.ndarray, - n_bins: int=10) -> tuple: + def _compute_lift_per_bin( + y_true: np.ndarray, y_pred: np.ndarray, n_bins: int = 10 + ) -> tuple: """Compute lift of the model for a given number of bins, returns x-labels, lifts and the target incidence to create cumulative response curves. @@ -492,18 +540,21 @@ def _compute_lift_per_bin(y_true: np.ndarray, Includes x-labels, lifts per decile, and target incidence. 
""" - lifts = [ClassificationEvaluator._compute_lift(y_true=y_true, - y_pred=y_pred, - lift_at=perc_lift) - for perc_lift in np.linspace(1/n_bins, 1, num=n_bins, endpoint=True)] + lifts = [ + ClassificationEvaluator._compute_lift( + y_true=y_true, y_pred=y_pred, lift_at=perc_lift + ) + for perc_lift in np.linspace(1 / n_bins, 1, num=n_bins, endpoint=True) + ] - x_labels = [len(lifts)-x for x in np.arange(0, len(lifts), 1)] + x_labels = [len(lifts) - x for x in np.arange(0, len(lifts), 1)] return x_labels, lifts, y_true.mean() @staticmethod - def _compute_lift(y_true: np.ndarray, y_pred: np.ndarray, - lift_at: float=0.05) -> float: + def _compute_lift( + y_true: np.ndarray, y_pred: np.ndarray, lift_at: float = 0.05 + ) -> float: """Calculates lift given two arrays on specified level. Parameters @@ -534,22 +585,21 @@ def _compute_lift(y_true: np.ndarray, y_pred: np.ndarray, # Calculate necessary variables nrows = len(y_data) - stop = int(np.floor(nrows*lift_at)) - avg_incidence = np.einsum("ij->j", y_true_)/float(len(y_true_)) + stop = int(np.floor(nrows * lift_at)) + avg_incidence = np.einsum("ij->j", y_true_) / float(len(y_true_)) # Sort and filter data - data_sorted = (y_data[y_data[:, 1].argsort()[::-1]][:stop, 0] - .reshape(stop, 1)) + data_sorted = y_data[y_data[:, 1].argsort()[::-1]][:stop, 0].reshape(stop, 1) # Calculate lift (einsum is a very fast way of summing, but needs specific shape) - inc_in_top_n = np.einsum("ij->j", data_sorted)/float(len(data_sorted)) + inc_in_top_n = np.einsum("ij->j", data_sorted) / float(len(data_sorted)) - lift = np.round(inc_in_top_n/avg_incidence, 2)[0] + lift = np.round(inc_in_top_n / avg_incidence, 2)[0] return lift -class RegressionEvaluator(): +class RegressionEvaluator: """Evaluator class encapsulating regression model metrics and plotting functionality. @@ -586,7 +636,9 @@ def fit(self, y_true: np.ndarray, y_pred: np.ndarray): Model scores. """ # Compute the various evaluation metrics - self.scalar_metrics = RegressionEvaluator._compute_scalar_metrics(y_true, y_pred) + self.scalar_metrics = RegressionEvaluator._compute_scalar_metrics( + y_true, y_pred + ) self.y_true = y_true self.y_pred = y_pred @@ -595,8 +647,7 @@ def fit(self, y_true: np.ndarray, y_pred: np.ndarray): self.qq = RegressionEvaluator._compute_qq_residuals(y_true, y_pred) @staticmethod - def _compute_scalar_metrics(y_true: np.ndarray, - y_pred: np.ndarray) -> pd.Series: + def _compute_scalar_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> pd.Series: """Convenient function to compute various scalar performance measures and return them in a pd.Series. @@ -616,16 +667,17 @@ def _compute_scalar_metrics(y_true: np.ndarray, Mean squared error (expected value of the quadratic error) Root mean squared error (sqrt of expected value of the quadratic error) """ - return pd.Series({ - "R2": r2_score(y_true, y_pred), - "MAE": mean_absolute_error(y_true, y_pred), - "MSE": mean_squared_error(y_true, y_pred), - "RMSE": sqrt(mean_squared_error(y_true, y_pred)) - }) + return pd.Series( + { + "R2": r2_score(y_true, y_pred), + "MAE": mean_absolute_error(y_true, y_pred), + "MSE": mean_squared_error(y_true, y_pred), + "RMSE": sqrt(mean_squared_error(y_true, y_pred)), + } + ) @staticmethod - def _compute_qq_residuals(y_true: np.ndarray, - y_pred: np.ndarray) -> pd.Series: + def _compute_qq_residuals(y_true: np.ndarray, y_pred: np.ndarray) -> pd.Series: """Convenience function to compute various scalar performance measures and return them in a pd.Series. 
@@ -648,17 +700,21 @@ def _compute_qq_residuals(y_true: np.ndarray, df = pd.DataFrame({"res": sorted((y_true - y_pred))}) # ascending order m, s = df["res"].mean(), df["res"].std() - df["z_res"] = df["res"].apply(lambda x: (x-m)/s) - df["rank"] = df.index+1 - df["percentile"] = df["rank"].apply(lambda x: x/(n+1)) # divide by n+1 to avoid inf + df["z_res"] = df["res"].apply(lambda x: (x - m) / s) + df["rank"] = df.index + 1 + df["percentile"] = df["rank"].apply( + lambda x: x / (n + 1) + ) # divide by n+1 to avoid inf df["q_theoretical"] = norm.ppf(df["percentile"]) - return pd.Series({ - "quantiles": df["q_theoretical"].values, - "residuals": df["z_res"].values, - }) + return pd.Series( + { + "quantiles": df["q_theoretical"].values, + "residuals": df["z_res"].values, + } + ) - def plot_predictions(self, path: str=None, dim: tuple=(12, 8)): + def plot_predictions(self, path: str = None, dim: tuple = (12, 8)): """Plot predictions from the model against actual values. Parameters @@ -669,8 +725,10 @@ def plot_predictions(self, path: str=None, dim: tuple=(12, 8)): Tuple with width and length of the plot. """ if self.y_true is None and self.y_pred is None: - msg = ("This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method.") + msg = ( + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." + ) y_true = self.y_true y_pred = self.y_pred @@ -679,9 +737,11 @@ def plot_predictions(self, path: str=None, dim: tuple=(12, 8)): fig, ax = plt.subplots(figsize=dim) - x = np.arange(1, len(y_true)+1) + x = np.arange(1, len(y_true) + 1) - ax.plot(x, y_true, ls="--", label="actuals", color="darkorange", linewidth=3) + ax.plot( + x, y_true, ls="--", label="actuals", color="darkorange", linewidth=3 + ) ax.plot(x, y_pred, label="predictions", color="cornflowerblue", linewidth=3) ax.set_xlabel("Index", fontsize=15) @@ -694,7 +754,7 @@ def plot_predictions(self, path: str=None, dim: tuple=(12, 8)): plt.show() - def plot_qq(self, path: str=None, dim: tuple=(12, 8)): + def plot_qq(self, path: str = None, dim: tuple = (12, 8)): """Display a Q-Q plot from the standardized prediction residuals. Parameters @@ -706,8 +766,10 @@ def plot_qq(self, path: str=None, dim: tuple=(12, 8)): """ if self.qq is None: - msg = ("This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method.") + msg = ( + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." 
+ ) raise NotFittedError(msg.format(self.__class__.__name__)) @@ -718,14 +780,24 @@ def plot_qq(self, path: str=None, dim: tuple=(12, 8)): x = self.qq["quantiles"] y = self.qq["residuals"] - ax.plot(x, x, ls="--", label="perfect model", color="darkorange", linewidth=3) + ax.plot( + x, x, ls="--", label="perfect model", color="darkorange", linewidth=3 + ) ax.plot(x, y, label="current model", color="cornflowerblue", linewidth=3) ax.set_xlabel("Theoretical quantiles", fontsize=15) - ax.set_xticks(range(int(np.floor(min(x))), int(np.ceil(max(x[x < float("inf")])))+1, 1)) + ax.set_xticks( + range( + int(np.floor(min(x))), int(np.ceil(max(x[x < float("inf")]))) + 1, 1 + ) + ) ax.set_ylabel("Standardized residuals", fontsize=15) - ax.set_yticks(range(int(np.floor(min(y))), int(np.ceil(max(y[x < float("inf")])))+1, 1)) + ax.set_yticks( + range( + int(np.floor(min(y))), int(np.ceil(max(y[x < float("inf")]))) + 1, 1 + ) + ) ax.legend(loc="best") ax.set_title("Q-Q plot", fontsize=20) @@ -733,4 +805,4 @@ def plot_qq(self, path: str=None, dim: tuple=(12, 8)): if path: plt.savefig(path, format="png", dpi=300, bbox_inches="tight") - plt.show() \ No newline at end of file + plt.show() diff --git a/cobra/evaluation/pigs_tables.py b/cobra/evaluation/pigs_tables.py index 6cca2d0..9d23d8c 100644 --- a/cobra/evaluation/pigs_tables.py +++ b/cobra/evaluation/pigs_tables.py @@ -1,4 +1,3 @@ - import pandas as pd import matplotlib.pyplot as plt import seaborn as sns @@ -7,10 +6,13 @@ import cobra.utils as utils -def generate_pig_tables(basetable: pd.DataFrame, - target_column_name: str, - preprocessed_predictors: list, - id_column_name: str = None) -> pd.DataFrame: + +def generate_pig_tables( + basetable: pd.DataFrame, + target_column_name: str, + preprocessed_predictors: list, + id_column_name: str = None, +) -> pd.DataFrame: """Compute PIG tables for all predictors in preprocessed_predictors. The output is a DataFrame with columns ``variable``, ``label``, @@ -26,25 +28,25 @@ def generate_pig_tables(basetable: pd.DataFrame, List of basetable column names containing preprocessed predictors. id_column_name : str, default=None Name of the basetable column containing the IDs of the basetable rows - (e.g. customernumber). + (e.g. customernumber). Returns ------- pd.DataFrame DataFrame containing a PIG table for all predictors. """ - #check if there is a id-column and define no_predictor accordingly + # check if there is a id-column and define no_predictor accordingly if id_column_name == None: no_predictor = [target_column_name] else: no_predictor = [id_column_name, target_column_name] - pigs = [ - compute_pig_table(basetable, - column_name, - target_column_name, - ) + compute_pig_table( + basetable, + column_name, + target_column_name, + ) for column_name in sorted(preprocessed_predictors) if column_name not in no_predictor ] @@ -52,9 +54,9 @@ def generate_pig_tables(basetable: pd.DataFrame, return output -def compute_pig_table(basetable: pd.DataFrame, - predictor_column_name: str, - target_column_name: str) -> pd.DataFrame: +def compute_pig_table( + basetable: pd.DataFrame, predictor_column_name: str, target_column_name: str +) -> pd.DataFrame: """Compute the PIG table of a given predictor for a given target. Parameters @@ -77,37 +79,36 @@ def compute_pig_table(basetable: pd.DataFrame, # (= mean of the target for the given bin) and compute the bin size # (e.g. COUNT(id_column_name)). 
After that, rename the columns - res = (basetable.groupby(predictor_column_name) - .agg( - avg_target = (target_column_name, "mean"), - pop_size = (target_column_name, "size") - ) - .reset_index() - .rename( - columns={predictor_column_name: "label"} - ) + res = ( + basetable.groupby(predictor_column_name) + .agg( + avg_target=(target_column_name, "mean"), + pop_size=(target_column_name, "size"), + ) + .reset_index() + .rename(columns={predictor_column_name: "label"}) ) - # add the column name to a variable column # add the average incidence # replace population size by a percentage of total population res["variable"] = utils.clean_predictor_name(predictor_column_name) res["global_avg_target"] = global_avg_target - res["pop_size"] = res["pop_size"]/len(basetable.index) + res["pop_size"] = res["pop_size"] / len(basetable.index) # make sure to always return the data with the proper column order - column_order = ["variable", "label", "pop_size", - "global_avg_target", "avg_target"] + column_order = ["variable", "label", "pop_size", "global_avg_target", "avg_target"] return res[column_order] -def plot_incidence(pig_tables: pd.DataFrame, - variable: str, - model_type: str, - column_order: list=None, - dim: tuple=(12, 8)): +def plot_incidence( + pig_tables: pd.DataFrame, + variable: str, + model_type: str, + column_order: list = None, + dim: tuple = (12, 8), +): """Plots a Predictor Insights Graph (PIG), a graph in which the mean target value is plotted for a number of bins constructed from a predictor variable. When the target is a binary classification target, @@ -132,26 +133,28 @@ def plot_incidence(pig_tables: pd.DataFrame, Optional tuple to configure the width and length of the plot. """ if model_type not in ["classification", "regression"]: - raise ValueError("An unexpected value was set for the model_type " - "parameter. Expected 'classification' or " - "'regression'.") + raise ValueError( + "An unexpected value was set for the model_type " + "parameter. Expected 'classification' or " + "'regression'." + ) - df_plot = pig_tables[pig_tables['variable'] == variable].copy() + df_plot = pig_tables[pig_tables["variable"] == variable].copy() if column_order is not None: - if not set(df_plot['label']) == set(column_order): + if not set(df_plot["label"]) == set(column_order): raise ValueError( - 'The column_order and pig_tables parameters do not contain ' - 'the same set of variables.') + "The column_order and pig_tables parameters do not contain " + "the same set of variables." 
+ ) - df_plot['label'] = df_plot['label'].astype('category') - df_plot['label'].cat.reorder_categories(column_order, - inplace=True) + df_plot["label"] = df_plot["label"].astype("category") + df_plot["label"].cat.reorder_categories(column_order, inplace=True) - df_plot.sort_values(by=['label'], ascending=True, inplace=True) + df_plot.sort_values(by=["label"], ascending=True, inplace=True) df_plot.reset_index(inplace=True) else: - df_plot.sort_values(by=['avg_target'], ascending=False, inplace=True) + df_plot.sort_values(by=["avg_target"], ascending=False, inplace=True) df_plot.reset_index(inplace=True) with plt.style.context("seaborn-ticks"): @@ -160,35 +163,49 @@ def plot_incidence(pig_tables: pd.DataFrame, # -------------------------- # Left axis - average target # -------------------------- - ax.plot(df_plot['label'], df_plot['avg_target'], - color="#00ccff", marker=".", - markersize=20, linewidth=3, - label='incidence rate per bin' if model_type == "classification" else "mean target value per bin", - zorder=10) - - ax.plot(df_plot['label'], df_plot['global_avg_target'], - color="#022252", linestyle='--', linewidth=4, - label='average incidence rate' if model_type == "classification" else "global mean target value", - zorder=10) + ax.plot( + df_plot["label"], + df_plot["avg_target"], + color="#00ccff", + marker=".", + markersize=20, + linewidth=3, + label="incidence rate per bin" + if model_type == "classification" + else "mean target value per bin", + zorder=10, + ) + + ax.plot( + df_plot["label"], + df_plot["global_avg_target"], + color="#022252", + linestyle="--", + linewidth=4, + label="average incidence rate" + if model_type == "classification" + else "global mean target value", + zorder=10, + ) # Dummy line to have label on second axis from first - ax.plot(np.nan, "#939598", linewidth=6, label='bin size') + ax.plot(np.nan, "#939598", linewidth=6, label="bin size") # Set labels & ticks - ax.set_ylabel('Incidence' if model_type == "classification" else "Mean target value", - fontsize=16) + ax.set_ylabel( + "Incidence" if model_type == "classification" else "Mean target value", + fontsize=16, + ) ax.set_xlabel("Bins", fontsize=15) ax.xaxis.set_tick_params(labelsize=14) - plt.setp(ax.get_xticklabels(), - rotation=45, ha="right", rotation_mode="anchor") + plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") ax.yaxis.set_tick_params(labelsize=14) if model_type == "classification": # Mean target values are between 0 and 1 (target incidence rate), # so format them as percentages - ax.set_yticks(np.arange(0, max(df_plot['avg_target'])+0.05, 0.05)) - ax.yaxis.set_major_formatter( - FuncFormatter(lambda y, _: '{:.1%}'.format(y))) + ax.set_yticks(np.arange(0, max(df_plot["avg_target"]) + 0.05, 0.05)) + ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: "{:.1%}".format(y))) elif model_type == "regression": # If the difference between the highest avg. target of all bins # versus the global avg. target AND the difference between the @@ -200,40 +217,52 @@ def plot_incidence(pig_tables: pd.DataFrame, # the bins and versus the global avg. target. # (Motivation for the AND above: if on one end there IS enough # difference, the effect that we discuss here does not occur.) - global_avg_target = max(df_plot['global_avg_target']) # series of same number, for every bin. 
- if ((np.abs((max(df_plot['avg_target']) - global_avg_target)) / global_avg_target < 0.25) - and (np.abs((min(df_plot['avg_target']) - global_avg_target)) / global_avg_target < 0.25)): - ax.set_ylim(global_avg_target * 0.75, - global_avg_target * 1.25) + global_avg_target = max( + df_plot["global_avg_target"] + ) # series of same number, for every bin. + if ( + np.abs((max(df_plot["avg_target"]) - global_avg_target)) + / global_avg_target + < 0.25 + ) and ( + np.abs((min(df_plot["avg_target"]) - global_avg_target)) + / global_avg_target + < 0.25 + ): + ax.set_ylim(global_avg_target * 0.75, global_avg_target * 1.25) # Remove ticks but keep the labels - ax.tick_params(axis='both', which='both', length=0) - ax.tick_params(axis='y', colors="#00ccff") - ax.yaxis.label.set_color('#00ccff') + ax.tick_params(axis="both", which="both", length=0) + ax.tick_params(axis="y", colors="#00ccff") + ax.yaxis.label.set_color("#00ccff") # ----------------- # Right Axis - bins # ----------------- ax2 = ax.twinx() - ax2.bar(df_plot['label'], df_plot['pop_size'], - align='center', color="#939598", zorder=1) + ax2.bar( + df_plot["label"], + df_plot["pop_size"], + align="center", + color="#939598", + zorder=1, + ) # Set labels & ticks ax2.set_xlabel("Bins", fontsize=15) ax2.xaxis.set_tick_params(rotation=45, labelsize=14) ax2.yaxis.set_tick_params(labelsize=14) - ax2.yaxis.set_major_formatter( - FuncFormatter(lambda y, _: '{:.1%}'.format(y))) - ax2.set_ylabel('Population size', fontsize=15) - ax2.tick_params(axis='y', colors="#939598") - ax2.yaxis.label.set_color('#939598') + ax2.yaxis.set_major_formatter(FuncFormatter(lambda y, _: "{:.1%}".format(y))) + ax2.set_ylabel("Population size", fontsize=15) + ax2.tick_params(axis="y", colors="#939598") + ax2.yaxis.label.set_color("#939598") # Despine & prettify sns.despine(ax=ax, right=True, left=True) sns.despine(ax=ax2, left=True, right=False) - ax2.spines['right'].set_color('white') + ax2.spines["right"].set_color("white") ax2.grid(False) @@ -244,9 +273,15 @@ def plot_incidence(pig_tables: pd.DataFrame, title = "Mean target plot" fig.suptitle(title, fontsize=20) plt.title(variable, fontsize=17) - ax.legend(frameon=False, bbox_to_anchor=(0., 1.01, 1., .102), - loc=3, ncol=1, mode="expand", borderaxespad=0., - prop={"size": 14}) + ax.legend( + frameon=False, + bbox_to_anchor=(0.0, 1.01, 1.0, 0.102), + loc=3, + ncol=1, + mode="expand", + borderaxespad=0.0, + prop={"size": 14}, + ) # Set order of layers ax.set_zorder(1) diff --git a/cobra/evaluation/plotting_utils.py b/cobra/evaluation/plotting_utils.py index 8cac03c..6e5f7a9 100644 --- a/cobra/evaluation/plotting_utils.py +++ b/cobra/evaluation/plotting_utils.py @@ -1,4 +1,3 @@ - # third party imports import numpy as np import pandas as pd @@ -6,9 +5,10 @@ import matplotlib.pyplot as plt import seaborn as sns -def plot_univariate_predictor_quality(df_metric: pd.DataFrame, - dim: tuple=(12, 8), - path: str=None): + +def plot_univariate_predictor_quality( + df_metric: pd.DataFrame, dim: tuple = (12, 8), path: str = None +): """Plot univariate quality of the predictors. 
Parameters @@ -30,13 +30,17 @@ def plot_univariate_predictor_quality(df_metric: pd.DataFrame, metric = "RMSE" ascending = True - df = (df_metric[df_metric["preselection"]] - .sort_values(by=metric+" selection", ascending=ascending)) + df = df_metric[df_metric["preselection"]].sort_values( + by=metric + " selection", ascending=ascending + ) - df = pd.melt(df, id_vars=["predictor"], - value_vars=[metric+" train", metric+" selection"], - var_name="split", - value_name=metric) + df = pd.melt( + df, + id_vars=["predictor"], + value_vars=[metric + " train", metric + " selection"], + var_name="split", + value_name=metric, + ) # plot data with plt.style.context("seaborn-ticks"): @@ -60,9 +64,10 @@ def plot_univariate_predictor_quality(df_metric: pd.DataFrame, plt.show() -def plot_correlation_matrix(df_corr: pd.DataFrame, - dim: tuple=(12, 8), - path: str=None): + +def plot_correlation_matrix( + df_corr: pd.DataFrame, dim: tuple = (12, 8), path: str = None +): """Plot correlation matrix of the predictors. Parameters @@ -83,13 +88,18 @@ def plot_correlation_matrix(df_corr: pd.DataFrame, plt.show() -def plot_performance_curves(model_performance: pd.DataFrame, - dim: tuple=(12, 8), - path: str=None, - colors: dict={"train": "#0099bf", - "selection": "#ff9500", - "validation": "#8064a2"}, - metric_name: str=None): + +def plot_performance_curves( + model_performance: pd.DataFrame, + dim: tuple = (12, 8), + path: str = None, + colors: dict = { + "train": "#0099bf", + "selection": "#ff9500", + "validation": "#8064a2", + }, + metric_name: str = None, +): """Plot performance curves generated by the forward feature selection for the train-selection-validation sets. @@ -118,28 +128,49 @@ def plot_performance_curves(model_performance: pd.DataFrame, elif model_type == "regression": metric_name = "RMSE" - max_metric = np.round(max(max(model_performance['train_performance']), - max(model_performance['selection_performance']), - max(model_performance['validation_performance'])), 1) + max_metric = np.round( + max( + max(model_performance["train_performance"]), + max(model_performance["selection_performance"]), + max(model_performance["validation_performance"]), + ), + 1, + ) with plt.style.context("seaborn-whitegrid"): fig, ax = plt.subplots(figsize=dim) - plt.plot(model_performance['train_performance'], marker=".", - markersize=20, linewidth=3, label="train", - color=colors["train"]) - plt.plot(model_performance['selection_performance'], marker=".", - markersize=20, linewidth=3, label="selection", - color=colors["selection"]) - plt.plot(model_performance['validation_performance'], marker=".", - markersize=20, linewidth=3, label="validation", - color=colors["validation"]) + plt.plot( + model_performance["train_performance"], + marker=".", + markersize=20, + linewidth=3, + label="train", + color=colors["train"], + ) + plt.plot( + model_performance["selection_performance"], + marker=".", + markersize=20, + linewidth=3, + label="selection", + color=colors["selection"], + ) + plt.plot( + model_performance["validation_performance"], + marker=".", + markersize=20, + linewidth=3, + label="validation", + color=colors["validation"], + ) # Set x- and y-ticks - ax.set_xticks(np.arange(len(model_performance['last_added_predictor']))) - ax.set_xticklabels(model_performance['last_added_predictor'].tolist(), - rotation=40, ha='right') + ax.set_xticks(np.arange(len(model_performance["last_added_predictor"]))) + ax.set_xticklabels( + model_performance["last_added_predictor"].tolist(), rotation=40, ha="right" + ) if model_type == 
"classification": ax.set_yticks(np.arange(0.5, max_metric + 0.02, 0.05)) @@ -147,24 +178,26 @@ def plot_performance_curves(model_performance: pd.DataFrame, # In regression, the scale of the y-axis can largely vary depending # on the dataset, it is easier to just set the y-axis bounds, # but not the tick distance. - ax.set_ylim(0, max_metric*1.1) + ax.set_ylim(0, max_metric * 1.1) # Make pretty - ax.legend(loc='lower right') - fig.suptitle('Performance curves forward feature selection', - fontsize=20) - plt.title("Metric: "+metric_name, fontsize=15, loc="left") - plt.ylabel('Model performance', fontsize=15) + ax.legend(loc="lower right") + fig.suptitle("Performance curves forward feature selection", fontsize=20) + plt.title("Metric: " + metric_name, fontsize=15, loc="left") + plt.ylabel("Model performance", fontsize=15) if path is not None: plt.savefig(path, format="png", dpi=300, bbox_inches="tight") plt.show() -def plot_variable_importance(df_variable_importance: pd.DataFrame, - title: str=None, - dim: tuple=(12, 8), - path: str=None): + +def plot_variable_importance( + df_variable_importance: pd.DataFrame, + title: str = None, + dim: tuple = (12, 8), + path: str = None, +): """Plot variable importance of a given model. Parameters @@ -180,9 +213,12 @@ def plot_variable_importance(df_variable_importance: pd.DataFrame, """ with plt.style.context("seaborn-ticks"): fig, ax = plt.subplots(figsize=dim) - ax = sns.barplot(x="importance", y="predictor", - data=df_variable_importance, - color="cornflowerblue") + ax = sns.barplot( + x="importance", + y="predictor", + data=df_variable_importance, + color="cornflowerblue", + ) if title: ax.set_title(title, fontsize=20) else: @@ -190,8 +226,8 @@ def plot_variable_importance(df_variable_importance: pd.DataFrame, # Make pretty axis sns.despine(ax=ax, right=True) - plt.ylabel('Predictor', fontsize=15) - plt.xlabel('Importance', fontsize=15) + plt.ylabel("Predictor", fontsize=15) + plt.xlabel("Importance", fontsize=15) # Remove white lines from the second axis ax.grid(False) diff --git a/cobra/model_building/__init__.py b/cobra/model_building/__init__.py index 7a646c3..19c5bf9 100644 --- a/cobra/model_building/__init__.py +++ b/cobra/model_building/__init__.py @@ -5,9 +5,11 @@ from .models import LogisticRegressionModel, LinearRegressionModel from .forward_selection import ForwardFeatureSelection -__all__ = ['compute_univariate_preselection', - 'get_preselected_predictors', - 'compute_correlations', - 'LogisticRegressionModel', - 'LinearRegressionModel', - 'ForwardFeatureSelection'] +__all__ = [ + "compute_univariate_preselection", + "get_preselected_predictors", + "compute_correlations", + "LogisticRegressionModel", + "LinearRegressionModel", + "ForwardFeatureSelection", +] diff --git a/cobra/model_building/forward_selection.py b/cobra/model_building/forward_selection.py index 29e06b3..e39512e 100644 --- a/cobra/model_building/forward_selection.py +++ b/cobra/model_building/forward_selection.py @@ -1,4 +1,3 @@ - import logging from typing import Callable, Optional @@ -9,6 +8,7 @@ log = logging.getLogger(__name__) + class ForwardFeatureSelection: """Perform forward feature selection for a given dataset using a given algorithm. @@ -35,10 +35,12 @@ class ForwardFeatureSelection: List of fitted models. 
""" - def __init__(self, - model_type: str="classification", - max_predictors: int=50, - pos_only: bool=True): + def __init__( + self, + model_type: str = "classification", + max_predictors: int = 50, + pos_only: bool = True, + ): self.model_type = model_type if model_type == "classification": @@ -70,16 +72,20 @@ def get_model_from_step(self, step: int): In case step is larger than the number of available models. """ if len(self._fitted_models) <= step: - raise ValueError(f"No model available for step {step}. " - "The first step starts from index 0.") + raise ValueError( + f"No model available for step {step}. " + "The first step starts from index 0." + ) return self._fitted_models[step] - def compute_model_performances(self, data: pd.DataFrame, - target_column_name: str, - splits: list=["train", "selection", "validation"], - metric: Optional[Callable]=None, - ) -> pd.DataFrame: + def compute_model_performances( + self, + data: pd.DataFrame, + target_column_name: str, + splits: list = ["train", "selection", "validation"], + metric: Optional[Callable] = None, + ) -> pd.DataFrame: """Compute for each model the performance for different sets (e.g. train-selection-validation) and return them along with a list of predictors used in the model. Note that the computation of the @@ -112,24 +118,25 @@ def compute_model_performances(self, data: pd.DataFrame, predictor_set = set([]) for model in self._fitted_models: - last_added_predictor = (set(model.predictors) - .difference(predictor_set)) + last_added_predictor = set(model.predictors).difference(predictor_set) tmp = { "predictors": model.predictors, - "last_added_predictor": list(last_added_predictor)[0] + "last_added_predictor": list(last_added_predictor)[0], } # Evaluate model on each dataset split, # e.g. train-selection-validation - tmp.update({ - f"{split}_performance": model.evaluate( - data[data["split"] == split], - data[data["split"] == split][target_column_name], - split=split, # parameter used for caching - metric=metric - ) - for split in splits - }) + tmp.update( + { + f"{split}_performance": model.evaluate( + data[data["split"] == split], + data[data["split"] == split][target_column_name], + split=split, # parameter used for caching + metric=metric, + ) + for split in splits + } + ) results.append(tmp) @@ -140,9 +147,14 @@ def compute_model_performances(self, data: pd.DataFrame, return df - def fit(self, train_data: pd.DataFrame, target_column_name: str, - predictors: list, forced_predictors: list=[], - excluded_predictors: list=[]): + def fit( + self, + train_data: pd.DataFrame, + target_column_name: str, + predictors: list, + forced_predictors: list = [], + excluded_predictors: list = [], + ): """Fit the forward feature selection estimator. Parameters @@ -169,38 +181,57 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str, number of allowed predictors in the model. """ - assert "split" in train_data.columns, "The train_data input df does not include a split column." - assert len(set(["train", "selection"]).difference(set(train_data["split"].unique()))) == 0, \ - "The train_data input df does not include a 'train' and 'selection' split." + assert ( + "split" in train_data.columns + ), "The train_data input df does not include a split column." + assert ( + len( + set(["train", "selection"]).difference( + set(train_data["split"].unique()) + ) + ) + == 0 + ), "The train_data input df does not include a 'train' and 'selection' split." 
# remove excluded predictors from predictor lists - filtered_predictors = [var for var in predictors - if (var not in excluded_predictors and - var not in forced_predictors)] + filtered_predictors = [ + var + for var in predictors + if (var not in excluded_predictors and var not in forced_predictors) + ] # checks on predictor lists and self.max_predictors attr if len(forced_predictors) > self.max_predictors: - raise ValueError("Size of forced_predictors cannot be bigger than " - "max_predictors.") + raise ValueError( + "Size of forced_predictors cannot be bigger than " "max_predictors." + ) elif len(forced_predictors) == self.max_predictors: - log.info("Size of forced_predictors equals max_predictors " - "only one model will be trained...") + log.info( + "Size of forced_predictors equals max_predictors " + "only one model will be trained..." + ) # train model with all forced_predictors (only) - (self._fitted_models - .append(self._train_model(train_data[train_data["split"] == "train"], - target_column_name, - forced_predictors))) + ( + self._fitted_models.append( + self._train_model( + train_data[train_data["split"] == "train"], + target_column_name, + forced_predictors, + ) + ) + ) else: - self._fitted_models = self._forward_selection(train_data, - target_column_name, - filtered_predictors, - forced_predictors) - - def _forward_selection(self, - train_data: pd.DataFrame, - target_column_name: str, - predictors: list, - forced_predictors: list = []) -> list: + self._fitted_models = self._forward_selection( + train_data, target_column_name, filtered_predictors, forced_predictors + ) + + def _forward_selection( + self, + train_data: pd.DataFrame, + target_column_name: str, + predictors: list, + forced_predictors: list = [], + ) -> list: """Perform the forward feature selection algorithm to compute a list of models (with increasing performance). The length of the list, i.e. the number of models, is bounded by the max_predictors class @@ -226,29 +257,34 @@ def _forward_selection(self, fitted_models = [] current_predictors = [] - max_steps = 1 + min(self.max_predictors, - len(predictors) + len(forced_predictors)) + max_steps = 1 + min( + self.max_predictors, len(predictors) + len(forced_predictors) + ) - for step in tqdm(range(1, max_steps), desc="Sequentially adding best " - "predictor..."): + for step in tqdm( + range(1, max_steps), desc="Sequentially adding best " "predictor..." 
+ ): if step <= len(forced_predictors): # first, we go through the forced predictors - candidate_predictors = [var for var in forced_predictors - if var not in current_predictors] + candidate_predictors = [ + var for var in forced_predictors if var not in current_predictors + ] else: - candidate_predictors = [var for var in (predictors - + forced_predictors) - if var not in current_predictors] + candidate_predictors = [ + var + for var in (predictors + forced_predictors) + if var not in current_predictors + ] - model = self._find_next_best_model(train_data, - target_column_name, - candidate_predictors, - current_predictors) + model = self._find_next_best_model( + train_data, target_column_name, candidate_predictors, current_predictors + ) if model is not None: # Add new model predictors to the list of current predictors - current_predictors = list(set(current_predictors) - .union(set(model.predictors))) + current_predictors = list( + set(current_predictors).union(set(model.predictors)) + ) fitted_models.append(model) # else: @@ -262,11 +298,13 @@ def _forward_selection(self, return fitted_models - def _find_next_best_model(self, - train_data: pd.DataFrame, - target_column_name: str, - candidate_predictors: list, - current_predictors: list): + def _find_next_best_model( + self, + train_data: pd.DataFrame, + target_column_name: str, + candidate_predictors: list, + current_predictors: list, + ): """Given a list of current predictors which are already selected to be include in the model, find amongst a list candidate predictors the predictor to add to the selected list so that the resulting model @@ -295,42 +333,54 @@ def _find_next_best_model(self, elif self.MLModel == LinearRegressionModel: best_performance = float("inf") # RMSE metric is used else: - raise ValueError("No metric comparison method has been configured " - "for the given model_type specified as " - "ForwardFeatureSelection argument.") - - fit_data = train_data[train_data["split"] == "train"] # data to fit the models with - sel_data = train_data[train_data["split"] == "selection"] # data to compare the models with + raise ValueError( + "No metric comparison method has been configured " + "for the given model_type specified as " + "ForwardFeatureSelection argument." + ) + + fit_data = train_data[ + train_data["split"] == "train" + ] # data to fit the models with + sel_data = train_data[ + train_data["split"] == "selection" + ] # data to compare the models with for pred in candidate_predictors: # Train a model with an additional predictor - model = self._train_model(fit_data, target_column_name, - (current_predictors + [pred])) + model = self._train_model( + fit_data, target_column_name, (current_predictors + [pred]) + ) # Evaluate the model - performance = (model - .evaluate(sel_data[current_predictors + [pred]], - sel_data[target_column_name], - split="selection")) + performance = model.evaluate( + sel_data[current_predictors + [pred]], + sel_data[target_column_name], + split="selection", + ) if self.pos_only and (not (model.get_coef() >= 0).all()): continue # Check if the model is better than the current best model # and if it is, replace the current best. 
- if self.MLModel == LogisticRegressionModel \ - and performance > best_performance: # AUC metric is used + if ( + self.MLModel == LogisticRegressionModel + and performance > best_performance + ): # AUC metric is used best_performance = performance best_model = model - elif self.MLModel == LinearRegressionModel \ - and performance < best_performance: # RMSE metric is used + elif ( + self.MLModel == LinearRegressionModel and performance < best_performance + ): # RMSE metric is used best_performance = performance best_model = model return best_model - def _train_model(self, train_data: pd.DataFrame, target_column_name: str, - predictors: list): + def _train_model( + self, train_data: pd.DataFrame, target_column_name: str, predictors: list + ): """Train the model with a given set of predictors. Parameters diff --git a/cobra/model_building/models.py b/cobra/model_building/models.py index 3a921c0..a456b81 100644 --- a/cobra/model_building/models.py +++ b/cobra/model_building/models.py @@ -1,4 +1,3 @@ - from typing import Callable, Optional # third party imports @@ -14,6 +13,7 @@ import cobra.utils as utils from cobra.evaluation import ClassificationEvaluator + class LogisticRegressionModel: """Wrapper around the LogisticRegression class, with additional methods implemented such as evaluation (using AUC), getting a list of coefficients, @@ -28,8 +28,9 @@ class LogisticRegressionModel: """ def __init__(self): - self.logit = LogisticRegression(fit_intercept=True, C=1e9, - solver='liblinear', random_state=42) + self.logit = LogisticRegression( + fit_intercept=True, C=1e9, solver="liblinear", random_state=42 + ) self._is_fitted = False # placeholder to keep track of a list of predictors self.predictors = [] @@ -47,16 +48,18 @@ def serialize(self) -> dict: "meta": "logistic-regression", "predictors": self.predictors, "_eval_metrics_by_split": self._eval_metrics_by_split, - "params": self.logit.get_params() + "params": self.logit.get_params(), } if self._is_fitted: - serialized_model.update({ - "classes_": self.logit.classes_.tolist(), - "coef_": self.logit.coef_.tolist(), - "intercept_": self.logit.intercept_.tolist(), - "n_iter_": self.logit.n_iter_.tolist(), - }) + serialized_model.update( + { + "classes_": self.logit.classes_.tolist(), + "coef_": self.logit.coef_.tolist(), + "intercept_": self.logit.intercept_.tolist(), + "n_iter_": self.logit.n_iter_.tolist(), + } + ) return serialized_model @@ -147,9 +150,13 @@ def score_model(self, X: pd.DataFrame) -> np.ndarray: # ensure we have the proper predictors and the proper order return self.logit.predict_proba(X[self.predictors])[:, 1] - def evaluate(self, X: pd.DataFrame, y: pd.Series, - split: str=None, - metric: Optional[Callable]=None) -> float: + def evaluate( + self, + X: pd.DataFrame, + y: pd.Series, + split: str = None, + metric: Optional[Callable] = None, + ) -> float: """Evaluate the model on a given dataset (X, y). 
The optional split parameter is to indicate that the dataset belongs to (train, selection, validation), so that the computation on these sets @@ -179,7 +186,9 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series, y_pred = self.score_model(X) fpr, tpr, thresholds = roc_curve(y_true=y, y_score=y_pred) - cutoff = (ClassificationEvaluator._compute_optimal_cutoff(fpr, tpr, thresholds)) + cutoff = ClassificationEvaluator._compute_optimal_cutoff( + fpr, tpr, thresholds + ) y_pred_b = np.array([0 if pred <= cutoff else 1 for pred in y_pred]) performance = metric(y_true=y, y_pred=y_pred_b) @@ -216,23 +225,21 @@ def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: importance_by_variable = { utils.clean_predictor_name(predictor): stats.pearsonr( - data[predictor], - y_pred - )[0] + data[predictor], y_pred + )[0] for predictor in self.predictors } - df = pd.DataFrame.from_dict(importance_by_variable, - orient="index").reset_index() + df = pd.DataFrame.from_dict( + importance_by_variable, orient="index" + ).reset_index() df.columns = ["predictor", "importance"] - return (df.sort_values(by="importance", ascending=False) - .reset_index(drop=True)) + return df.sort_values(by="importance", ascending=False).reset_index(drop=True) def _is_valid_dict(self, model_dict: dict) -> bool: - if ("meta" not in model_dict - or model_dict["meta"] != "logistic-regression"): + if "meta" not in model_dict or model_dict["meta"] != "logistic-regression": return False attr = ["classes_", "coef_", "intercept_", "n_iter_", "predictors"] @@ -240,8 +247,7 @@ def _is_valid_dict(self, model_dict: dict) -> bool: if not (key in model_dict or type(model_dict[key]) != list): return False - if ("params" not in model_dict - or "_eval_metrics_by_split" not in model_dict): + if "params" not in model_dict or "_eval_metrics_by_split" not in model_dict: return False return True @@ -278,14 +284,16 @@ def serialize(self) -> dict: "meta": "linear-regression", "predictors": self.predictors, "_eval_metrics_by_split": self._eval_metrics_by_split, - "params": self.linear.get_params() + "params": self.linear.get_params(), } if self._is_fitted: - serialized_model.update({ - "coef_": self.linear.coef_.tolist(), - "intercept_": self.linear.intercept_.tolist() - }) + serialized_model.update( + { + "coef_": self.linear.coef_.tolist(), + "intercept_": self.linear.intercept_.tolist(), + } + ) return serialized_model @@ -374,9 +382,13 @@ def score_model(self, X: pd.DataFrame) -> np.ndarray: # ensure we have the proper predictors and the proper order return self.linear.predict(X[self.predictors]) - def evaluate(self, X: pd.DataFrame, y: pd.Series, - split: str=None, - metric: Optional[Callable]=None) -> float: + def evaluate( + self, + X: pd.DataFrame, + y: pd.Series, + split: str = None, + metric: Optional[Callable] = None, + ) -> float: """Evaluate the model on a given dataset (X, y). 
The optional split parameter is to indicate that the dataset belongs to (train, selection, validation), so that the computation on these sets @@ -438,23 +450,21 @@ def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: importance_by_variable = { utils.clean_predictor_name(predictor): stats.pearsonr( - data[predictor], - y_pred - )[0] + data[predictor], y_pred + )[0] for predictor in self.predictors } - df = pd.DataFrame.from_dict(importance_by_variable, - orient="index").reset_index() + df = pd.DataFrame.from_dict( + importance_by_variable, orient="index" + ).reset_index() df.columns = ["predictor", "importance"] - return (df.sort_values(by="importance", ascending=False) - .reset_index(drop=True)) + return df.sort_values(by="importance", ascending=False).reset_index(drop=True) def _is_valid_dict(self, model_dict: dict) -> bool: - if ("meta" not in model_dict - or model_dict["meta"] != "linear-regression"): + if "meta" not in model_dict or model_dict["meta"] != "linear-regression": return False attr = ["coef_", "intercept_", "predictors"] @@ -462,8 +472,7 @@ def _is_valid_dict(self, model_dict: dict) -> bool: if not (key in model_dict or type(model_dict[key]) != list): return False - if ("params" not in model_dict - or "_eval_metrics_by_split" not in model_dict): + if "params" not in model_dict or "_eval_metrics_by_split" not in model_dict: return False return True diff --git a/cobra/model_building/univariate_selection.py b/cobra/model_building/univariate_selection.py index 2db4abb..5f60e8f 100644 --- a/cobra/model_building/univariate_selection.py +++ b/cobra/model_building/univariate_selection.py @@ -1,19 +1,20 @@ - import pandas as pd from sklearn.metrics import roc_auc_score, mean_squared_error from numpy import sqrt import cobra.utils as utils -def compute_univariate_preselection(target_enc_train_data: pd.DataFrame, - target_enc_selection_data: pd.DataFrame, - predictors: list, - target_column: str, - model_type: str = "classification", - preselect_auc_threshold: float = 0.053, - preselect_rmse_threshold: float = 5, - preselect_overtrain_threshold: float = 0.05 - ) -> pd.DataFrame: + +def compute_univariate_preselection( + target_enc_train_data: pd.DataFrame, + target_enc_selection_data: pd.DataFrame, + predictors: list, + target_column: str, + model_type: str = "classification", + preselect_auc_threshold: float = 0.053, + preselect_rmse_threshold: float = 5, + preselect_overtrain_threshold: float = 0.05, +) -> pd.DataFrame: """Perform a preselection of predictors based on an AUC (in case of classification) or a RMSE (in case of regression) threshold of a univariate model on a train and selection dataset and return a DataFrame @@ -71,15 +72,21 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame, auc_train = roc_auc_score( y_true=target_enc_train_data[target_column], - y_score=target_enc_train_data[predictor]) + y_score=target_enc_train_data[predictor], + ) auc_selection = roc_auc_score( y_true=target_enc_selection_data[target_column], - y_score=target_enc_selection_data[predictor]) + y_score=target_enc_selection_data[predictor], + ) - result.append({"predictor": cleaned_predictor, - "AUC train": auc_train, - "AUC selection": auc_selection}) + result.append( + { + "predictor": cleaned_predictor, + "AUC train": auc_train, + "AUC selection": auc_selection, + } + ) df_auc = pd.DataFrame(result) @@ -88,28 +95,41 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame, # Identify those variables for which the AUC difference between 
train # and selection is within a user-defined ratio - auc_overtrain = ((df_auc["AUC train"] - df_auc["AUC selection"]) - < preselect_overtrain_threshold) + auc_overtrain = ( + df_auc["AUC train"] - df_auc["AUC selection"] + ) < preselect_overtrain_threshold df_auc["preselection"] = auc_thresh & auc_overtrain - df_out = df_auc.sort_values(by="AUC selection", ascending=False).reset_index(drop=True) + df_out = df_auc.sort_values(by="AUC selection", ascending=False).reset_index( + drop=True + ) elif model_type == "regression": for predictor in predictors: cleaned_predictor = utils.clean_predictor_name(predictor) - rmse_train = sqrt(mean_squared_error( - y_true=target_enc_train_data[target_column], - y_pred=target_enc_train_data[predictor])) - - rmse_selection = sqrt(mean_squared_error( - y_true=target_enc_selection_data[target_column], - y_pred=target_enc_selection_data[predictor])) - - result.append({"predictor": cleaned_predictor, - "RMSE train": rmse_train, - "RMSE selection": rmse_selection}) + rmse_train = sqrt( + mean_squared_error( + y_true=target_enc_train_data[target_column], + y_pred=target_enc_train_data[predictor], + ) + ) + + rmse_selection = sqrt( + mean_squared_error( + y_true=target_enc_selection_data[target_column], + y_pred=target_enc_selection_data[predictor], + ) + ) + + result.append( + { + "predictor": cleaned_predictor, + "RMSE train": rmse_train, + "RMSE selection": rmse_selection, + } + ) df_rmse = pd.DataFrame(result) @@ -118,15 +138,19 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame, # Identify those variables for which the RMSE difference between train # and selection is within a user-defined ratio - rmse_overtrain = ((df_rmse["RMSE selection"] - df_rmse["RMSE train"]) # flip subtraction vs. AUC - < preselect_overtrain_threshold) + rmse_overtrain = ( + df_rmse["RMSE selection"] - df_rmse["RMSE train"] + ) < preselect_overtrain_threshold # flip subtraction vs. AUC df_rmse["preselection"] = rmse_thresh & rmse_overtrain - df_out = df_rmse.sort_values(by="RMSE selection", ascending=True).reset_index(drop=True) # lower is better + df_out = df_rmse.sort_values(by="RMSE selection", ascending=True).reset_index( + drop=True + ) # lower is better return df_out + def get_preselected_predictors(df_metric: pd.DataFrame) -> list: """Wrapper function to extract a list of predictors from df_metric. @@ -144,18 +168,24 @@ def get_preselected_predictors(df_metric: pd.DataFrame) -> list: """ if "AUC selection" in df_metric.columns: - predictor_list = (df_metric[df_metric["preselection"]] - .sort_values(by="AUC selection", ascending=False) - .predictor.tolist()) + predictor_list = ( + df_metric[df_metric["preselection"]] + .sort_values(by="AUC selection", ascending=False) + .predictor.tolist() + ) elif "RMSE selection" in df_metric.columns: - predictor_list = (df_metric[df_metric["preselection"]] - .sort_values(by="RMSE selection", ascending=True) # lower is better - .predictor.tolist()) + predictor_list = ( + df_metric[df_metric["preselection"]] + .sort_values(by="RMSE selection", ascending=True) # lower is better + .predictor.tolist() + ) return [col + "_enc" for col in predictor_list] -def compute_correlations(target_enc_train_data: pd.DataFrame, - predictors: list) -> pd.DataFrame: + +def compute_correlations( + target_enc_train_data: pd.DataFrame, predictors: list +) -> pd.DataFrame: """Given a DataFrame and a list of predictors, compute the correlations amongst the predictors in the DataFrame. 
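As an aside to the preselection hunks above: a minimal, self-contained sketch of the classification flagging rule and of get_preselected_predictors, on a made-up df_auc frame. The auc_thresh mask itself is not visible in this hunk, so its form (selection AUC above 0.5 plus preselect_auc_threshold) is an assumption; the predictor names and AUC values are illustrative.

import pandas as pd

df_auc = pd.DataFrame({
    "predictor": ["var1", "var2", "var3"],
    "AUC train": [0.71, 0.64, 0.55],
    "AUC selection": [0.69, 0.58, 0.54],
})

preselect_auc_threshold = 0.053       # default shown in the signature above
preselect_overtrain_threshold = 0.05  # idem

# assumed form of the AUC threshold mask (not shown in this hunk)
auc_thresh = df_auc["AUC selection"] > 0.5 + preselect_auc_threshold
# overfitting guard as in the hunk: train and selection AUC may not diverge too much
auc_overtrain = (df_auc["AUC train"] - df_auc["AUC selection"]) < preselect_overtrain_threshold
df_auc["preselection"] = auc_thresh & auc_overtrain

# get_preselected_predictors: sort by selection AUC and re-append the "_enc" suffix
selected = (
    df_auc[df_auc["preselection"]]
    .sort_values(by="AUC selection", ascending=False)
    .predictor.tolist()
)
print([col + "_enc" for col in selected])  # ['var1_enc'] for this toy data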
@@ -175,8 +205,9 @@ def compute_correlations(target_enc_train_data: pd.DataFrame, correlations = target_enc_train_data[predictors].corr() - predictors_cleaned = [utils.clean_predictor_name(predictor) - for predictor in predictors] + predictors_cleaned = [ + utils.clean_predictor_name(predictor) for predictor in predictors + ] # Change index and columns with the cleaned version of the predictors # e.g. change "var1_enc" with "var1" diff --git a/cobra/preprocessing/__init__.py b/cobra/preprocessing/__init__.py index e02ad4c..3e6c1df 100644 --- a/cobra/preprocessing/__init__.py +++ b/cobra/preprocessing/__init__.py @@ -3,7 +3,9 @@ from .categorical_data_processor import CategoricalDataProcessor from .preprocessor import PreProcessor -__all__ = ['KBinsDiscretizer', - 'TargetEncoder', - 'CategoricalDataProcessor', - 'PreProcessor'] +__all__ = [ + "KBinsDiscretizer", + "TargetEncoder", + "CategoricalDataProcessor", + "PreProcessor", +] diff --git a/cobra/preprocessing/categorical_data_processor.py b/cobra/preprocessing/categorical_data_processor.py index 175bfb5..daca7d5 100644 --- a/cobra/preprocessing/categorical_data_processor.py +++ b/cobra/preprocessing/categorical_data_processor.py @@ -1,4 +1,3 @@ - # standard lib imports import re from typing import Optional @@ -14,6 +13,7 @@ log = logging.getLogger(__name__) + class CategoricalDataProcessor(BaseEstimator): """Regroups the categories of categorical variables based on significance with target variable. @@ -60,22 +60,33 @@ class CategoricalDataProcessor(BaseEstimator): Whether contingency table should be scaled before chi^2. """ - valid_keys = ["model_type", "regroup", "regroup_name", "keep_missing", - "category_size_threshold", "p_value_threshold", - "scale_contingency_table", "forced_categories"] - - def __init__(self, - model_type: str="classification", - regroup: bool=True, - regroup_name: str="Other", - keep_missing: bool=True, - category_size_threshold: int=5, - p_value_threshold: float=0.001, - scale_contingency_table: bool=True, - forced_categories: dict={}): - + valid_keys = [ + "model_type", + "regroup", + "regroup_name", + "keep_missing", + "category_size_threshold", + "p_value_threshold", + "scale_contingency_table", + "forced_categories", + ] + + def __init__( + self, + model_type: str = "classification", + regroup: bool = True, + regroup_name: str = "Other", + keep_missing: bool = True, + category_size_threshold: int = 5, + p_value_threshold: float = 0.001, + scale_contingency_table: bool = True, + forced_categories: dict = {}, + ): + if model_type not in ["classification", "regression"]: - raise ValueError("An unexpected model_type was provided. A valid model_type is either 'classification' or 'regression'.") + raise ValueError( + "An unexpected model_type was provided. A valid model_type is either 'classification' or 'regression'." + ) self.model_type = model_type self.regroup = regroup @@ -125,9 +136,10 @@ def set_attributes_from_dict(self, params: dict): _fitted_output = params.pop("_cleaned_categories_by_column", {}) if type(_fitted_output) != dict: - raise ValueError("_cleaned_categories_by_column is expected to " - "be a dict but is of type {} instead" - .format(type(_fitted_output))) + raise ValueError( + "_cleaned_categories_by_column is expected to " + "be a dict but is of type {} instead".format(type(_fitted_output)) + ) # Clean out params dictionary to remove unknown keys (for safety!) 
params = {key: params[key] for key in params if key in self.valid_keys} @@ -142,8 +154,7 @@ def set_attributes_from_dict(self, params: dict): return self - def fit(self, data: pd.DataFrame, column_names: list, - target_column: str): + def fit(self, data: pd.DataFrame, column_names: list, target_column: str): """Fit the CategoricalDataProcessor. Parameters @@ -162,12 +173,13 @@ def fit(self, data: pd.DataFrame, column_names: list, log.info("regroup was set to False, so no fitting is required") return None - for column_name in tqdm(column_names, desc="Fitting category " - "regrouping..."): + for column_name in tqdm(column_names, desc="Fitting category " "regrouping..."): if column_name not in data.columns: - log.warning("DataFrame has no column '{}', so it will be " - "skipped in fitting" .format(column_name)) + log.warning( + "DataFrame has no column '{}', so it will be " + "skipped in fitting".format(column_name) + ) continue cleaned_cats = self._fit_column(data, column_name, target_column) @@ -179,8 +191,7 @@ def fit(self, data: pd.DataFrame, column_names: list, # Add to _cleaned_categories_by_column for later use self._cleaned_categories_by_column[column_name] = cleaned_cats - def _fit_column(self, data: pd.DataFrame, column_name: str, - target_column) -> set: + def _fit_column(self, data: pd.DataFrame, column_name: str, target_column) -> set: """Compute which categories to regroup into "Other" for a particular column, and return those that need to be kept as-is. @@ -200,8 +211,10 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, model_type = self.model_type if len(data[column_name].unique()) == 1: - log.warning(f"Predictor {column_name} is constant" - " and will be ignored in computation.") + log.warning( + f"Predictor {column_name} is constant" + " and will be ignored in computation." + ) return set(data[column_name].unique()) y = data[target_column] @@ -213,36 +226,31 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, combined_categories = set() # replace missings and get unique categories as a list - X = (CategoricalDataProcessor - ._replace_missings(data[column_name]) - .astype(object)) + X = CategoricalDataProcessor._replace_missings(data[column_name]).astype(object) unique_categories = list(X.unique()) # do not merge categories in case of dummies, i.e. 
0 and 1 # (and possibly "Missing") - if (len(unique_categories) == 2 - or (len(unique_categories) == 3 - and "Missing" in unique_categories)): + if len(unique_categories) == 2 or ( + len(unique_categories) == 3 and "Missing" in unique_categories + ): return set(unique_categories) # get small categories and add them to the merged category list # does not apply incidence factor when model_type = "regression" - small_categories = (CategoricalDataProcessor - ._get_small_categories( - X, - incidence, - self.category_size_threshold)) + small_categories = CategoricalDataProcessor._get_small_categories( + X, incidence, self.category_size_threshold + ) combined_categories = combined_categories.union(small_categories) for category in unique_categories: if category in small_categories: continue - pval = (CategoricalDataProcessor - ._compute_p_value(X, y, category, - model_type, - self.scale_contingency_table)) + pval = CategoricalDataProcessor._compute_p_value( + X, y, category, model_type, self.scale_contingency_table + ) # if not significant, add it to the list if pval > self.p_value_threshold: @@ -254,8 +262,7 @@ def _fit_column(self, data: pd.DataFrame, column_name: str, return set(unique_categories).difference(combined_categories) - def transform(self, data: pd.DataFrame, - column_names: list) -> pd.DataFrame: + def transform(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame: """Transform the data. Parameters @@ -273,24 +280,24 @@ def transform(self, data: pd.DataFrame, """ if self.regroup and len(self._cleaned_categories_by_column) == 0: - msg = ("{} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method.") + msg = ( + "{} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." + ) raise NotFittedError(msg.format(self.__class__.__name__)) for column_name in column_names: if column_name not in data.columns: - log.warning("Unknown column '{}' will be skipped" - .format(column_name)) + log.warning("Unknown column '{}' will be skipped".format(column_name)) continue data = self._transform_column(data, column_name) return data - def _transform_column(self, data: pd.DataFrame, - column_name: str) -> pd.DataFrame: + def _transform_column(self, data: pd.DataFrame, column_name: str) -> pd.DataFrame: """Given a DataFrame, a column name and a list of categories to combine, create an additional column which combines these categories into "Other". 
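To make the regrouping described in the docstring above concrete, here is a standalone sketch of the core idea: categories that are large enough are kept, everything else is mapped to the regroup_name "Other". The data, threshold and the simple fillna-based missing handling are illustrative; the real processor additionally applies the per-category significance test and the keep_missing option, both omitted here.

import pandas as pd

s = pd.Series(["a", "a", "a", "b", "b", "c", None, ""])

# replace missings (incl. empty strings), mirroring the intent of _replace_missings
s = s.fillna("Missing").replace("", "Missing").astype(object)

category_size_threshold = 2
counts = s.value_counts()
keep = set(counts[counts > category_size_threshold].index)  # categories above the size threshold

regrouped = s.apply(lambda x: str(x) if x in keep else "Other").astype("category")
print(regrouped.value_counts())  # Other: 5, a: 3 for this toy series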
@@ -312,11 +319,9 @@ def _transform_column(self, data: pd.DataFrame, data.loc[:, column_name_clean] = data[column_name].astype(object) # Fill missings first - data.loc[:, column_name_clean] = (CategoricalDataProcessor - ._replace_missings( - data, - column_name_clean - )) + data.loc[:, column_name_clean] = CategoricalDataProcessor._replace_missings( + data, column_name_clean + ) if self.regroup: categories = self._cleaned_categories_by_column.get(column_name) @@ -325,24 +330,26 @@ def _transform_column(self, data: pd.DataFrame, # Log warning if categories is None, which indicates it is # not in fitted output if categories is None: - log.warning("Column '{}' is not in fitted output " - "and will be skipped".format(column_name)) + log.warning( + "Column '{}' is not in fitted output " + "and will be skipped".format(column_name) + ) return data - data.loc[:, column_name_clean] = (CategoricalDataProcessor - ._replace_categories( - data[column_name_clean], - categories, - self.regroup_name)) + data.loc[ + :, column_name_clean + ] = CategoricalDataProcessor._replace_categories( + data[column_name_clean], categories, self.regroup_name + ) # change data to categorical - data.loc[:, column_name_clean] = (data[column_name_clean] - .astype("category")) + data.loc[:, column_name_clean] = data[column_name_clean].astype("category") return data - def fit_transform(self, data: pd.DataFrame, column_names: list, - target_column: str) -> pd.DataFrame: + def fit_transform( + self, data: pd.DataFrame, column_names: list, target_column: str + ) -> pd.DataFrame: """Fits the data, then transforms it. Parameters @@ -365,9 +372,9 @@ def fit_transform(self, data: pd.DataFrame, column_names: list, return self.transform(data, column_names) @staticmethod - def _get_small_categories(predictor_series: pd.Series, - incidence: float, - category_size_threshold: int) -> set: + def _get_small_categories( + predictor_series: pd.Series, incidence: float, category_size_threshold: int + ) -> set: """Fetch categories with a size below a certain threshold. Note that we use an additional weighting with the overall incidence. @@ -392,12 +399,13 @@ def _get_small_categories(predictor_series: pd.Series, factor = 1 # Get all categories with a count below a threshold - bool_mask = (category_counts*factor) <= category_size_threshold + bool_mask = (category_counts * factor) <= category_size_threshold return set(category_counts[bool_mask].index.tolist()) @staticmethod - def _replace_missings(data: pd.DataFrame, - column_names: Optional[list] = None) -> pd.DataFrame: + def _replace_missings( + data: pd.DataFrame, column_names: Optional[list] = None + ) -> pd.DataFrame: """Replace missing values (incl. empty strings). Parameters @@ -427,9 +435,13 @@ def _replace_missings(data: pd.DataFrame, return temp @staticmethod - def _compute_p_value(X: pd.Series, y: pd.Series, category: str, - model_type: str, - scale_contingency_table: bool) -> float: + def _compute_p_value( + X: pd.Series, + y: pd.Series, + category: str, + model_type: str, + scale_contingency_table: bool, + ) -> float: """Calculates p-value in order to evaluate whether category of interest is significantly different from the rest of the categories, given the target variable. 
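A minimal sketch of the classification branch of this test, matching the crosstab-plus-chi² pattern in the next hunk: one row for the category of interest, one row for all other categories, crossed with the binary target. The series values are made up and the optional scaling of the "other" row (scale_contingency_table) is left out.

import numpy as np
import pandas as pd
from scipy import stats

X = pd.Series(["a", "a", "b", "c", "a", "b", "c", "c"])
y = pd.Series([1, 0, 1, 0, 1, 0, 0, 1])
category = "a"

df = pd.DataFrame({"y": y, "other_categories": np.where(X == category, 0, 1)})
contingency_table = pd.crosstab(
    index=df["other_categories"], columns=df["y"], margins=False
).values.astype(np.int64)

# p-value of the chi² test; the processor compares it against p_value_threshold (default 0.001)
pval = stats.chi2_contingency(contingency_table, correction=False)[1]
print(pval)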
@@ -461,29 +473,32 @@ def _compute_p_value(X: pd.Series, y: pd.Series, category: str, df["other_categories"] = np.where(X == category, 0, 1) if model_type == "classification": - contingency_table = pd.crosstab(index=df["other_categories"], columns=df["y"], - margins=False) + contingency_table = pd.crosstab( + index=df["other_categories"], columns=df["y"], margins=False + ) # if true, we scale the "other" categories if scale_contingency_table: size_other_cats = contingency_table.iloc[1].sum() incidence_mean = y.mean() - contingency_table.iloc[1, 0] = (1-incidence_mean) * size_other_cats + contingency_table.iloc[1, 0] = (1 - incidence_mean) * size_other_cats contingency_table.iloc[1, 1] = incidence_mean * size_other_cats contingency_table = contingency_table.values.astype(np.int64) pval = stats.chi2_contingency(contingency_table, correction=False)[1] elif model_type == "regression": - pval = stats.kruskal(df.y[df.other_categories == 0], - df.y[df.other_categories == 1])[1] + pval = stats.kruskal( + df.y[df.other_categories == 0], df.y[df.other_categories == 1] + )[1] return pval @staticmethod - def _replace_categories(data: pd.Series, categories: set, - replace_with: str) -> pd.Series: + def _replace_categories( + data: pd.Series, categories: set, replace_with: str + ) -> pd.Series: """Replace categories in set with "Other" and transform the remaining categories to strings to avoid type errors later on in the pipeline. @@ -501,5 +516,4 @@ def _replace_categories(data: pd.Series, categories: set, pd.Series Series with replaced categories. """ - return data.apply( - lambda x: str(x) if x in categories else replace_with) + return data.apply(lambda x: str(x) if x in categories else replace_with) diff --git a/cobra/preprocessing/kbins_discretizer.py b/cobra/preprocessing/kbins_discretizer.py index c30d7de..9f884c9 100644 --- a/cobra/preprocessing/kbins_discretizer.py +++ b/cobra/preprocessing/kbins_discretizer.py @@ -1,4 +1,3 @@ - # standard lib imports from copy import deepcopy from typing import List @@ -15,6 +14,7 @@ log = logging.getLogger(__name__) + class KBinsDiscretizer(BaseEstimator): """Bin continuous data into intervals of predefined size. It provides a way to partition continuous data into discrete values, i.e. transform @@ -59,16 +59,26 @@ class KBinsDiscretizer(BaseEstimator): """ valid_strategies = ("uniform", "quantile") - valid_keys = ["n_bins", "strategy", "closed", "auto_adapt_bins", - "starting_precision", "label_format", - "change_endpoint_format"] - - def __init__(self, n_bins: int = 10, strategy: str = "quantile", - closed: str = "right", - auto_adapt_bins: bool = False, - starting_precision: int = 0, - label_format: str = "{} - {}", - change_endpoint_format: bool = False): + valid_keys = [ + "n_bins", + "strategy", + "closed", + "auto_adapt_bins", + "starting_precision", + "label_format", + "change_endpoint_format", + ] + + def __init__( + self, + n_bins: int = 10, + strategy: str = "quantile", + closed: str = "right", + auto_adapt_bins: bool = False, + starting_precision: int = 0, + label_format: str = "{} - {}", + change_endpoint_format: bool = False, + ): # validate number of bins self._validate_n_bins(n_bins) @@ -99,14 +109,19 @@ def _validate_n_bins(self, n_bins: int): in case ``n_bins`` is not an integer or if ``n_bins < 2`` """ if not isinstance(n_bins, numbers.Integral): - raise ValueError("{} received an invalid n_bins type. " - "Received {}, expected int." 
- .format(KBinsDiscretizer.__name__, - type(n_bins).__name__)) + raise ValueError( + "{} received an invalid n_bins type. " + "Received {}, expected int.".format( + KBinsDiscretizer.__name__, type(n_bins).__name__ + ) + ) if n_bins < 2: - raise ValueError("{} received an invalid number " - "of bins. Received {}, expected at least 2." - .format(KBinsDiscretizer.__name__, n_bins)) + raise ValueError( + "{} received an invalid number " + "of bins. Received {}, expected at least 2.".format( + KBinsDiscretizer.__name__, n_bins + ) + ) def attributes_to_dict(self) -> dict: """Return the attributes of KBinsDiscretizer in a dictionary @@ -144,9 +159,10 @@ def set_attributes_from_dict(self, params: dict): _bins_by_column = params.pop("_bins_by_column", {}) if type(_bins_by_column) != dict: - raise ValueError("_bins_by_column is expected to be a dict " - "but is of type {} instead" - .format(type(_bins_by_column))) + raise ValueError( + "_bins_by_column is expected to be a dict " + "but is of type {} instead".format(type(_bins_by_column)) + ) # Clean out params dictionary to remove unknown keys (for safety!) params = {key: params[key] for key in params if key in self.valid_keys} @@ -174,17 +190,22 @@ def fit(self, data: pd.DataFrame, column_names: list): """ if self.strategy not in self.valid_strategies: - raise ValueError("{}: valid options for 'strategy' are {}. " - "Got strategy={!r} instead." - .format(KBinsDiscretizer.__name__, - self.valid_strategies, self.strategy)) + raise ValueError( + "{}: valid options for 'strategy' are {}. " + "Got strategy={!r} instead.".format( + KBinsDiscretizer.__name__, self.valid_strategies, self.strategy + ) + ) - for column_name in tqdm(column_names, desc="Computing " - "discretization bins..."): + for column_name in tqdm( + column_names, desc="Computing " "discretization bins..." + ): if column_name not in data.columns: - log.warning("DataFrame has no column '{}', so it will be " - "skipped in fitting" .format(column_name)) + log.warning( + "DataFrame has no column '{}', so it will be " + "skipped in fitting".format(column_name) + ) continue bins = self._fit_column(data, column_name) @@ -192,8 +213,7 @@ def fit(self, data: pd.DataFrame, column_names: list): # Add to bins_by_column for later use self._bins_by_column[column_name] = bins - def _fit_column(self, data: pd.DataFrame, - column_name: str) -> List[tuple]: + def _fit_column(self, data: pd.DataFrame, column_name: str) -> List[tuple]: """Compute bins for a specific column in data Parameters @@ -211,49 +231,56 @@ def _fit_column(self, data: pd.DataFrame, col_min, col_max = data[column_name].min(), data[column_name].max() if col_min == col_max: - log.warning("Predictor '{}' is constant and " - "will be ignored in computation".format(column_name)) + log.warning( + "Predictor '{}' is constant and " + "will be ignored in computation".format(column_name) + ) return None - prop_inf = (np.sum(np.isinf(data[column_name])) - / data[column_name].shape[0]) + prop_inf = np.sum(np.isinf(data[column_name])) / data[column_name].shape[0] if prop_inf > 0: - log.warning(f"Column {column_name} has " - f"{prop_inf:.1%} inf values, thus it was skipped. " - f"Consider dropping or transforming it.") + log.warning( + f"Column {column_name} has " + f"{prop_inf:.1%} inf values, thus it was skipped. " + f"Consider dropping or transforming it." 
+ ) return None prop_nan = data[column_name].isna().sum() / data[column_name].shape[0] if prop_nan >= 0.99: - log.warning(f"Column {column_name} is" - f" {prop_nan:.1%}% NaNs, " - f"consider dropping or transforming it.") + log.warning( + f"Column {column_name} is" + f" {prop_nan:.1%}% NaNs, " + f"consider dropping or transforming it." + ) n_bins = self.n_bins if self.auto_adapt_bins: size = len(data.index) - missing_pct = data[column_name].isnull().sum()/size + missing_pct = data[column_name].isnull().sum() / size n_bins = int(max(round((1 - missing_pct) * n_bins), 2)) - bin_edges = self._compute_bin_edges(data, column_name, n_bins, - col_min, col_max) + bin_edges = self._compute_bin_edges(data, column_name, n_bins, col_min, col_max) if len(bin_edges) < 3: - log.warning("Only 1 bin was found for predictor '{}' so it will " - "be ignored in computation".format(column_name)) + log.warning( + "Only 1 bin was found for predictor '{}' so it will " + "be ignored in computation".format(column_name) + ) return None if len(bin_edges) < n_bins + 1: - log.warning("The number of actual bins for predictor '{}' is {} " - "which is smaller than the requested number of bins " - "{}".format(column_name, len(bin_edges) - 1, n_bins)) + log.warning( + "The number of actual bins for predictor '{}' is {} " + "which is smaller than the requested number of bins " + "{}".format(column_name, len(bin_edges) - 1, n_bins) + ) return self._compute_bins_from_edges(bin_edges) - def transform(self, data: pd.DataFrame, - column_names: list) -> pd.DataFrame: + def transform(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame: """Discretizes the data in the given list of columns by mapping each number to the appropriate bin computed by the fit method @@ -270,15 +297,19 @@ def transform(self, data: pd.DataFrame, data with additional discretized variables """ if len(self._bins_by_column) == 0: - msg = ("{} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method.") + msg = ( + "{} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." + ) raise NotFittedError(msg.format(self.__class__.__name__)) for column_name in tqdm(column_names, desc="Discretizing columns..."): if column_name not in self._bins_by_column: - log.warning("Column '{}' is not in fitted output " - "and will be skipped".format(column_name)) + log.warning( + "Column '{}' is not in fitted output " + "and will be skipped".format(column_name) + ) continue # can be None for a column with a constant value! @@ -288,9 +319,9 @@ def transform(self, data: pd.DataFrame, return data - def _transform_column(self, data: pd.DataFrame, - column_name: str, - bins: List[tuple]) -> pd.DataFrame: + def _transform_column( + self, data: pd.DataFrame, column_name: str, bins: List[tuple] + ) -> pd.DataFrame: """Given a DataFrame, a column name and a list of bins, create an additional column which determines the bin in which the value of column_name lies in. 
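Before the remaining hunks, a minimal sketch of what this transform amounts to: map a numeric column onto precomputed bins via pd.IntervalIndex.from_tuples and pd.cut, then rename the resulting categories with the default "{} - {}" label format. The column name, bin tuples and values are illustrative (shaped like a fitted _bins_by_column entry); the real discretizer also widens the outermost edges (e.g. the last edge becomes np.inf), which is skipped here.

import numpy as np
import pandas as pd

data = pd.DataFrame({"age": [3, 17, 25, 42, 67, np.nan]})
bins = [(0, 18), (18, 40), (40, 80)]  # illustrative list of bin tuples

interval_idx = pd.IntervalIndex.from_tuples(bins, closed="right")
data["age_bin"] = pd.cut(x=data["age"], bins=interval_idx)

# rename the interval categories to the "{} - {}" label format
bin_labels = ["{} - {}".format(lower, upper) for lower, upper in bins]
data["age_bin"] = data["age_bin"].cat.rename_categories(bin_labels)

print(data["age_bin"].tolist())  # ['0 - 18', '0 - 18', '18 - 40', '40 - 80', '40 - 80', nan]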
@@ -315,14 +346,14 @@ def _transform_column(self, data: pd.DataFrame, column_name_bin = column_name + "_bin" # use pd.cut to compute bins - data.loc[:, column_name_bin] = pd.cut(x=data[column_name], - bins=interval_idx) + data.loc[:, column_name_bin] = pd.cut(x=data[column_name], bins=interval_idx) # Rename bins so that the output has a proper format bin_labels = self._create_bin_labels(bins) - data.loc[:, column_name_bin] = (data[column_name_bin] - .cat.rename_categories(bin_labels)) + data.loc[:, column_name_bin] = data[column_name_bin].cat.rename_categories( + bin_labels + ) if data[column_name_bin].isnull().sum() > 0: @@ -335,8 +366,7 @@ def _transform_column(self, data: pd.DataFrame, return data - def fit_transform(self, data: pd.DataFrame, - column_names: list) -> pd.DataFrame: + def fit_transform(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame: """Fits to data, then transform it Parameters @@ -354,9 +384,14 @@ def fit_transform(self, data: pd.DataFrame, self.fit(data, column_names) return self.transform(data, column_names) - def _compute_bin_edges(self, data: pd.DataFrame, column_name: str, - n_bins: int, col_min: float, - col_max: float) -> list: + def _compute_bin_edges( + self, + data: pd.DataFrame, + column_name: str, + n_bins: int, + col_min: float, + col_max: float, + ) -> list: """Compute the bin edges for a given column, a DataFrame and the number of required bins @@ -381,9 +416,11 @@ def _compute_bin_edges(self, data: pd.DataFrame, column_name: str, bin_edges = [] if self.strategy == "quantile": - bin_edges = list(data[column_name] - .quantile(np.linspace(0, 1, n_bins + 1), - interpolation="linear")) + bin_edges = list( + data[column_name].quantile( + np.linspace(0, 1, n_bins + 1), interpolation="linear" + ) + ) elif self.strategy == "uniform": bin_edges = list(np.linspace(col_min, col_max, n_bins + 1)) @@ -397,15 +434,14 @@ def _compute_bin_edges(self, data: pd.DataFrame, column_name: str, bin_edges[-1] = np.inf if np.isnan(bin_edges).sum() > 0: - log.warning(f"Column {column_name} " - "has NaNs present in bin definitions") + log.warning(f"Column {column_name} " "has NaNs present in bin definitions") - # Make absolutely sure bin edges are ordered, + # Make absolutely sure bin edges are ordered, # in very rare situations this wasn't the case - # due to rounding in quantile calculation (e.g. + # due to rounding in quantile calculation (e.g. # distributions with strong mass for same value) bin_edges = sorted(bin_edges) - + # Make sure the bin_edges are unique # and order remains the same return list(dict.fromkeys(bin_edges)) @@ -460,7 +496,7 @@ def _compute_bins_from_edges(self, bin_edges: list) -> List[tuple]: # this can be a negative number, which then # rounds numbers to the nearest 10, 100, ... precision = self._compute_minimal_precision_of_bin_edges(bin_edges) - + bins = [] for a, b in zip(bin_edges, bin_edges[1:]): fmt_a = round(a, precision) @@ -471,8 +507,9 @@ def _compute_bins_from_edges(self, bin_edges: list) -> List[tuple]: return bins @staticmethod - def _create_index(intervals: List[tuple], - closed: str = "right") -> pd.IntervalIndex: + def _create_index( + intervals: List[tuple], closed: str = "right" + ) -> pd.IntervalIndex: """Create an pd.IntervalIndex based on a list of tuples. 
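As a side note on the quantile strategy used above: a small sketch of how the bin edges come about, mirroring the quantile, sort and de-duplication steps from _compute_bin_edges. The series and n_bins are made up.

import numpy as np
import pandas as pd

col = pd.Series([1, 1, 1, 2, 3, 5, 8, 13, 21, 34])
n_bins = 4

# n_bins + 1 evenly spaced quantiles serve as candidate edges
bin_edges = list(col.quantile(np.linspace(0, 1, n_bins + 1), interpolation="linear"))

bin_edges = sorted(bin_edges)               # guard against rounding artefacts
bin_edges = list(dict.fromkeys(bin_edges))  # drop duplicate edges, keep order

print(bin_edges)  # fewer than n_bins + 1 edges can remain for heavily skewed data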
This is basically a wrapper around pd.IntervalIndex.from_tuples However, the lower bound of the first entry in the list (the lower bin) @@ -495,10 +532,12 @@ def _create_index(intervals: List[tuple], # check if closed is of the proper form if closed not in ["left", "right"]: - raise ValueError("{}: valid options for 'closed' are {}. " - "Got strategy={!r} instead." - .format(KBinsDiscretizer.__name__, - ["left", "right"], closed)) + raise ValueError( + "{}: valid options for 'closed' are {}. " + "Got strategy={!r} instead.".format( + KBinsDiscretizer.__name__, ["left", "right"], closed + ) + ) # deepcopy variable because we do not want to modify the content # of intervals (which is still used outside of this function) @@ -526,8 +565,7 @@ def _create_bin_labels(self, bins: List[tuple]) -> list: """ bin_labels = [] for interval in bins: - bin_labels.append(self.label_format.format(interval[0], - interval[1])) + bin_labels.append(self.label_format.format(interval[0], interval[1])) # Format first and last bin as < x and > y resp. if self.change_endpoint_format: diff --git a/cobra/preprocessing/target_encoder.py b/cobra/preprocessing/target_encoder.py index 3eda39d..d5f3939 100644 --- a/cobra/preprocessing/target_encoder.py +++ b/cobra/preprocessing/target_encoder.py @@ -1,4 +1,3 @@ - import logging import pandas as pd @@ -8,6 +7,7 @@ log = logging.getLogger(__name__) + class TargetEncoder(BaseEstimator): """Target encoding for categorical features, inspired by http://contrib.scikit-learn.org/category_encoders/targetencoder.html. @@ -62,22 +62,25 @@ class TargetEncoder(BaseEstimator): valid_imputation_strategies = ("mean", "min", "max") - def __init__(self, weight: float=0.0, - imputation_strategy: str="mean"): + def __init__(self, weight: float = 0.0, imputation_strategy: str = "mean"): if weight < 0: raise ValueError("The value of weight cannot be smaller than zero.") elif imputation_strategy not in self.valid_imputation_strategies: - raise ValueError("Valid options for 'imputation_strategy' are {}." - " Got imputation_strategy={!r} instead." - .format(self.valid_imputation_strategies, - imputation_strategy)) + raise ValueError( + "Valid options for 'imputation_strategy' are {}." + " Got imputation_strategy={!r} instead.".format( + self.valid_imputation_strategies, imputation_strategy + ) + ) if weight == 0: - log.warning("The target encoder's additive smoothing weight is " - "set to 0. This disables smoothing and may make the " - "encoding prone to overfitting. Increase the weight " - "if needed.") + log.warning( + "The target encoder's additive smoothing weight is " + "set to 0. This disables smoothing and may make the " + "encoding prone to overfitting. Increase the weight " + "if needed." 
+ ) self.weight = weight self.imputation_strategy = imputation_strategy @@ -98,8 +101,7 @@ def attributes_to_dict(self) -> dict: params = self.get_params() params["_mapping"] = { - key: value.to_dict() - for key, value in self._mapping.items() + key: value.to_dict() for key, value in self._mapping.items() } params["_global_mean"] = self._global_mean @@ -119,8 +121,10 @@ def set_attributes_from_dict(self, params: dict): if "weight" in params and type(params["weight"]) == float: self.weight = params["weight"] - if ("imputation_strategy" in params and - params["imputation_strategy"] in self.valid_imputation_strategies): + if ( + "imputation_strategy" in params + and params["imputation_strategy"] in self.valid_imputation_strategies + ): self.imputation_strategy = params["imputation_strategy"] if "_global_mean" in params and type(params["_global_mean"]) == float: @@ -136,14 +140,12 @@ def dict_to_series(key, value): return s self._mapping = { - key: dict_to_series(key, value) - for key, value in _mapping.items() + key: dict_to_series(key, value) for key, value in _mapping.items() } return self - def fit(self, data: pd.DataFrame, column_names: list, - target_column: str): + def fit(self, data: pd.DataFrame, column_names: list, target_column: str): """Fit the TargetEncoder to the data. Parameters @@ -162,8 +164,10 @@ def fit(self, data: pd.DataFrame, column_names: list, for column in tqdm(column_names, desc="Fitting target encoding..."): if column not in data.columns: - log.warning("DataFrame has no column '{}', so it will be " - "skipped in fitting" .format(column)) + log.warning( + "DataFrame has no column '{}', so it will be " + "skipped in fitting".format(column) + ) continue self._mapping[column] = self._fit_column(data[column], y) @@ -191,15 +195,13 @@ def _fit_column(self, X: pd.Series, y: pd.Series) -> pd.Series: stats = y.groupby(X).agg(["mean", "count"]) # Note: if self.weight = 0, we have the ordinary incidence replacement - numerator = (stats["count"] * stats["mean"] - + self.weight * self._global_mean) + numerator = stats["count"] * stats["mean"] + self.weight * self._global_mean denominator = stats["count"] + self.weight return numerator / denominator - def transform(self, data: pd.DataFrame, - column_names: list) -> pd.DataFrame: + def transform(self, data: pd.DataFrame, column_names: list) -> pd.DataFrame: """Replace (e.g. encode) values of each categorical column with a new value (reflecting the corresponding average target value, optionally smoothed by a regularization weight), @@ -224,25 +226,27 @@ def transform(self, data: pd.DataFrame, method. """ if (len(self._mapping) == 0) or (self._global_mean is None): - msg = ("This {} instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this method.") + msg = ( + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this method." + ) raise NotFittedError(msg.format(self.__class__.__name__)) for column in tqdm(column_names, desc="Applying target encoding..."): if column not in data.columns: - log.warning("Unknown column '{}' will be skipped." 
- .format(column)) + log.warning("Unknown column '{}' will be skipped.".format(column)) continue elif column not in self._mapping: - log.warning("Column '{}' is not in fitted output " - "and will be skipped.".format(column)) + log.warning( + "Column '{}' is not in fitted output " + "and will be skipped.".format(column) + ) continue data = self._transform_column(data, column) return data - def _transform_column(self, data: pd.DataFrame, - column_name: str) -> pd.DataFrame: + def _transform_column(self, data: pd.DataFrame, column_name: str) -> pd.DataFrame: """Replace (e.g. encode) values of a categorical column with a new value (reflecting the corresponding average target value, optionally smoothed by a regularization weight), @@ -265,8 +269,9 @@ def _transform_column(self, data: pd.DataFrame, # Convert dtype to float, because when the original dtype # is of type "category", the resulting dtype would otherwise also be of # type "category": - data[new_column] = (data[column_name].map(self._mapping[column_name]) - .astype("float")) + data[new_column] = ( + data[column_name].map(self._mapping[column_name]).astype("float") + ) # In case of categorical data, it could be that new categories will # emerge which were not present in the train set, so this will result @@ -274,20 +279,17 @@ def _transform_column(self, data: pd.DataFrame, # configured imputation strategy: if data[new_column].isnull().sum() > 0: if self.imputation_strategy == "mean": - data[new_column].fillna(self._global_mean, - inplace=True) + data[new_column].fillna(self._global_mean, inplace=True) elif self.imputation_strategy == "min": - data[new_column].fillna(data[new_column].min(), - inplace=True) + data[new_column].fillna(data[new_column].min(), inplace=True) elif self.imputation_strategy == "max": - data[new_column].fillna(data[new_column].max(), - inplace=True) + data[new_column].fillna(data[new_column].max(), inplace=True) return data - def fit_transform(self, data: pd.DataFrame, - column_names: list, - target_column: str) -> pd.DataFrame: + def fit_transform( + self, data: pd.DataFrame, column_names: list, target_column: str + ) -> pd.DataFrame: """Fit the encoder and transform the data. 
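A worked sketch of the additive smoothing computed in _fit_column above: each category's encoding blends its own target mean with the global mean, weighted by the category count versus the smoothing weight. The toy series and weight are illustrative.

import pandas as pd

X = pd.Series(["a", "a", "a", "b", "b", "c"])
y = pd.Series([1, 1, 0, 1, 0, 0])
weight = 5.0
global_mean = y.mean()  # 0.5

stats = y.groupby(X).agg(["mean", "count"])
numerator = stats["count"] * stats["mean"] + weight * global_mean
denominator = stats["count"] + weight
mapping = numerator / denominator

print(mapping)
# "a": (3 * 2/3 + 5 * 0.5) / (3 + 5) = 0.5625, pulled toward the global mean;
# with weight = 0 this reduces to the plain per-category incidence replacement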
Parameters diff --git a/cobra/version.py b/cobra/version.py index 545d07d..a82b376 100644 --- a/cobra/version.py +++ b/cobra/version.py @@ -1 +1 @@ -__version__ = "1.1.1" \ No newline at end of file +__version__ = "1.1.1" diff --git a/requirements.dev.txt b/requirements.dev.txt index 3d87710..1cdeb06 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -1,6 +1,6 @@ +black mypy>=0.942 pycodestyle>=2.8.0 pydocstyle>=6.1.1 -pylint>=2.13.7 pytest>=7.1.1 pytest-mock>=3.7.0 \ No newline at end of file diff --git a/tests/evaluation/test_evaluation.py b/tests/evaluation/test_evaluation.py index 0ed388f..441670e 100644 --- a/tests/evaluation/test_evaluation.py +++ b/tests/evaluation/test_evaluation.py @@ -1,4 +1,3 @@ - import pytest import pandas as pd import numpy as np @@ -6,15 +5,19 @@ from cobra.evaluation import plot_incidence from cobra.evaluation import ClassificationEvaluator, RegressionEvaluator + def mock_data(): - d = {'variable': ['education', 'education', 'education', 'education'], - 'label': ['1st-4th', '5th-6th', '7th-8th', '9th'], - 'pop_size': [0.002, 0.004, 0.009, 0.019], - 'avg_incidence': [0.23, 0.23, 0.23, 0.23], - 'incidence': [0.047, 0.0434, 0.054, 0.069]} + d = { + "variable": ["education", "education", "education", "education"], + "label": ["1st-4th", "5th-6th", "7th-8th", "9th"], + "pop_size": [0.002, 0.004, 0.009, 0.019], + "avg_incidence": [0.23, 0.23, 0.23, 0.23], + "incidence": [0.047, 0.0434, 0.054, 0.069], + } return pd.DataFrame(d) -def mock_preds(n, seed = 505): + +def mock_preds(n, seed=505): np.random.seed(seed) y_true = np.random.uniform(size=n) @@ -22,22 +25,22 @@ def mock_preds(n, seed = 505): return y_true, y_pred -class TestEvaluation: +class TestEvaluation: def test_plot_incidence_with_unsupported_model_type(self): with pytest.raises(ValueError): - plot_incidence(pig_tables=None, - variable="", - model_type="anomaly_detection") + plot_incidence(pig_tables=None, variable="", model_type="anomaly_detection") def test_plot_incidence_with_different_column_orders(self): data = mock_data() with pytest.raises(ValueError): - plot_incidence(pig_tables=data, - variable='education', - model_type="classification", - # different bins than in the data variable: - column_order=['1st-4th', '5th-6th', '7th-8th']) + plot_incidence( + pig_tables=data, + variable="education", + model_type="classification", + # different bins than in the data variable: + column_order=["1st-4th", "5th-6th", "7th-8th"], + ) # Stubs for later (requires exposing df_plot and testing matplotlib's # plot object fix and ax internals): @@ -93,7 +96,9 @@ def test_lift_curve_n_bins(self): n_bins_out = [] for n_bins in n_bins_test: e = ClassificationEvaluator(n_bins=n_bins) - out = ClassificationEvaluator._compute_lift_per_bin(y_true, y_pred, e.n_bins) + out = ClassificationEvaluator._compute_lift_per_bin( + y_true, y_pred, e.n_bins + ) lifts = out[1] n_bins_out.append(len(lifts)) @@ -108,8 +113,15 @@ def test_fit_classification(self): assert (evaluator.y_true == y_true).all() assert (evaluator.y_pred == y_pred).all() - for metric in ["accuracy", "AUC", "precision", "recall", - "F1", "matthews_corrcoef", "lift at {}".format(evaluator.lift_at)]: + for metric in [ + "accuracy", + "AUC", + "precision", + "recall", + "F1", + "matthews_corrcoef", + "lift at {}".format(evaluator.lift_at), + ]: assert evaluator.scalar_metrics[metric] is not None assert evaluator.roc_curve is not None assert evaluator.confusion_matrix is not None @@ -118,7 +130,10 @@ def test_fit_classification(self): def 
test_fit_regression(self): y_true, y_pred = mock_preds(50, seed=789) - y_true, y_pred = y_true*10, y_pred*10 # rescale so it looks more regression-like + y_true, y_pred = ( + y_true * 10, + y_pred * 10, + ) # rescale so it looks more regression-like evaluator = RegressionEvaluator() evaluator.fit(y_true, y_pred) diff --git a/tests/model_building/test_forward_selection.py b/tests/model_building/test_forward_selection.py index 19f7157..29e4910 100644 --- a/tests/model_building/test_forward_selection.py +++ b/tests/model_building/test_forward_selection.py @@ -1,4 +1,3 @@ - from contextlib import contextmanager import pytest import pandas as pd @@ -6,29 +5,33 @@ from cobra.model_building.models import LogisticRegressionModel, LinearRegressionModel from cobra.model_building.forward_selection import ForwardFeatureSelection + @contextmanager def does_not_raise(): yield -def mock_data(add_split_col: bool=False, model_type="classification"): - data = pd.DataFrame({"var1_enc": [0.42] * 10, - "var2_enc": [0.94] * 10, - "var3_enc": [0.87] * 10}) + +def mock_data(add_split_col: bool = False, model_type="classification"): + data = pd.DataFrame( + {"var1_enc": [0.42] * 10, "var2_enc": [0.94] * 10, "var3_enc": [0.87] * 10} + ) if model_type == "classification": - data["target"] = ([0] * 5 + [1] * 2 + [0] * 2 + [1]) + data["target"] = [0] * 5 + [1] * 2 + [0] * 2 + [1] elif model_type == "regression": data["target"] = [7, 2, 2, 9, 7, 3, 1, 4, 8, 5] if add_split_col: - data.loc[:, "split"] = (["train"] * 7 + ["selection"] * 3) + data.loc[:, "split"] = ["train"] * 7 + ["selection"] * 3 return data + def mock_model_num_pred(n_predictors, model_type="classification"): predictors = [f"var{i + 1}_enc" for i in range(n_predictors)] return mock_model(predictors, model_type) + def mock_model(predictor_list, model_type="classification"): if model_type == "classification": model = LogisticRegressionModel() @@ -41,7 +44,6 @@ def mock_model(predictor_list, model_type="classification"): class TestForwardFeatureSelection: - def test_get_model_from_step(self): forward_selection = ForwardFeatureSelection() @@ -58,10 +60,12 @@ def test_compute_model_performances(self, mocker, model_type): fw_selection._fitted_models = [ mock_model_num_pred(1, model_type=model_type), mock_model_num_pred(2, model_type=model_type), - mock_model_num_pred(3, model_type=model_type) + mock_model_num_pred(3, model_type=model_type), ] - def mock_evaluate(self, X, y, split, metric): # on AUC scale, but gives the same for RMSE as it is a mock + def mock_evaluate( + self, X, y, split, metric + ): # on AUC scale, but gives the same for RMSE as it is a mock if split == "train": return 0.612 else: @@ -70,29 +74,41 @@ def mock_evaluate(self, X, y, split, metric): # on AUC scale, but gives the sam if model_type == "classification": patch_fct = "cobra.model_building.forward_selection.LogisticRegressionModel.evaluate" elif model_type == "regression": - patch_fct = "cobra.model_building.forward_selection.LinearRegressionModel.evaluate" + patch_fct = ( + "cobra.model_building.forward_selection.LinearRegressionModel.evaluate" + ) mocker.patch(patch_fct, mock_evaluate) - actual = (fw_selection - .compute_model_performances(data, "target", - splits=["train", "selection"], - metric=None)) - - expected = pd.DataFrame([ - {"predictors": ["var1_enc"], - "last_added_predictor": "var1_enc", - "train_performance": 0.612, "selection_performance": 0.609, - "model_type": model_type}, - {"predictors": ["var1_enc", "var2_enc"], - "last_added_predictor": "var2_enc", - 
"train_performance": 0.612, "selection_performance": 0.609, - "model_type": model_type}, - {"predictors": ["var1_enc", "var2_enc", "var3_enc"], - "last_added_predictor": "var3_enc", - "train_performance": 0.612, "selection_performance": 0.609, - "model_type": model_type} - ]) + actual = fw_selection.compute_model_performances( + data, "target", splits=["train", "selection"], metric=None + ) + + expected = pd.DataFrame( + [ + { + "predictors": ["var1_enc"], + "last_added_predictor": "var1_enc", + "train_performance": 0.612, + "selection_performance": 0.609, + "model_type": model_type, + }, + { + "predictors": ["var1_enc", "var2_enc"], + "last_added_predictor": "var2_enc", + "train_performance": 0.612, + "selection_performance": 0.609, + "model_type": model_type, + }, + { + "predictors": ["var1_enc", "var2_enc", "var3_enc"], + "last_added_predictor": "var3_enc", + "train_performance": 0.612, + "selection_performance": 0.609, + "model_type": model_type, + }, + ] + ) pd.testing.assert_frame_equal(actual, expected) @@ -108,18 +124,21 @@ def test_ffs_train_data_assertions(self, model_type): with pytest.raises(AssertionError): # not at least train & selection sets fw_selection.fit(df[df["split"] == "train"], "target", predictors=[""]) - @pytest.mark.parametrize("model_type, max_predictors, expectation", - [("classification", 2, pytest.raises(ValueError)), - ("classification", 3, does_not_raise()), - ("classification", 5, does_not_raise()), - ("classification", 10, does_not_raise()), - ("classification", 15, does_not_raise()), - ("regression", 2, pytest.raises(ValueError)), - ("regression", 3, does_not_raise()), - ("regression", 5, does_not_raise()), - ("regression", 10, does_not_raise()), - ("regression", 15, does_not_raise()) - ]) + @pytest.mark.parametrize( + "model_type, max_predictors, expectation", + [ + ("classification", 2, pytest.raises(ValueError)), + ("classification", 3, does_not_raise()), + ("classification", 5, does_not_raise()), + ("classification", 10, does_not_raise()), + ("classification", 15, does_not_raise()), + ("regression", 2, pytest.raises(ValueError)), + ("regression", 3, does_not_raise()), + ("regression", 5, does_not_raise()), + ("regression", 10, does_not_raise()), + ("regression", 15, does_not_raise()), + ], + ) def test_fit(self, mocker, model_type, max_predictors: int, expectation): # create list of elements [var1_enc, var2_enc, ..., var10_enc] @@ -127,55 +146,71 @@ def test_fit(self, mocker, model_type, max_predictors: int, expectation): # extract sublist [var1_enc, var5_enc, var9_enc] forced_predictors_list = predictors_list[::4] - ordered_output_list = (forced_predictors_list - + [pred for pred in predictors_list - if pred not in forced_predictors_list]) + ordered_output_list = forced_predictors_list + [ + pred for pred in predictors_list if pred not in forced_predictors_list + ] - fw_selection = ForwardFeatureSelection(model_type=model_type, max_predictors=max_predictors) + fw_selection = ForwardFeatureSelection( + model_type=model_type, max_predictors=max_predictors + ) def mock_train_model(self, train_data, target_column_name, predictors): return mock_model(predictors, model_type=model_type) - def mock_forward_selection(self, train_data, target_column_name, - predictors, forced_predictors): + def mock_forward_selection( + self, train_data, target_column_name, predictors, forced_predictors + ): n_models = min(max_predictors, len(predictors) + len(forced_predictors)) - return [mock_model(ordered_output_list[:i+1], model_type=model_type) - for i in 
range(n_models)] + return [ + mock_model(ordered_output_list[: i + 1], model_type=model_type) + for i in range(n_models) + ] - mocker.patch("cobra.model_building.ForwardFeatureSelection._train_model", - mock_train_model) + mocker.patch( + "cobra.model_building.ForwardFeatureSelection._train_model", + mock_train_model, + ) - mocker.patch("cobra.model_building.ForwardFeatureSelection._forward_selection", - mock_forward_selection) + mocker.patch( + "cobra.model_building.ForwardFeatureSelection._forward_selection", + mock_forward_selection, + ) df = mock_data(add_split_col=True, model_type=model_type) with expectation: - fw_selection.fit(df, "target", # data is ignored - predictors=predictors_list, - forced_predictors=forced_predictors_list, - excluded_predictors=[]) + fw_selection.fit( + df, + "target", # data is ignored + predictors=predictors_list, + forced_predictors=forced_predictors_list, + excluded_predictors=[], + ) # for each fitted model, check number of predictors - actual = [model.predictors - for model in fw_selection._fitted_models] + actual = [model.predictors for model in fw_selection._fitted_models] - expected = [ordered_output_list[:i+1] - for i in range(min(max_predictors, - len(predictors_list)))] + expected = [ + ordered_output_list[: i + 1] + for i in range(min(max_predictors, len(predictors_list))) + ] if max_predictors == len(forced_predictors_list): expected = [forced_predictors_list] assert actual == expected - @pytest.mark.parametrize("model_type, max_predictors", [("classification", 5), - ("classification", 10), - ("classification", 15), - ("regression", 5), - ("regression", 10), - ("regression", 15) - ]) + @pytest.mark.parametrize( + "model_type, max_predictors", + [ + ("classification", 5), + ("classification", 10), + ("classification", 15), + ("regression", 5), + ("regression", 10), + ("regression", 15), + ], + ) def test_forward_selection(self, mocker, model_type, max_predictors: int): # create list of elements [var1_enc, var2_c, ..., var10_enc] @@ -184,30 +219,39 @@ def test_forward_selection(self, mocker, model_type, max_predictors: int): # extract sublist [var1_enc, var5_enc, var9_enc]: forced_predictors = predictors_list[::4] # remove these from predictors list to have clean version - predictors = [pred for pred in predictors_list - if pred not in forced_predictors] + predictors = [pred for pred in predictors_list if pred not in forced_predictors] ordered_output_list = forced_predictors + predictors - def mock_find_next_best_model(self, train_data, target_column_name, - candidate_predictors, - current_predictors): - return mock_model(current_predictors + candidate_predictors[0:1], model_type=model_type) - - mocker.patch(("cobra.model_building.ForwardFeatureSelection." - "_find_next_best_model"), mock_find_next_best_model) - - fw_selection = ForwardFeatureSelection(model_type=model_type, max_predictors=max_predictors) - - fitted_models = (fw_selection. - _forward_selection(pd.DataFrame(), "target", - predictors, - forced_predictors)) + def mock_find_next_best_model( + self, + train_data, + target_column_name, + candidate_predictors, + current_predictors, + ): + return mock_model( + current_predictors + candidate_predictors[0:1], model_type=model_type + ) + + mocker.patch( + ("cobra.model_building.ForwardFeatureSelection." 
"_find_next_best_model"), + mock_find_next_best_model, + ) + + fw_selection = ForwardFeatureSelection( + model_type=model_type, max_predictors=max_predictors + ) + + fitted_models = fw_selection._forward_selection( + pd.DataFrame(), "target", predictors, forced_predictors + ) actual = [sorted(model.predictors) for model in fitted_models] - expected = [sorted(ordered_output_list[:i+1]) - for i in range(min(max_predictors, - len(predictors_list)))] + expected = [ + sorted(ordered_output_list[: i + 1]) + for i in range(min(max_predictors, len(predictors_list))) + ] assert actual == expected diff --git a/tests/model_building/test_models.py b/tests/model_building/test_models.py index 7eca6e6..84484fc 100644 --- a/tests/model_building/test_models.py +++ b/tests/model_building/test_models.py @@ -1,23 +1,24 @@ - import numpy as np import pandas as pd from cobra.model_building.models import LogisticRegressionModel, LinearRegressionModel + def mock_data(): - return pd.DataFrame({"var1_enc": [0.42] * 10, - "var2_enc": [0.94] * 10, - "var3_enc": [0.87] * 10}) + return pd.DataFrame( + {"var1_enc": [0.42] * 10, "var2_enc": [0.94] * 10, "var3_enc": [0.87] * 10} + ) def mock_score_model_classification(self, data): return np.array([0.5, 0.8, 0.2, 0.9, 0.1, 0.7, 0.3, 0.6, 0.4, 0.5]) + def mock_score_model_regression(self, data): - return np.array([0.7, 0.2, 0.2, 0.9, 0.7, 0.3, 0.1, 0.4, 0.8, 0.5])*15 + return np.array([0.7, 0.2, 0.2, 0.9, 0.7, 0.3, 0.1, 0.4, 0.8, 0.5]) * 15 -class TestLogisticRegressionModel: +class TestLogisticRegressionModel: def test_evaluate(self, mocker): X = mock_data() @@ -26,13 +27,14 @@ def test_evaluate(self, mocker): def mock_roc_auc_score(y_true, y_score): return 0.79 - (mocker - .patch("cobra.model_building.LogisticRegressionModel.score_model", - mock_score_model_classification)) + ( + mocker.patch( + "cobra.model_building.LogisticRegressionModel.score_model", + mock_score_model_classification, + ) + ) - (mocker - .patch("cobra.model_building.models.roc_auc_score", - mock_roc_auc_score)) + (mocker.patch("cobra.model_building.models.roc_auc_score", mock_roc_auc_score)) model = LogisticRegressionModel() actual = model.evaluate(X, y) @@ -52,17 +54,17 @@ def test_evaluate_cached(self): assert actual == expected def test_compute_variable_importance(self, mocker): - def mock_pearsonr(ypred, ytrue): return [ypred.unique()[0]] - (mocker - .patch("cobra.model_building.LogisticRegressionModel.score_model", - mock_score_model_classification)) + ( + mocker.patch( + "cobra.model_building.LogisticRegressionModel.score_model", + mock_score_model_classification, + ) + ) - (mocker - .patch("cobra.model_building.models.stats.pearsonr", - mock_pearsonr)) + (mocker.patch("cobra.model_building.models.stats.pearsonr", mock_pearsonr)) model = LogisticRegressionModel() model.predictors = ["var1_enc", "var2_enc", "var3_enc"] @@ -71,11 +73,17 @@ def mock_pearsonr(ypred, ytrue): actual = model.compute_variable_importance(data) - expected = pd.DataFrame([ - {"predictor": "var1", "importance": data["var1_enc"].unique()[0]}, - {"predictor": "var2", "importance": data["var2_enc"].unique()[0]}, - {"predictor": "var3", "importance": data["var3_enc"].unique()[0]} - ]).sort_values(by="importance", ascending=False).reset_index(drop=True) + expected = ( + pd.DataFrame( + [ + {"predictor": "var1", "importance": data["var1_enc"].unique()[0]}, + {"predictor": "var2", "importance": data["var2_enc"].unique()[0]}, + {"predictor": "var3", "importance": data["var3_enc"].unique()[0]}, + ] + ) + 
.sort_values(by="importance", ascending=False) + .reset_index(drop=True) + ) pd.testing.assert_frame_equal(actual, expected) @@ -103,8 +111,8 @@ def test_serialize(self): "solver": "liblinear", "tol": 0.0001, "verbose": 0, - "warm_start": False - } + "warm_start": False, + }, } assert actual == expected @@ -132,12 +140,12 @@ def test_deserialize(self): "solver": "liblinear", "tol": 0.0001, "verbose": 0, - "warm_start": False + "warm_start": False, }, "classes_": [0, 1], "coef_": [[0.5, 0.75]], "intercept_": [-3], - "n_iter_": [10] + "n_iter_": [10], } model.deserialize(model_dict) @@ -149,23 +157,29 @@ def test_deserialize(self): assert logit.intercept_.all() == (np.array(model_dict["intercept_"]).all()) assert logit.coef_.all() == np.array(model_dict["coef_"]).all() -class TestLinearRegressionModel: +class TestLinearRegressionModel: def test_evaluate(self, mocker): X = mock_data() - y = pd.Series(np.array([0.6, 0.1, 0.2, 0.9, 0.8, 0.3, 0.2, 0.4, 0.9, 0.5])*12) + y = pd.Series(np.array([0.6, 0.1, 0.2, 0.9, 0.8, 0.3, 0.2, 0.4, 0.9, 0.5]) * 12) def mock_mean_squared_error(y_true, y_pred): return 1.23 - (mocker - .patch("cobra.model_building.LinearRegressionModel.score_model", - mock_score_model_regression)) + ( + mocker.patch( + "cobra.model_building.LinearRegressionModel.score_model", + mock_score_model_regression, + ) + ) - (mocker - .patch("cobra.model_building.models.mean_squared_error", - mock_mean_squared_error)) + ( + mocker.patch( + "cobra.model_building.models.mean_squared_error", + mock_mean_squared_error, + ) + ) model = LinearRegressionModel() actual = model.evaluate(X, y) @@ -185,17 +199,17 @@ def test_evaluate_cached(self): assert actual == expected def test_compute_variable_importance(self, mocker): - def mock_pearsonr(ypred, ytrue): return [ypred.unique()[0]] - (mocker - .patch("cobra.model_building.LinearRegressionModel.score_model", - mock_score_model_regression)) + ( + mocker.patch( + "cobra.model_building.LinearRegressionModel.score_model", + mock_score_model_regression, + ) + ) - (mocker - .patch("cobra.model_building.models.stats.pearsonr", - mock_pearsonr)) + (mocker.patch("cobra.model_building.models.stats.pearsonr", mock_pearsonr)) model = LinearRegressionModel() model.predictors = ["var1_enc", "var2_enc", "var3_enc"] @@ -204,11 +218,17 @@ def mock_pearsonr(ypred, ytrue): actual = model.compute_variable_importance(data) - expected = pd.DataFrame([ - {"predictor": "var1", "importance": data["var1_enc"].unique()[0]}, - {"predictor": "var2", "importance": data["var2_enc"].unique()[0]}, - {"predictor": "var3", "importance": data["var3_enc"].unique()[0]} - ]).sort_values(by="importance", ascending=False).reset_index(drop=True) + expected = ( + pd.DataFrame( + [ + {"predictor": "var1", "importance": data["var1_enc"].unique()[0]}, + {"predictor": "var2", "importance": data["var2_enc"].unique()[0]}, + {"predictor": "var3", "importance": data["var3_enc"].unique()[0]}, + ] + ) + .sort_values(by="importance", ascending=False) + .reset_index(drop=True) + ) pd.testing.assert_frame_equal(actual, expected) @@ -225,8 +245,8 @@ def test_serialize(self): "copy_X": True, "fit_intercept": True, "n_jobs": None, - "positive": False - } + "positive": False, + }, } assert actual == expected @@ -243,10 +263,10 @@ def test_deserialize(self): "copy_X": True, "fit_intercept": True, "n_jobs": None, - "positive": False + "positive": False, }, "coef_": [[0.5, 0.75]], - "intercept_": [-3] + "intercept_": [-3], } model.deserialize(model_dict) @@ -255,4 +275,3 @@ def test_deserialize(self): assert 
linear.get_params() == model_dict["params"] assert linear.intercept_.all() == (np.array(model_dict["intercept_"]).all()) assert linear.coef_.all() == np.array(model_dict["coef_"]).all() - diff --git a/tests/model_building/test_univariate_selection.py b/tests/model_building/test_univariate_selection.py index c69a4de..dbb257b 100644 --- a/tests/model_building/test_univariate_selection.py +++ b/tests/model_building/test_univariate_selection.py @@ -1,15 +1,15 @@ - import pandas as pd from cobra.model_building import univariate_selection + def mock_data(): - return pd.DataFrame({"var1_enc": [0.42] * 10, - "var2_enc": [0.94] * 10, - "var3_enc": [0.87] * 10}) + return pd.DataFrame( + {"var1_enc": [0.42] * 10, "var2_enc": [0.94] * 10, "var3_enc": [0.87] * 10} + ) -class TestUnivariateSelection: +class TestUnivariateSelection: def test_preselection_classification(self): X = mock_data() @@ -25,19 +25,21 @@ def test_preselection_classification(self): target_column="target", model_type="classification", preselect_auc_threshold=0.48, - preselect_overtrain_threshold=0.05) + preselect_overtrain_threshold=0.05, + ) assert all(c in df_auc.columns for c in ["AUC train", "AUC selection"]) - preselected_predictors = (univariate_selection - .get_preselected_predictors(df_auc)) + preselected_predictors = univariate_selection.get_preselected_predictors(df_auc) assert preselected_predictors == ["var1_enc", "var2_enc", "var3_enc"] def test_preselection_regression(self): X = mock_data() - y = pd.DataFrame([6.0, 9.0, 4.2, 5.5, 0.7, 1.9, 8.7, 8.0, 2.0, 7.2], columns=["target"]) + y = pd.DataFrame( + [6.0, 9.0, 4.2, 5.5, 0.7, 1.9, 8.7, 8.0, 2.0, 7.2], columns=["target"] + ) basetable = pd.concat([y, X], axis=1) basetable["split"] = ["train"] * 3 + ["selection"] * 6 + ["train"] @@ -49,11 +51,13 @@ def test_preselection_regression(self): target_column="target", model_type="regression", preselect_auc_threshold=5, - preselect_overtrain_threshold=0.05) + preselect_overtrain_threshold=0.05, + ) assert all(c in df_rmse.columns for c in ["RMSE train", "RMSE selection"]) - preselected_predictors = (univariate_selection - .get_preselected_predictors(df_rmse)) + preselected_predictors = univariate_selection.get_preselected_predictors( + df_rmse + ) assert preselected_predictors == ["var2_enc", "var3_enc"] diff --git a/tests/preprocessing/test_categorical_data_processor.py b/tests/preprocessing/test_categorical_data_processor.py index dd53434..abcf797 100644 --- a/tests/preprocessing/test_categorical_data_processor.py +++ b/tests/preprocessing/test_categorical_data_processor.py @@ -1,20 +1,17 @@ - import pytest import numpy as np import pandas as pd from cobra.preprocessing import CategoricalDataProcessor -class TestCategoricalDataProcessor: +class TestCategoricalDataProcessor: def test_attributes_to_dict(self): processor = CategoricalDataProcessor() cleaned_categories = ["a", "b", "c"] - processor._cleaned_categories_by_column = { - "variable": set(cleaned_categories) - } + processor._cleaned_categories_by_column = {"variable": set(cleaned_categories)} actual = processor.attributes_to_dict() @@ -29,16 +26,24 @@ def test_attributes_to_dict(self): "forced_categories": {}, "_cleaned_categories_by_column": { "variable": list(set(cleaned_categories)) - } + }, } assert actual == expected - @pytest.mark.parametrize("attribute", - ["regroup", "regroup_name", "keep_missing", - "category_size_threshold", "p_value_threshold", - "scale_contingency_table", "forced_categories", - "_cleaned_categories_by_column"]) + @pytest.mark.parametrize( + 
"attribute", + [ + "regroup", + "regroup_name", + "keep_missing", + "category_size_threshold", + "p_value_threshold", + "scale_contingency_table", + "forced_categories", + "_cleaned_categories_by_column", + ], + ) def test_set_attributes_from_dict(self, attribute): processor = CategoricalDataProcessor() @@ -52,9 +57,7 @@ def test_set_attributes_from_dict(self, attribute): "p_value_threshold": 0.001, "scale_contingency_table": True, "forced_categories": {}, - "_cleaned_categories_by_column": { - "variable": cleaned_categories - } + "_cleaned_categories_by_column": {"variable": cleaned_categories}, } expected = params[attribute] @@ -69,245 +72,408 @@ def test_set_attributes_from_dict(self, attribute): assert actual == expected - @pytest.mark.parametrize("scale_contingency_table, expected", - [(False, 0.01329), - (True, 0.43437)]) + @pytest.mark.parametrize( + "scale_contingency_table, expected", [(False, 0.01329), (True, 0.43437)] + ) def test_compute_p_value_classification(self, scale_contingency_table, expected): - X = pd.Series(data=(["c1"]*70 + ["c2"]*20 + ["c3"]*10)) - y = pd.Series(data=([0]*35 + [1]*35 + [0]*15 + [1]*5 + [0]*8 + [1]*2)) + X = pd.Series(data=(["c1"] * 70 + ["c2"] * 20 + ["c3"] * 10)) + y = pd.Series( + data=([0] * 35 + [1] * 35 + [0] * 15 + [1] * 5 + [0] * 8 + [1] * 2) + ) category = "c1" - actual = (CategoricalDataProcessor - ._compute_p_value(X, y, category, "classification", scale_contingency_table)) + actual = CategoricalDataProcessor._compute_p_value( + X, y, category, "classification", scale_contingency_table + ) assert pytest.approx(actual, abs=1e-5) == expected - @pytest.mark.parametrize("seed, expected", - [(505, 0.02222), - (603, 0.89230)]) + @pytest.mark.parametrize("seed, expected", [(505, 0.02222), (603, 0.89230)]) def test_compute_p_value_regression(self, seed, expected): np.random.seed(seed) - X = pd.Series(data=(["c1"]*70 + ["c2"]*20 + ["c3"]*10)) - y = pd.Series(data=np.random.uniform(0, 1, 100)*5) + X = pd.Series(data=(["c1"] * 70 + ["c2"] * 20 + ["c3"] * 10)) + y = pd.Series(data=np.random.uniform(0, 1, 100) * 5) category = "c1" - actual = (CategoricalDataProcessor - ._compute_p_value(X, y, category, "regression", None)) + actual = CategoricalDataProcessor._compute_p_value( + X, y, category, "regression", None + ) assert pytest.approx(actual, abs=1e-5) == expected def test_get_small_categories(self): - data = pd.Series(data=(["c1"]*50 + ["c2"]*25 + ["c3"]*15 + ["c4"]*5)) + data = pd.Series(data=(["c1"] * 50 + ["c2"] * 25 + ["c3"] * 15 + ["c4"] * 5)) incidence = 0.35 threshold = 10 # to make it easy to manualLy compute expected = {"c3", "c4"} - actual = (CategoricalDataProcessor - ._get_small_categories(data, incidence, threshold)) + actual = CategoricalDataProcessor._get_small_categories( + data, incidence, threshold + ) assert actual == expected def test_replace_missings(self): data = pd.DataFrame({"variable": ["c1", "c2", np.nan, "", " "]}) - expected = pd.DataFrame({"variable": ["c1", "c2", "Missing", "Missing", - "Missing"] - }) - actual = (CategoricalDataProcessor - ._replace_missings(data, ["variable"])) + expected = pd.DataFrame( + {"variable": ["c1", "c2", "Missing", "Missing", "Missing"]} + ) + actual = CategoricalDataProcessor._replace_missings(data, ["variable"]) pd.testing.assert_frame_equal(actual, expected) - @pytest.mark.parametrize("cleaned_categories, expected", - [({"c1", "c2"}, - pd.Series(data=["c1", "c2", "Other", "Other"])), - ({"c1", "c2", "c3", "c4"}, - pd.Series(data=["c1", "c2", "c3", "c4"]))]) + @pytest.mark.parametrize( + 
"cleaned_categories, expected", + [ + ({"c1", "c2"}, pd.Series(data=["c1", "c2", "Other", "Other"])), + ({"c1", "c2", "c3", "c4"}, pd.Series(data=["c1", "c2", "c3", "c4"])), + ], + ) def test_replace_categories(self, cleaned_categories, expected): data = pd.Series(data=["c1", "c2", "c3", "c4"]) - actual = (CategoricalDataProcessor - ._replace_categories(data, cleaned_categories, 'Other')) + actual = CategoricalDataProcessor._replace_categories( + data, cleaned_categories, "Other" + ) pd.testing.assert_series_equal(actual, expected) def test_all_cats_not_significant(self): # Expected - e = {'categorical_var': ['A', 'A', 'A', 'A', - 'B', 'B', 'B', 'B', - 'C', 'C', 'C', 'C'], - 'target': [1, 1, 1, 1, - 0, 0, 0, 0, - 1, 0, 1, 0], - 'categorical_var_processed': ['A', 'A', 'A', 'A', - 'B', 'B', 'B', 'B', - 'C', 'C', 'C', 'C']} + e = { + "categorical_var": [ + "A", + "A", + "A", + "A", + "B", + "B", + "B", + "B", + "C", + "C", + "C", + "C", + ], + "target": [1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0], + "categorical_var_processed": [ + "A", + "A", + "A", + "A", + "B", + "B", + "B", + "B", + "C", + "C", + "C", + "C", + ], + } # data -> actual - d = {'categorical_var': ['A', 'A', 'A', 'A', - 'B', 'B', 'B', 'B', - 'C', 'C', 'C', 'C'], - 'target': [1, 1, 1, 1, - 0, 0, 0, 0, - 1, 0, 1, 0]} + d = { + "categorical_var": [ + "A", + "A", + "A", + "A", + "B", + "B", + "B", + "B", + "C", + "C", + "C", + "C", + ], + "target": [1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0], + } - discrete_vars = ['categorical_var'] - target_column_name = 'target' + discrete_vars = ["categorical_var"] + target_column_name = "target" - data = pd.DataFrame(d, columns=['categorical_var', 'target']) - expected = pd.DataFrame(e, columns=['categorical_var', - 'target', - 'categorical_var_processed']) + data = pd.DataFrame(d, columns=["categorical_var", "target"]) + expected = pd.DataFrame( + e, columns=["categorical_var", "target", "categorical_var_processed"] + ) categorical_data_processor = CategoricalDataProcessor( - category_size_threshold=0, - p_value_threshold=0.0001) + category_size_threshold=0, p_value_threshold=0.0001 + ) - categorical_data_processor.fit(data, - discrete_vars, - target_column_name) + categorical_data_processor.fit(data, discrete_vars, target_column_name) - actual = categorical_data_processor.transform(data, - discrete_vars) + actual = categorical_data_processor.transform(data, discrete_vars) pd.testing.assert_frame_equal(actual, expected) def test_regroup_name(self): # Expected - e = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A', - 'B', 'B', 'B', 'B', 'B', 'B', - 'C', 'C', 'C', 'C', 'C', 'C'], - 'target': [1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, - 1, 0, 1, 0, 1, 0], - 'categorical_var_processed': [ - 'A', 'A', 'A', 'A', 'A', 'A', - 'B', 'B', 'B', 'B', 'B', 'B', - 'OTH', 'OTH', 'OTH', 'OTH', 'OTH', 'OTH']} + e = { + "categorical_var": [ + "A", + "A", + "A", + "A", + "A", + "A", + "B", + "B", + "B", + "B", + "B", + "B", + "C", + "C", + "C", + "C", + "C", + "C", + ], + "target": [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0], + "categorical_var_processed": [ + "A", + "A", + "A", + "A", + "A", + "A", + "B", + "B", + "B", + "B", + "B", + "B", + "OTH", + "OTH", + "OTH", + "OTH", + "OTH", + "OTH", + ], + } # data -> actual - d = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A', - 'B', 'B', 'B', 'B', 'B', 'B', - 'C', 'C', 'C', 'C', 'C', 'C'], - 'target': [1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, - 1, 0, 1, 0, 1, 0]} + d = { + "categorical_var": [ + "A", + "A", + "A", + "A", + "A", + "A", + "B", + "B", + "B", + "B", + "B", + 
"B", + "C", + "C", + "C", + "C", + "C", + "C", + ], + "target": [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0], + } - discrete_vars = ['categorical_var'] - target_column_name = 'target' + discrete_vars = ["categorical_var"] + target_column_name = "target" - data = pd.DataFrame(d, columns=['categorical_var', 'target']) - expected = pd.DataFrame(e, columns=['categorical_var', - 'target', - 'categorical_var_processed']) + data = pd.DataFrame(d, columns=["categorical_var", "target"]) + expected = pd.DataFrame( + e, columns=["categorical_var", "target", "categorical_var_processed"] + ) - expected['categorical_var_processed'] = ( - expected['categorical_var_processed'].astype("category")) + expected["categorical_var_processed"] = expected[ + "categorical_var_processed" + ].astype("category") categorical_data_processor = CategoricalDataProcessor( - category_size_threshold=0, - regroup_name='OTH', - p_value_threshold=0.05) + category_size_threshold=0, regroup_name="OTH", p_value_threshold=0.05 + ) - categorical_data_processor.fit(data, - discrete_vars, - target_column_name) + categorical_data_processor.fit(data, discrete_vars, target_column_name) - actual = categorical_data_processor.transform(data, - discrete_vars) + actual = categorical_data_processor.transform(data, discrete_vars) pd.testing.assert_frame_equal(actual, expected) def test_force_category(self): # Expected - e = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A', - 'B', 'B', 'B', 'B', 'B', 'B', - 'C', 'C', 'C', 'C', 'C', 'C'], - 'target': [1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, - 1, 0, 1, 0, 1, 0], - 'categorical_var_processed': ['A', 'A', 'A', 'A', 'A', 'A', - 'B', 'B', 'B', 'B', 'B', 'B', - 'C', 'C', 'C', 'C', 'C', 'C']} + e = { + "categorical_var": [ + "A", + "A", + "A", + "A", + "A", + "A", + "B", + "B", + "B", + "B", + "B", + "B", + "C", + "C", + "C", + "C", + "C", + "C", + ], + "target": [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0], + "categorical_var_processed": [ + "A", + "A", + "A", + "A", + "A", + "A", + "B", + "B", + "B", + "B", + "B", + "B", + "C", + "C", + "C", + "C", + "C", + "C", + ], + } # data -> actual - d = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A', - 'B', 'B', 'B', 'B', 'B', 'B', - 'C', 'C', 'C', 'C', 'C', 'C'], - 'target': [1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, - 1, 0, 1, 0, 1, 0]} + d = { + "categorical_var": [ + "A", + "A", + "A", + "A", + "A", + "A", + "B", + "B", + "B", + "B", + "B", + "B", + "C", + "C", + "C", + "C", + "C", + "C", + ], + "target": [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0], + } - discrete_vars = ['categorical_var'] - target_column_name = 'target' + discrete_vars = ["categorical_var"] + target_column_name = "target" - data = pd.DataFrame(d, columns=['categorical_var', 'target']) - expected = pd.DataFrame(e, columns=['categorical_var', - 'target', - 'categorical_var_processed']) + data = pd.DataFrame(d, columns=["categorical_var", "target"]) + expected = pd.DataFrame( + e, columns=["categorical_var", "target", "categorical_var_processed"] + ) - expected['categorical_var_processed'] = ( - expected['categorical_var_processed'].astype("category")) + expected["categorical_var_processed"] = expected[ + "categorical_var_processed" + ].astype("category") categorical_data_processor = CategoricalDataProcessor( - category_size_threshold=0, - forced_categories={'categorical_var': ['C']}, - p_value_threshold=0.05) + category_size_threshold=0, + forced_categories={"categorical_var": ["C"]}, + p_value_threshold=0.05, + ) - categorical_data_processor.fit(data, - 
discrete_vars, - target_column_name) + categorical_data_processor.fit(data, discrete_vars, target_column_name) - actual = categorical_data_processor.transform(data, - discrete_vars) + actual = categorical_data_processor.transform(data, discrete_vars) pd.testing.assert_frame_equal(actual, expected) def test_categorical_variable_is_constant(self): # Expected - e = {'categorical_var': ['A', 'A', 'A', 'A', - 'A', 'A', 'A', 'A', - 'A', 'A', 'A', 'A'], - 'target': [1, 1, 1, 1, - 0, 0, 0, 0, - 1, 0, 1, 0], - 'categorical_var_processed': ['A', 'A', 'A', 'A', - 'A', 'A', 'A', 'A', - 'A', 'A', 'A', 'A']} + e = { + "categorical_var": [ + "A", + "A", + "A", + "A", + "A", + "A", + "A", + "A", + "A", + "A", + "A", + "A", + ], + "target": [1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0], + "categorical_var_processed": [ + "A", + "A", + "A", + "A", + "A", + "A", + "A", + "A", + "A", + "A", + "A", + "A", + ], + } # data -> actual - d = {'categorical_var': ['A', 'A', 'A', 'A', - 'A', 'A', 'A', 'A', - 'A', 'A', 'A', 'A'], - 'target': [1, 1, 1, 1, - 0, 0, 0, 0, - 1, 0, 1, 0]} + d = { + "categorical_var": [ + "A", + "A", + "A", + "A", + "A", + "A", + "A", + "A", + "A", + "A", + "A", + "A", + ], + "target": [1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0], + } - discrete_vars = ['categorical_var'] - target_column_name = 'target' + discrete_vars = ["categorical_var"] + target_column_name = "target" - data = pd.DataFrame(d, columns=['categorical_var', 'target']) - expected = pd.DataFrame(e, columns=['categorical_var', - 'target', - 'categorical_var_processed']) + data = pd.DataFrame(d, columns=["categorical_var", "target"]) + expected = pd.DataFrame( + e, columns=["categorical_var", "target", "categorical_var_processed"] + ) - expected['categorical_var_processed'] = ( - expected['categorical_var_processed'].astype("category")) + expected["categorical_var_processed"] = expected[ + "categorical_var_processed" + ].astype("category") categorical_data_processor = CategoricalDataProcessor( - category_size_threshold=0, - p_value_threshold=0.0001) + category_size_threshold=0, p_value_threshold=0.0001 + ) - categorical_data_processor.fit(data, - discrete_vars, - target_column_name) + categorical_data_processor.fit(data, discrete_vars, target_column_name) - actual = categorical_data_processor.transform(data, - discrete_vars) + actual = categorical_data_processor.transform(data, discrete_vars) pd.testing.assert_frame_equal(actual, expected) diff --git a/tests/preprocessing/test_kbins_discretizer.py b/tests/preprocessing/test_kbins_discretizer.py index d3a643a..58831c4 100644 --- a/tests/preprocessing/test_kbins_discretizer.py +++ b/tests/preprocessing/test_kbins_discretizer.py @@ -1,4 +1,3 @@ - from contextlib import contextmanager import pytest import numpy as np @@ -6,6 +5,7 @@ from cobra.preprocessing.kbins_discretizer import KBinsDiscretizer + @contextmanager def does_not_raise(): yield @@ -31,17 +31,24 @@ def test_attributes_to_dict(self): "starting_precision": 0, "label_format": "{} - {}", "change_endpoint_format": False, - "_bins_by_column": {"variable": [[0.0, 3.0], [3.0, 6.0], - [6.0, 9.0]]} + "_bins_by_column": {"variable": [[0.0, 3.0], [3.0, 6.0], [6.0, 9.0]]}, } assert actual == expected - @pytest.mark.parametrize("attribute", - ["n_bins", "strategy", "closed", - "auto_adapt_bins", "starting_precision", - "label_format", "change_endpoint_format", - "_bins_by_column"]) + @pytest.mark.parametrize( + "attribute", + [ + "n_bins", + "strategy", + "closed", + "auto_adapt_bins", + "starting_precision", + "label_format", + 
"change_endpoint_format", + "_bins_by_column", + ], + ) def test_set_attributes_from_dict(self, attribute): discretizer = KBinsDiscretizer() @@ -54,8 +61,7 @@ def test_set_attributes_from_dict(self, attribute): "starting_precision": 1, "label_format": "[,)", "change_endpoint_format": True, - "_bins_by_column": {"variable": [[0.0, 3.0], [3.0, 6.0], - [6.0, 9.0]]} + "_bins_by_column": {"variable": [[0.0, 3.0], [3.0, 6.0], [6.0, 9.0]]}, } expected = params[attribute] @@ -72,9 +78,10 @@ def test_set_attributes_from_dict(self, attribute): assert actual == expected # no further tests here as this is just a wrapper around _fit_column! - @pytest.mark.parametrize("strategy, expectation", - [("trees", pytest.raises(ValueError)), - ("quantile", does_not_raise())]) + @pytest.mark.parametrize( + "strategy, expectation", + [("trees", pytest.raises(ValueError)), ("quantile", does_not_raise())], + ) def test_fit_exception(self, strategy, expectation): discretizer = KBinsDiscretizer(strategy=strategy) @@ -84,10 +91,14 @@ def test_fit_exception(self, strategy, expectation): discretizer.fit(data, ["variable"]) # no further tests here as this is just a wrapper around _transform_column! - @pytest.mark.parametrize("scenario, expectation", - [("raise", pytest.raises(ValueError)), - ("regular_test", does_not_raise()), - ("constant_data", does_not_raise())]) + @pytest.mark.parametrize( + "scenario, expectation", + [ + ("raise", pytest.raises(ValueError)), + ("regular_test", does_not_raise()), + ("constant_data", does_not_raise()), + ], + ) def test_transform(self, scenario, expectation): discretizer = KBinsDiscretizer(n_bins=3, strategy="uniform") @@ -104,12 +115,11 @@ def test_transform(self, scenario, expectation): discretizer.fit(data, ["variable"]) categories = ["0.0 - 3.0", "3.0 - 6.0", "6.0 - 9.0", "Missing"] - expected["variable_bin"] = pd.Categorical(["0.0 - 3.0"]*4 - + ["3.0 - 6.0"]*3 - + ["6.0 - 9.0"]*3 - + ["Missing"], - categories=categories, - ordered=True) + expected["variable_bin"] = pd.Categorical( + ["0.0 - 3.0"] * 4 + ["3.0 - 6.0"] * 3 + ["6.0 - 9.0"] * 3 + ["Missing"], + categories=categories, + ordered=True, + ) elif scenario == "constant_data": discretizer.fit(data, ["variable"]) @@ -118,10 +128,14 @@ def test_transform(self, scenario, expectation): pd.testing.assert_frame_equal(actual, expected) # ---------------- Test for private methods ---------------- - @pytest.mark.parametrize("n_bins, expectation", - [(1, pytest.raises(ValueError)), - (10.5, pytest.raises(ValueError)), - (2, does_not_raise())]) + @pytest.mark.parametrize( + "n_bins, expectation", + [ + (1, pytest.raises(ValueError)), + (10.5, pytest.raises(ValueError)), + (2, does_not_raise()), + ], + ) def test_validate_n_bins_exception(self, n_bins, expectation): with expectation: assert KBinsDiscretizer()._validate_n_bins(n_bins=n_bins) is None @@ -138,79 +152,103 @@ def test_transform_column(self): categories = ["0.0 - 3.0", "3.0 - 6.0", "6.0 - 9.0", "Missing"] expected = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]}) - expected["variable_bin"] = pd.Categorical(["0.0 - 3.0"]*4 - + ["3.0 - 6.0"]*3 - + ["6.0 - 9.0"]*3 - + ["Missing"], - categories=categories, - ordered=True) + expected["variable_bin"] = pd.Categorical( + ["0.0 - 3.0"] * 4 + ["3.0 - 6.0"] * 3 + ["6.0 - 9.0"] * 3 + ["Missing"], + categories=categories, + ordered=True, + ) # assert using pandas testing module pd.testing.assert_frame_equal(actual, expected) - @pytest.mark.parametrize("n_bins, auto_adapt_bins, data, expected", - [(4, False, - 
pd.DataFrame({"variable": list(range(0, 11))}), - [(0.0, 2.0), (2.0, 5.0), (5.0, 8.0), - (8.0, 10.0)]), - (10, True, - # ints from 0-10 with 17 nan's - pd.DataFrame({"variable": list(range(0, 11)) + - ([np.nan] * 17)}), - [(0.0, 2.0), (2.0, 5.0), (5.0, 8.0), - (8.0, 10.0)]), - (10, False, - # almost constant - pd.DataFrame({"variable": [0] + ([1] * 100)}), - None), - (2, False, - pd.DataFrame({"variable": [5.4, 9.3, np.inf]}), - None)], - ids=["regular", "auto_adapt_bins", - "two bin edges", "infs"]) + @pytest.mark.parametrize( + "n_bins, auto_adapt_bins, data, expected", + [ + ( + 4, + False, + pd.DataFrame({"variable": list(range(0, 11))}), + [(0.0, 2.0), (2.0, 5.0), (5.0, 8.0), (8.0, 10.0)], + ), + ( + 10, + True, + # ints from 0-10 with 17 nan's + pd.DataFrame({"variable": list(range(0, 11)) + ([np.nan] * 17)}), + [(0.0, 2.0), (2.0, 5.0), (5.0, 8.0), (8.0, 10.0)], + ), + ( + 10, + False, + # almost constant + pd.DataFrame({"variable": [0] + ([1] * 100)}), + None, + ), + (2, False, pd.DataFrame({"variable": [5.4, 9.3, np.inf]}), None), + ], + ids=["regular", "auto_adapt_bins", "two bin edges", "infs"], + ) def test_fit_column(self, n_bins, auto_adapt_bins, data, expected): - discretizer = KBinsDiscretizer(n_bins=n_bins, - auto_adapt_bins=auto_adapt_bins) + discretizer = KBinsDiscretizer(n_bins=n_bins, auto_adapt_bins=auto_adapt_bins) actual = discretizer._fit_column(data, column_name="variable") assert actual == expected - @pytest.mark.parametrize("strategy, n_bins, data, expected", - [("quantile", # strategy - 4, # n_bins - # data (ints from 0 - 10): - pd.DataFrame({"variable": list(range(0, 11))}), - [0.0, 2.5, 5, 7.5, 10.0]), # expected result - ("uniform", # strategy - 3, # n_bins - # data (ints from 0 - 9): - pd.DataFrame({"variable": list(range(0, 10))}), - [0.0, 3.0, 6.0, 9.0])], # expected result - ids=["quantile", "uniform"]) + @pytest.mark.parametrize( + "strategy, n_bins, data, expected", + [ + ( + "quantile", # strategy + 4, # n_bins + # data (ints from 0 - 10): + pd.DataFrame({"variable": list(range(0, 11))}), + [0.0, 2.5, 5, 7.5, 10.0], + ), # expected result + ( + "uniform", # strategy + 3, # n_bins + # data (ints from 0 - 9): + pd.DataFrame({"variable": list(range(0, 10))}), + [0.0, 3.0, 6.0, 9.0], + ), + ], # expected result + ids=["quantile", "uniform"], + ) def test_compute_bin_edges(self, strategy, n_bins, data, expected): discretizer = KBinsDiscretizer(strategy=strategy) - actual = discretizer._compute_bin_edges(data, column_name="variable", - n_bins=n_bins, - col_min=data.variable.min(), - col_max=data.variable.max()) + actual = discretizer._compute_bin_edges( + data, + column_name="variable", + n_bins=n_bins, + col_min=data.variable.min(), + col_max=data.variable.max(), + ) assert actual == expected - @pytest.mark.parametrize("bin_edges, starting_precision, expected", - [([-10, 0, 1, 2], 1, 1), - ([-10, 0, 1, 1.01], 0, 2), - ([-10, 0, 1, 1.1], 1, 1), - ([-10, 0, 1, 2], -1, 0), - ([-10, 0, 10, 21], -1, -1)], - ids=["less precision", "more precision", - "equal precision", "negative start", - "round up"]) - def test_compute_minimal_precision_of_bin_edges(self, bin_edges, - starting_precision, - expected): + @pytest.mark.parametrize( + "bin_edges, starting_precision, expected", + [ + ([-10, 0, 1, 2], 1, 1), + ([-10, 0, 1, 1.01], 0, 2), + ([-10, 0, 1, 1.1], 1, 1), + ([-10, 0, 1, 2], -1, 0), + ([-10, 0, 10, 21], -1, -1), + ], + ids=[ + "less precision", + "more precision", + "equal precision", + "negative start", + "round up", + ], + ) + def 
test_compute_minimal_precision_of_bin_edges( + self, bin_edges, starting_precision, expected + ): discretizer = KBinsDiscretizer(starting_precision=starting_precision) @@ -218,11 +256,14 @@ def test_compute_minimal_precision_of_bin_edges(self, bin_edges, assert actual == expected - @pytest.mark.parametrize("bin_edges, expected", - [([0, 1, 1.5, 2], [(0, 1), (1, 1.5), (1.5, 2)]), - ([0, 1, 1.5, 3], [(0, 1), (1, 2), (2, 3)]), - ([np.inf, 0.0, -np.inf], - [(np.inf, 0.0), (0.0, -np.inf)])]) + @pytest.mark.parametrize( + "bin_edges, expected", + [ + ([0, 1, 1.5, 2], [(0, 1), (1, 1.5), (1.5, 2)]), + ([0, 1, 1.5, 3], [(0, 1), (1, 2), (2, 3)]), + ([np.inf, 0.0, -np.inf], [(np.inf, 0.0), (0.0, -np.inf)]), + ], + ) def test_compute_bins_from_edges(self, bin_edges, expected): discretizer = KBinsDiscretizer() @@ -230,21 +271,19 @@ def test_compute_bins_from_edges(self, bin_edges, expected): assert actual == expected - @pytest.mark.parametrize("change_endpoint_format, closed, bins, expected", - [(False, "right", [(0, 1), (1, 2), (2, 3)], - ["0 - 1", "1 - 2", "2 - 3"]), - (True, "right", [(0, 1), (1, 2), (2, 3)], - ["<= 1", "1 - 2", "> 2"]), - (True, "left", [(0, 1), (1, 2), (2, 3)], - ["< 1", "1 - 2", ">= 2"])], - ids=["standard format", "different endpoints", - "different endpoints left"]) - def test_create_bin_labels(self, change_endpoint_format, closed, - bins, expected): + @pytest.mark.parametrize( + "change_endpoint_format, closed, bins, expected", + [ + (False, "right", [(0, 1), (1, 2), (2, 3)], ["0 - 1", "1 - 2", "2 - 3"]), + (True, "right", [(0, 1), (1, 2), (2, 3)], ["<= 1", "1 - 2", "> 2"]), + (True, "left", [(0, 1), (1, 2), (2, 3)], ["< 1", "1 - 2", ">= 2"]), + ], + ids=["standard format", "different endpoints", "different endpoints left"], + ) + def test_create_bin_labels(self, change_endpoint_format, closed, bins, expected): discretizer = KBinsDiscretizer( - closed=closed, - change_endpoint_format=change_endpoint_format + closed=closed, change_endpoint_format=change_endpoint_format ) actual = discretizer._create_bin_labels(bins) diff --git a/tests/preprocessing/test_preprocessor.py b/tests/preprocessing/test_preprocessor.py index 08f5b63..ba4c579 100644 --- a/tests/preprocessing/test_preprocessor.py +++ b/tests/preprocessing/test_preprocessor.py @@ -161,83 +161,638 @@ def test_get_variable_list( assert actual == expected @pytest.mark.parametrize( - ("input, expected"), - [ - # example 1 - ( - pd.DataFrame({ - "ID": list(range(20)), - "A": [1,2,3,4,5,6,7,8,9,9,8,9,8,9,6,5,6,6,9,8], - "B": ["Cat"] *5 + ["Dog"]*10 + ["Fish"]*5, - "C": [1,2,3,4,9,10,11,12,13,5,6,7,8,15,19,18,14,16,13,17], - "Target": [1]*2 + [0]*5 + [1]*3 + [0]*5 + [1]*5 - } - ), - pd.DataFrame({ - 'ID': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], - 'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8], - 'B': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'], - 'C': [1, 2, 3, 4, 9, 10, 11, 12, 13, 5, 6, 7, 8, 15, 19, 18, 14, 16, 13, 17], - 'Target': [1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], - 'C_bin': ['1.0 - 3.0','1.0 - 3.0','1.0 - 3.0','3.0 - 5.0','7.0 - 9.0','9.0 - 10.0','10.0 - 12.0','10.0 - 12.0','12.0 - 13.0','3.0 - 5.0','5.0 - 7.0','5.0 - 7.0','7.0 - 9.0','13.0 - 15.0','17.0 - 19.0','17.0 - 19.0','13.0 - 15.0','15.0 - 17.0','12.0 - 13.0','15.0 - 17.0'], - 'B_processed': 
['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'], - 'A_processed': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8], - 'B_enc': [0.4,0.4,0.4,0.4,0.4,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,1.0,1.0,1.0,1.0,1.0], - 'A_enc': [1.0,1.0,0.0,0.0,0.5,0.5,0.0,0.5,0.6,0.6,0.5,0.6,0.5,0.6,0.5,0.5,0.5,0.5,0.6,0.5], - 'C_enc': [0.6666666666666666,0.6666666666666666,0.6666666666666666,0.5,0.0,0.0,0.5,0.5,1.0,0.5,0.0,0.0,0.0,0.5,0.5,0.5,0.5,1.0,1.0,1.0] - } - ), - ) - ] + ("input, expected"), + [ + # example 1 + ( + pd.DataFrame( + { + "ID": list(range(20)), + "A": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 9, + 8, + 9, + 8, + 9, + 6, + 5, + 6, + 6, + 9, + 8, + ], + "B": ["Cat"] * 5 + ["Dog"] * 10 + ["Fish"] * 5, + "C": [ + 1, + 2, + 3, + 4, + 9, + 10, + 11, + 12, + 13, + 5, + 6, + 7, + 8, + 15, + 19, + 18, + 14, + 16, + 13, + 17, + ], + "Target": [1] * 2 + [0] * 5 + [1] * 3 + [0] * 5 + [1] * 5, + } + ), + pd.DataFrame( + { + "ID": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + ], + "A": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 9, + 8, + 9, + 8, + 9, + 6, + 5, + 6, + 6, + 9, + 8, + ], + "B": [ + "Cat", + "Cat", + "Cat", + "Cat", + "Cat", + "Dog", + "Dog", + "Dog", + "Dog", + "Dog", + "Dog", + "Dog", + "Dog", + "Dog", + "Dog", + "Fish", + "Fish", + "Fish", + "Fish", + "Fish", + ], + "C": [ + 1, + 2, + 3, + 4, + 9, + 10, + 11, + 12, + 13, + 5, + 6, + 7, + 8, + 15, + 19, + 18, + 14, + 16, + 13, + 17, + ], + "Target": [ + 1, + 1, + 0, + 0, + 0, + 0, + 0, + 1, + 1, + 1, + 0, + 0, + 0, + 0, + 0, + 1, + 1, + 1, + 1, + 1, + ], + "C_bin": [ + "1.0 - 3.0", + "1.0 - 3.0", + "1.0 - 3.0", + "3.0 - 5.0", + "7.0 - 9.0", + "9.0 - 10.0", + "10.0 - 12.0", + "10.0 - 12.0", + "12.0 - 13.0", + "3.0 - 5.0", + "5.0 - 7.0", + "5.0 - 7.0", + "7.0 - 9.0", + "13.0 - 15.0", + "17.0 - 19.0", + "17.0 - 19.0", + "13.0 - 15.0", + "15.0 - 17.0", + "12.0 - 13.0", + "15.0 - 17.0", + ], + "B_processed": [ + "Cat", + "Cat", + "Cat", + "Cat", + "Cat", + "Dog", + "Dog", + "Dog", + "Dog", + "Dog", + "Dog", + "Dog", + "Dog", + "Dog", + "Dog", + "Fish", + "Fish", + "Fish", + "Fish", + "Fish", + ], + "A_processed": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 9, + 8, + 9, + 8, + 9, + 6, + 5, + 6, + 6, + 9, + 8, + ], + "B_enc": [ + 0.4, + 0.4, + 0.4, + 0.4, + 0.4, + 0.3, + 0.3, + 0.3, + 0.3, + 0.3, + 0.3, + 0.3, + 0.3, + 0.3, + 0.3, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + ], + "A_enc": [ + 1.0, + 1.0, + 0.0, + 0.0, + 0.5, + 0.5, + 0.0, + 0.5, + 0.6, + 0.6, + 0.5, + 0.6, + 0.5, + 0.6, + 0.5, + 0.5, + 0.5, + 0.5, + 0.6, + 0.5, + ], + "C_enc": [ + 0.6666666666666666, + 0.6666666666666666, + 0.6666666666666666, + 0.5, + 0.0, + 0.0, + 0.5, + 0.5, + 1.0, + 0.5, + 0.0, + 0.0, + 0.0, + 0.5, + 0.5, + 0.5, + 0.5, + 1.0, + 1.0, + 1.0, + ], + } + ), + ) + ], ) def test_fit_transform_without_id_col_name(self, input, expected): - + preprocessor = PreProcessor.from_params(model_type="classification") - - continuous_vars, discrete_vars = preprocessor.get_continuous_and_discrete_columns(input, "ID","Target") + + ( + continuous_vars, + discrete_vars, + ) = preprocessor.get_continuous_and_discrete_columns(input, "ID", "Target") calculated = preprocessor.fit_transform( input, continuous_vars=continuous_vars, discrete_vars=discrete_vars, - target_column_name="Target" - ) - pd.testing.assert_frame_equal(calculated, expected, check_dtype=False, check_categorical=False) + target_column_name="Target", + 
) + pd.testing.assert_frame_equal( + calculated, expected, check_dtype=False, check_categorical=False + ) @pytest.mark.parametrize( - ("input, expected"), - [ - # example 1 - ( - pd.DataFrame({ - "ID": list(range(20)), - "A": [1,2,3,4,5,6,7,8,9,9,8,9,8,9,6,5,6,6,9,8], - "B": ["Cat"] *5 + ["Dog"]*10 + ["Fish"]*5, - "C": [1,2,3,4,9,10,11,12,13,5,6,7,8,15,19,18,14,16,13,17], - "Target": [1]*2 + [0]*5 + [1]*3 + [0]*5 + [1]*5 - } - ), - pd.DataFrame({ - 'ID': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], - 'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8], - 'B': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'], - 'C': [1, 2, 3, 4, 9, 10, 11, 12, 13, 5, 6, 7, 8, 15, 19, 18, 14, 16, 13, 17], - 'Target': [1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], - 'C_bin': ['1.0 - 3.0','1.0 - 3.0','1.0 - 3.0','3.0 - 5.0','7.0 - 9.0','9.0 - 10.0','10.0 - 12.0','10.0 - 12.0','12.0 - 13.0','3.0 - 5.0','5.0 - 7.0','5.0 - 7.0','7.0 - 9.0','13.0 - 15.0','17.0 - 19.0','17.0 - 19.0','13.0 - 15.0','15.0 - 17.0','12.0 - 13.0','15.0 - 17.0'], - 'B_processed': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'], - 'A_processed': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8], - 'B_enc': [0.4,0.4,0.4,0.4,0.4,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,1.0,1.0,1.0,1.0,1.0], - 'A_enc': [1.0,1.0,0.0,0.0,0.5,0.5,0.0,0.5,0.6,0.6,0.5,0.6,0.5,0.6,0.5,0.5,0.5,0.5,0.6,0.5], - 'C_enc': [0.6666666666666666,0.6666666666666666,0.6666666666666666,0.5,0.0,0.0,0.5,0.5,1.0,0.5,0.0,0.0,0.0,0.5,0.5,0.5,0.5,1.0,1.0,1.0] - } - ), - ) - ] + ("input, expected"), + [ + # example 1 + ( + pd.DataFrame( + { + "ID": list(range(20)), + "A": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 9, + 8, + 9, + 8, + 9, + 6, + 5, + 6, + 6, + 9, + 8, + ], + "B": ["Cat"] * 5 + ["Dog"] * 10 + ["Fish"] * 5, + "C": [ + 1, + 2, + 3, + 4, + 9, + 10, + 11, + 12, + 13, + 5, + 6, + 7, + 8, + 15, + 19, + 18, + 14, + 16, + 13, + 17, + ], + "Target": [1] * 2 + [0] * 5 + [1] * 3 + [0] * 5 + [1] * 5, + } + ), + pd.DataFrame( + { + "ID": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + ], + "A": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 9, + 8, + 9, + 8, + 9, + 6, + 5, + 6, + 6, + 9, + 8, + ], + "B": [ + "Cat", + "Cat", + "Cat", + "Cat", + "Cat", + "Dog", + "Dog", + "Dog", + "Dog", + "Dog", + "Dog", + "Dog", + "Dog", + "Dog", + "Dog", + "Fish", + "Fish", + "Fish", + "Fish", + "Fish", + ], + "C": [ + 1, + 2, + 3, + 4, + 9, + 10, + 11, + 12, + 13, + 5, + 6, + 7, + 8, + 15, + 19, + 18, + 14, + 16, + 13, + 17, + ], + "Target": [ + 1, + 1, + 0, + 0, + 0, + 0, + 0, + 1, + 1, + 1, + 0, + 0, + 0, + 0, + 0, + 1, + 1, + 1, + 1, + 1, + ], + "C_bin": [ + "1.0 - 3.0", + "1.0 - 3.0", + "1.0 - 3.0", + "3.0 - 5.0", + "7.0 - 9.0", + "9.0 - 10.0", + "10.0 - 12.0", + "10.0 - 12.0", + "12.0 - 13.0", + "3.0 - 5.0", + "5.0 - 7.0", + "5.0 - 7.0", + "7.0 - 9.0", + "13.0 - 15.0", + "17.0 - 19.0", + "17.0 - 19.0", + "13.0 - 15.0", + "15.0 - 17.0", + "12.0 - 13.0", + "15.0 - 17.0", + ], + "B_processed": [ + "Cat", + "Cat", + "Cat", + "Cat", + "Cat", + "Dog", + "Dog", + "Dog", + "Dog", + "Dog", + "Dog", + "Dog", + "Dog", + "Dog", + "Dog", + "Fish", + "Fish", + "Fish", + "Fish", + "Fish", + ], + "A_processed": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 9, + 8, + 9, + 8, + 9, + 6, + 5, + 6, + 6, + 9, + 8, + ], 
+ "B_enc": [ + 0.4, + 0.4, + 0.4, + 0.4, + 0.4, + 0.3, + 0.3, + 0.3, + 0.3, + 0.3, + 0.3, + 0.3, + 0.3, + 0.3, + 0.3, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + ], + "A_enc": [ + 1.0, + 1.0, + 0.0, + 0.0, + 0.5, + 0.5, + 0.0, + 0.5, + 0.6, + 0.6, + 0.5, + 0.6, + 0.5, + 0.6, + 0.5, + 0.5, + 0.5, + 0.5, + 0.6, + 0.5, + ], + "C_enc": [ + 0.6666666666666666, + 0.6666666666666666, + 0.6666666666666666, + 0.5, + 0.0, + 0.0, + 0.5, + 0.5, + 1.0, + 0.5, + 0.0, + 0.0, + 0.0, + 0.5, + 0.5, + 0.5, + 0.5, + 1.0, + 1.0, + 1.0, + ], + } + ), + ) + ], ) def test_fit_transform_with_id_col_name(self, input, expected): - + preprocessor = PreProcessor.from_params(model_type="classification") - + # continuous_vars, discrete_vars = preprocessor.get_continous_and_discreate_columns(input, "ID","Target") calculated = preprocessor.fit_transform( @@ -245,9 +800,11 @@ def test_fit_transform_with_id_col_name(self, input, expected): continuous_vars=None, discrete_vars=None, target_column_name="Target", - id_col_name="ID" - ) - pd.testing.assert_frame_equal(calculated, expected, check_dtype=False, check_categorical=False) + id_col_name="ID", + ) + pd.testing.assert_frame_equal( + calculated, expected, check_dtype=False, check_categorical=False + ) @staticmethod def mock_transform(df: pd.DataFrame, args): diff --git a/tests/preprocessing/test_target_encoder.py b/tests/preprocessing/test_target_encoder.py index 51ebd79..525317b 100644 --- a/tests/preprocessing/test_target_encoder.py +++ b/tests/preprocessing/test_target_encoder.py @@ -1,12 +1,11 @@ - import pytest import pandas as pd from sklearn.exceptions import NotFittedError from cobra.preprocessing.target_encoder import TargetEncoder -class TestTargetEncoder: +class TestTargetEncoder: def test_target_encoder_constructor_weight_value_error(self): with pytest.raises(ValueError): TargetEncoder(weight=-1) @@ -19,8 +18,10 @@ def test_target_encoder_constructor_imputation_value_error(self): def test_target_encoder_attributes_to_dict(self): encoder = TargetEncoder() - mapping_data = pd.Series(data=[0.333333, 0.50000, 0.666667], - index=["negative", "neutral", "positive"]) + mapping_data = pd.Series( + data=[0.333333, 0.50000, 0.666667], + index=["negative", "neutral", "positive"], + ) mapping_data.index.name = "variable" encoder._mapping["variable"] = mapping_data @@ -29,20 +30,24 @@ def test_target_encoder_attributes_to_dict(self): actual = encoder.attributes_to_dict() - expected = {"weight": 0.0, - "imputation_strategy": "mean", - "_global_mean": 0.5, - "_mapping": {"variable": { - "negative": 0.333333, - "neutral": 0.50000, - "positive": 0.666667 - }}} + expected = { + "weight": 0.0, + "imputation_strategy": "mean", + "_global_mean": 0.5, + "_mapping": { + "variable": { + "negative": 0.333333, + "neutral": 0.50000, + "positive": 0.666667, + } + }, + } assert actual == expected - @pytest.mark.parametrize("attribute", - ["weight", "mapping"], - ids=["test_weight", "test_mapping"]) + @pytest.mark.parametrize( + "attribute", ["weight", "mapping"], ids=["test_weight", "test_mapping"] + ) def test_target_encoder_set_attributes_from_dict_unfitted(self, attribute): encoder = TargetEncoder() @@ -63,18 +68,24 @@ def test_target_encoder_set_attributes_from_dict_unfitted(self, attribute): def test_target_encoder_set_attributes_from_dict(self): encoder = TargetEncoder() - data = {"weight": 0.0, - "_global_mean": 0.5, - "_mapping": {"variable": { + data = { + "weight": 0.0, + "_global_mean": 0.5, + "_mapping": { + "variable": { "negative": 0.333333, "neutral": 0.50000, - "positive": 
0.666667 - }}} + "positive": 0.666667, + } + }, + } encoder.set_attributes_from_dict(data) - expected = pd.Series(data=[0.333333, 0.50000, 0.666667], - index=["negative", "neutral", "positive"]) + expected = pd.Series( + data=[0.333333, 0.50000, 0.666667], + index=["negative", "neutral", "positive"], + ) expected.index.name = "variable" actual = encoder._mapping["variable"] @@ -83,63 +94,118 @@ def test_target_encoder_set_attributes_from_dict(self): # Tests for _fit_column: def test_target_encoder_fit_column_binary_classification(self): - df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 'neutral'], - 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) + df = pd.DataFrame( + { + "variable": [ + "positive", + "positive", + "negative", + "neutral", + "negative", + "positive", + "negative", + "neutral", + "neutral", + "neutral", + ], + "target": [1, 1, 0, 0, 1, 0, 0, 0, 1, 1], + } + ) encoder = TargetEncoder() encoder._global_mean = 0.5 actual = encoder._fit_column(X=df.variable, y=df.target) - expected = pd.Series(data=[0.333333, 0.50000, 0.666667], - index=["negative", "neutral", "positive"]) + expected = pd.Series( + data=[0.333333, 0.50000, 0.666667], + index=["negative", "neutral", "positive"], + ) expected.index.name = "variable" pd.testing.assert_series_equal(actual, expected) def test_target_encoder_fit_column_linear_regression(self): - df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 'neutral', 'positive'], - 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]}) + df = pd.DataFrame( + { + "variable": [ + "positive", + "positive", + "negative", + "neutral", + "negative", + "positive", + "negative", + "neutral", + "neutral", + "neutral", + "positive", + ], + "target": [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4], + } + ) encoder = TargetEncoder() encoder._global_mean = 0.454545 actual = encoder._fit_column(X=df.variable, y=df.target) - expected = pd.Series(data=[-4.666667, 0.250000, 4.500000], - index=["negative", "neutral", "positive"]) + expected = pd.Series( + data=[-4.666667, 0.250000, 4.500000], + index=["negative", "neutral", "positive"], + ) expected.index.name = "variable" pd.testing.assert_series_equal(actual, expected) def test_target_encoder_fit_column_global_mean_binary_classification(self): - df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 'neutral'], - 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) + df = pd.DataFrame( + { + "variable": [ + "positive", + "positive", + "negative", + "neutral", + "negative", + "positive", + "negative", + "neutral", + "neutral", + "neutral", + ], + "target": [1, 1, 0, 0, 1, 0, 0, 0, 1, 1], + } + ) encoder = TargetEncoder(weight=1) encoder._global_mean = df.target.sum() / df.target.count() # is 0.5 actual = encoder._fit_column(X=df.variable, y=df.target) - expected = pd.Series(data=[0.375, 0.500, 0.625], - index=["negative", "neutral", "positive"]) + expected = pd.Series( + data=[0.375, 0.500, 0.625], index=["negative", "neutral", "positive"] + ) expected.index.name = "variable" pd.testing.assert_series_equal(actual, expected) def test_target_encoder_fit_column_global_mean_linear_regression(self): - df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 'neutral', 'positive'], - 'target': [5, 4, -5, 0, 
-4, 5, -5, 0, 1, 0, 4]}) + df = pd.DataFrame( + { + "variable": [ + "positive", + "positive", + "negative", + "neutral", + "negative", + "positive", + "negative", + "neutral", + "neutral", + "neutral", + "positive", + ], + "target": [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4], + } + ) encoder = TargetEncoder(weight=1) encoder._global_mean = 0.454545 @@ -149,10 +215,14 @@ def test_target_encoder_fit_column_global_mean_linear_regression(self): # expected new value: # [count of the value * its mean encoding + weight (= 1) * global mean] # / [count of the value + weight (=1)]. - expected = pd.Series(data=[(3 * -4.666667 + 1 * 0.454545) / (3 + 1), - (4 * 0.250000 + 1 * 0.454545) / (4 + 1), - (4 * 4.500000 + 1 * 0.454545) / (4 + 1)], - index=["negative", "neutral", "positive"]) + expected = pd.Series( + data=[ + (3 * -4.666667 + 1 * 0.454545) / (3 + 1), + (4 * 0.250000 + 1 * 0.454545) / (4 + 1), + (4 * 4.500000 + 1 * 0.454545) / (4 + 1), + ], + index=["negative", "neutral", "positive"], + ) expected.index.name = "variable" pd.testing.assert_series_equal(actual, expected) @@ -161,17 +231,31 @@ def test_target_encoder_fit_column_global_mean_linear_regression(self): def test_target_encoder_fit_binary_classification(self): # test_target_encoder_fit_column_linear_regression() tested on one # column input as a numpy series; this test runs on a dataframe input. - df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 'neutral'], - 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) + df = pd.DataFrame( + { + "variable": [ + "positive", + "positive", + "negative", + "neutral", + "negative", + "positive", + "negative", + "neutral", + "neutral", + "neutral", + ], + "target": [1, 1, 0, 0, 1, 0, 0, 0, 1, 1], + } + ) encoder = TargetEncoder() encoder.fit(data=df, column_names=["variable"], target_column="target") - expected = pd.Series(data=[0.333333, 0.50000, 0.666667], - index=["negative", "neutral", "positive"]) + expected = pd.Series( + data=[0.333333, 0.50000, 0.666667], + index=["negative", "neutral", "positive"], + ) expected.index.name = "variable" actual = encoder._mapping["variable"] @@ -180,17 +264,32 @@ def test_target_encoder_fit_binary_classification(self): def test_target_encoder_fit_linear_regression(self): # test_target_encoder_fit_column_linear_regression() tested on one # column input as a numpy series; this test runs on a dataframe input. 
- df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 'neutral', 'positive'], - 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]}) + df = pd.DataFrame( + { + "variable": [ + "positive", + "positive", + "negative", + "neutral", + "negative", + "positive", + "negative", + "neutral", + "neutral", + "neutral", + "positive", + ], + "target": [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4], + } + ) encoder = TargetEncoder() encoder.fit(data=df, column_names=["variable"], target_column="target") - expected = pd.Series(data=[-4.666667, 0.250000, 4.500000], - index=["negative", "neutral", "positive"]) + expected = pd.Series( + data=[-4.666667, 0.250000, 4.500000], + index=["negative", "neutral", "positive"], + ) expected.index.name = "variable" actual = encoder._mapping["variable"] @@ -198,11 +297,23 @@ def test_target_encoder_fit_linear_regression(self): # Tests for transform method def test_target_encoder_transform_when_not_fitted(self): - df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 'neutral'], - 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) + df = pd.DataFrame( + { + "variable": [ + "positive", + "positive", + "negative", + "neutral", + "negative", + "positive", + "negative", + "neutral", + "neutral", + "neutral", + ], + "target": [1, 1, 0, 0, 1, 0, 0, 0, 1, 1], + } + ) # inputs of TargetEncoder will be of dtype category df["variable"] = df["variable"].astype("category") @@ -212,19 +323,40 @@ def test_target_encoder_transform_when_not_fitted(self): encoder.transform(data=df, column_names=["variable"]) def test_target_encoder_transform_binary_classification(self): - df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 'neutral'], - 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) + df = pd.DataFrame( + { + "variable": [ + "positive", + "positive", + "negative", + "neutral", + "negative", + "positive", + "negative", + "neutral", + "neutral", + "neutral", + ], + "target": [1, 1, 0, 0, 1, 0, 0, 0, 1, 1], + } + ) # inputs of TargetEncoder will be of dtype category df["variable"] = df["variable"].astype("category") expected = df.copy() - expected["variable_enc"] = [0.666667, 0.666667, 0.333333, 0.50000, - 0.333333, 0.666667, 0.333333, 0.50000, - 0.50000, 0.50000] + expected["variable_enc"] = [ + 0.666667, + 0.666667, + 0.333333, + 0.50000, + 0.333333, + 0.666667, + 0.333333, + 0.50000, + 0.50000, + 0.50000, + ] encoder = TargetEncoder() encoder.fit(data=df, column_names=["variable"], target_column="target") @@ -233,19 +365,42 @@ def test_target_encoder_transform_binary_classification(self): pd.testing.assert_frame_equal(actual, expected) def test_target_encoder_transform_linear_regression(self): - df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 'neutral', 'positive'], - 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]}) + df = pd.DataFrame( + { + "variable": [ + "positive", + "positive", + "negative", + "neutral", + "negative", + "positive", + "negative", + "neutral", + "neutral", + "neutral", + "positive", + ], + "target": [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4], + } + ) # inputs of TargetEncoder will be of dtype category df["variable"] = df["variable"].astype("category") expected = df.copy() - expected["variable_enc"] = [4.500000, 4.500000, -4.666667, 
0.250000, - -4.666667, 4.500000, -4.666667, 0.250000, - 0.250000, 0.250000, 4.500000] + expected["variable_enc"] = [ + 4.500000, + 4.500000, + -4.666667, + 0.250000, + -4.666667, + 4.500000, + -4.666667, + 0.250000, + 0.250000, + 0.250000, + 4.500000, + ] encoder = TargetEncoder() encoder.fit(data=df, column_names=["variable"], target_column="target") @@ -254,23 +409,44 @@ def test_target_encoder_transform_linear_regression(self): pd.testing.assert_frame_equal(actual, expected) def test_target_encoder_transform_new_category_binary_classification(self): - df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 'neutral'], - 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) - - df_appended = df.append({"variable": "new", "target": 1}, - ignore_index=True) + df = pd.DataFrame( + { + "variable": [ + "positive", + "positive", + "negative", + "neutral", + "negative", + "positive", + "negative", + "neutral", + "neutral", + "neutral", + ], + "target": [1, 1, 0, 0, 1, 0, 0, 0, 1, 1], + } + ) + + df_appended = df.append({"variable": "new", "target": 1}, ignore_index=True) # inputs of TargetEncoder will be of dtype category df["variable"] = df["variable"].astype("category") df_appended["variable"] = df_appended["variable"].astype("category") expected = df_appended.copy() - expected["variable_enc"] = [0.666667, 0.666667, 0.333333, 0.50000, - 0.333333, 0.666667, 0.333333, 0.50000, - 0.50000, 0.50000, 0.333333] + expected["variable_enc"] = [ + 0.666667, + 0.666667, + 0.333333, + 0.50000, + 0.333333, + 0.666667, + 0.333333, + 0.50000, + 0.50000, + 0.50000, + 0.333333, + ] encoder = TargetEncoder(imputation_strategy="min") encoder.fit(data=df, column_names=["variable"], target_column="target") @@ -279,24 +455,46 @@ def test_target_encoder_transform_new_category_binary_classification(self): pd.testing.assert_frame_equal(actual, expected) def test_target_encoder_transform_new_category_linear_regression(self): - df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 'neutral', 'positive'], - 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]}) - - df_appended = df.append({"variable": "new", "target": 10}, - ignore_index=True) + df = pd.DataFrame( + { + "variable": [ + "positive", + "positive", + "negative", + "neutral", + "negative", + "positive", + "negative", + "neutral", + "neutral", + "neutral", + "positive", + ], + "target": [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4], + } + ) + + df_appended = df.append({"variable": "new", "target": 10}, ignore_index=True) # inputs of TargetEncoder will be of dtype category df["variable"] = df["variable"].astype("category") df_appended["variable"] = df_appended["variable"].astype("category") expected = df_appended.copy() - expected["variable_enc"] = [4.500000, 4.500000, -4.666667, 0.250000, - -4.666667, 4.500000, -4.666667, 0.250000, - 0.250000, 0.250000, 4.500000, - -4.666667] # min imputation for new value + expected["variable_enc"] = [ + 4.500000, + 4.500000, + -4.666667, + 0.250000, + -4.666667, + 4.500000, + -4.666667, + 0.250000, + 0.250000, + 0.250000, + 4.500000, + -4.666667, + ] # min imputation for new value encoder = TargetEncoder(imputation_strategy="min") encoder.fit(data=df, column_names=["variable"], target_column="target")