From ce322dfcada5ad0429e4953a56d854684c596ad0 Mon Sep 17 00:00:00 2001 From: Julie Alberge Date: Tue, 5 Dec 2023 17:10:03 +0100 Subject: [PATCH 01/13] Creating a new synthetic dataset. For now, the censoring method does not depend of the covariates. --- examples/complex_synthetic_data.py | 121 +++++++++++++++++++++++++++ examples/test_complex_data.py | 104 +++++++++++++++++++++++ hazardous/data/_competing_weibull.py | 10 +-- 3 files changed, 230 insertions(+), 5 deletions(-) create mode 100644 examples/complex_synthetic_data.py create mode 100644 examples/test_complex_data.py diff --git a/examples/complex_synthetic_data.py b/examples/complex_synthetic_data.py new file mode 100644 index 0000000..e8e25df --- /dev/null +++ b/examples/complex_synthetic_data.py @@ -0,0 +1,121 @@ +""" +================================================== +Synthetic DataSet +================================================== + +Trying to complexy synthetic data +""" +# %% +import numpy as np +from sklearn.preprocessing import PolynomialFeatures +import pandas as pd +from scipy.stats import weibull_min + +rng = np.random.default_rng(0) +seed = 0 +DEFAULT_SHAPE_RANGES = ( + (0.7, 0.9), + (1.0, 1.0), + (2.0, 3.0), +) + +DEFAULT_SCALE_RANGES = ( + (1, 20), + (1, 10), + (1.5, 5), +) + +n_events = 3 +n_weibull_parameters = 2 * n_events +n_samples = 3_000 +base_scale = 1_000 +n_features = 10 +features_rate = 0.3 +degree_interaction = 2 +relative_scale = 1.5 + +assert len(DEFAULT_SCALE_RANGES) == len(DEFAULT_SHAPE_RANGES) +assert len(DEFAULT_SHAPE_RANGES) == n_events + + +def complex_data( + n_events=n_events, + n_weibull_parameters=2 * n_events, + n_samples=3_000, + base_scale=1_000, + n_features=10, + features_rate=0.3, + degree_interaction=2, + relative_scale=1.5, +): + # Create features given to the model as X and then creating the interactions + df_features = pd.DataFrame(np.random.randn(n_samples, n_features)) + df_features.columns = [f"feature_{i}" for i in range(n_features)] + + # Adding interactions between features + df_poly_features = PolynomialFeatures( + degree=degree_interaction, interaction_only=True, include_bias=False + ) + df_poly_features.set_output(transform="pandas") + df_trans = df_poly_features.fit_transform(df_features) + + # Create masked matrix with the interactions + w_star = np.random.randn(df_trans.shape[1], n_weibull_parameters) + # Set 1-feature_rate% of the w_star to 0 + w_star = np.where( + np.random.rand(w_star.shape[0], w_star.shape[1]) < features_rate, 0, w_star + ) + # Computation of the true values of shape and scale + shape_scale_star = df_trans.values @ w_star + shape_scale_columns = [f"shape_{i}" for i in range(n_events)] + [ + f"scale_{i}" for i in range(n_events) + ] + df_shape_scale_star = pd.DataFrame(shape_scale_star, columns=shape_scale_columns) + + # Rescaling of these values to stay in the chosen range + for event, (scale_default, shape_default) in enumerate( + zip(DEFAULT_SCALE_RANGES, DEFAULT_SHAPE_RANGES) + ): + shape_min, shape_max = shape_default + scale_min, scale_max = scale_default + shape = df_shape_scale_star[f"shape_{event}"].copy() + scale = df_shape_scale_star[f"scale_{event}"].copy() + shape = shape_min + (shape_max - shape_min) * (shape - shape.min()) / ( + shape.max() - shape.min() + ) + scale = scale_min + (scale_max - scale_min) * (scale - scale.min()) / ( + scale.max() - scale.min() + ) + df_shape_scale_star[f"shape_{event}"] = shape + df_shape_scale_star[f"scale_{event}"] = scale + + # Throw durations from a weibull distribution with scale and shape as 
the parameters + event_durations = [] + for event in range(n_events): + shape = df_shape_scale_star[f"shape_{event}"] + scale = df_shape_scale_star[f"scale_{event}"] + durations = weibull_min.rvs(shape, scale=scale * base_scale, random_state=rng) + event_durations.append(durations) + + # Creating the target tabular + event_durations = np.asarray(event_durations) + duration_argmin = np.argmin(event_durations, axis=0) + y = pd.DataFrame( + dict( + event=duration_argmin + 1, + duration=event_durations[duration_argmin, np.arange(n_samples)], + ) + ) + + # censoring is independant from the covariates for now + if relative_scale > 0: + scale = relative_scale * y["duration"].mean() + censoring = weibull_min.rvs(1, scale=scale, size=y.shape[0], random_state=rng) + y = y.copy() + y["event"] = np.where(y["duration"] < censoring, y["event"], 0) + y["duration"] = np.minimum(y["duration"], censoring) + + return df_features, y + + +# %% diff --git a/examples/test_complex_data.py b/examples/test_complex_data.py new file mode 100644 index 0000000..726d4c7 --- /dev/null +++ b/examples/test_complex_data.py @@ -0,0 +1,104 @@ +# %% +import numpy as np +from complex_synthetic_data import complex_data +from time import perf_counter +import matplotlib.pyplot as plt + +from hazardous import GradientBoostingIncidence +from lifelines import AalenJohansenFitter + +rng = np.random.default_rng(0) +seed = 0 +DEFAULT_SHAPE_RANGES = ( + (0.7, 0.9), + (1.0, 1.0), + (2.0, 3.0), +) + +DEFAULT_SCALE_RANGES = ( + (1, 20), + (1, 10), + (1.5, 5), +) +n_events = 3 + +# %% + +X, y = complex_data() +n_samples = len(X) +calculate_variance = n_samples <= 5_000 +aj = AalenJohansenFitter(calculate_variance=calculate_variance, seed=0) + +gb_incidence = GradientBoostingIncidence( + learning_rate=0.03, + n_iter=200, + max_leaf_nodes=5, + hard_zero_fraction=0.1, + min_samples_leaf=50, + loss="ibs", + show_progressbar=False, + random_state=0, +) + +# %% +# +# CIFs estimated on uncensored data +# --------------------------------- +# +# Let's now estimate the CIFs on uncensored data and plot them against the +# theoretical CIFs: + + +def plot_cumulative_incidence_functions(X, y, gb_incidence=None, aj=None): + n_events = y["event"].max() + t_max = y["duration"].max() + _, axes = plt.subplots(figsize=(12, 4), ncols=n_events, sharey=True) + # Compute the estimate of the CIFs on a coarse grid. 
+ coarse_timegrid = np.linspace(0, t_max, num=100) + censoring_fraction = (y["event"] == 0).mean() + plt.suptitle( + "Cause-specific cumulative incidence functions" + f" ({censoring_fraction:.1%} censoring)" + ) + + for event_id, ax in enumerate(axes, 1): + if gb_incidence is not None: + tic = perf_counter() + gb_incidence.set_params(event_of_interest=event_id) + gb_incidence.fit(X, y) + duration = perf_counter() - tic + print(f"GB Incidence for event {event_id} fit in {duration:.3f} s") + tic = perf_counter() + cif_pred = gb_incidence.predict_cumulative_incidence( + X[0:1], coarse_timegrid + )[0] + duration = perf_counter() - tic + print(f"GB Incidence for event {event_id} prediction in {duration:.3f} s") + ax.plot( + coarse_timegrid, + cif_pred, + label="GradientBoostingIncidence", + ) + ax.set(title=f"Event {event_id}") + + if aj is not None: + tic = perf_counter() + aj.fit(y["duration"], y["event"], event_of_interest=event_id) + duration = perf_counter() - tic + print(f"Aalen-Johansen for event {event_id} fit in {duration:.3f} s") + aj.plot(label="Aalen-Johansen", ax=ax) + + if event_id == 1: + ax.legend(loc="lower right") + else: + ax.legend().remove() + + +# %% + +plot_cumulative_incidence_functions( + X, + y, + gb_incidence=gb_incidence, + aj=aj, +) diff --git a/hazardous/data/_competing_weibull.py b/hazardous/data/_competing_weibull.py index 2b15f9c..b1656b5 100644 --- a/hazardous/data/_competing_weibull.py +++ b/hazardous/data/_competing_weibull.py @@ -7,9 +7,9 @@ from sklearn.utils import check_random_state DEFAULT_SHAPE_RANGES = ( - (0.4, 0.9), + (0.7, 0.9), (1.0, 1.0), - (1.2, 3), + (2.0, 3.0), ) DEFAULT_SCALE_RANGES = ( @@ -19,7 +19,7 @@ ) -def _censor(y, relative_scale, random_state=None): +def _censor(y, relative_scale, random_state=0): if relative_scale == 0 or relative_scale is None: return y @@ -34,7 +34,7 @@ def _censor(y, relative_scale, random_state=None): def make_synthetic_competing_weibull( n_events=3, - n_samples=3000, + n_samples=3_000, return_X_y=False, base_scale=1_000, feature_rounding=2, @@ -101,4 +101,4 @@ def make_synthetic_competing_weibull( return X, y frame = pd.concat([X, y], axis=1) - return Bunch(data=frame[X.columns], target=X[y.columns], frame=frame) + return Bunch(data=frame[X.columns], target=frame[y.columns], frame=frame) From ab5d372829c4bf531695d5a9929a5f5dc319ee92 Mon Sep 17 00:00:00 2001 From: Julie Alberge Date: Fri, 8 Dec 2023 16:05:33 +0100 Subject: [PATCH 02/13] adding censoring depending on the covariates --- examples/complex_synthetic_data.py | 114 ++++++++++++++++++++++------- examples/test_complex_data.py | 37 ++++++++-- 2 files changed, 118 insertions(+), 33 deletions(-) diff --git a/examples/complex_synthetic_data.py b/examples/complex_synthetic_data.py index e8e25df..5826b5e 100644 --- a/examples/complex_synthetic_data.py +++ b/examples/complex_synthetic_data.py @@ -5,7 +5,11 @@ Trying to complexy synthetic data """ -# %% + +## TODO +# Change the linear scaling by an expit rescaling +# Improve censoring method by using covariates + import numpy as np from sklearn.preprocessing import PolynomialFeatures import pandas as pd @@ -38,20 +42,9 @@ assert len(DEFAULT_SHAPE_RANGES) == n_events -def complex_data( - n_events=n_events, - n_weibull_parameters=2 * n_events, - n_samples=3_000, - base_scale=1_000, - n_features=10, - features_rate=0.3, - degree_interaction=2, - relative_scale=1.5, +def compute_shape_and_scale( + df_features, n_weibull_parameters, features_rate, n_events, degree_interaction ): - # Create features given to the model 
as X and then creating the interactions - df_features = pd.DataFrame(np.random.randn(n_samples, n_features)) - df_features.columns = [f"feature_{i}" for i in range(n_features)] - # Adding interactions between features df_poly_features = PolynomialFeatures( degree=degree_interaction, interaction_only=True, include_bias=False @@ -63,7 +56,7 @@ def complex_data( w_star = np.random.randn(df_trans.shape[1], n_weibull_parameters) # Set 1-feature_rate% of the w_star to 0 w_star = np.where( - np.random.rand(w_star.shape[0], w_star.shape[1]) < features_rate, 0, w_star + np.random.rand(w_star.shape[0], w_star.shape[1]) < features_rate, w_star, 0 ) # Computation of the true values of shape and scale shape_scale_star = df_trans.values @ w_star @@ -71,7 +64,14 @@ def complex_data( f"scale_{i}" for i in range(n_events) ] df_shape_scale_star = pd.DataFrame(shape_scale_star, columns=shape_scale_columns) + # Rescaling of these values to stay in the chosen range + df_shape_scale_star = rescaling_params_to_respect_default_ranges( + df_shape_scale_star + ) + return df_shape_scale_star + +def rescaling_params_to_respect_default_ranges(df_shape_scale_star): # Rescaling of these values to stay in the chosen range for event, (scale_default, shape_default) in enumerate( zip(DEFAULT_SCALE_RANGES, DEFAULT_SHAPE_RANGES) @@ -89,6 +89,67 @@ def complex_data( df_shape_scale_star[f"shape_{event}"] = shape df_shape_scale_star[f"scale_{event}"] = scale + return df_shape_scale_star + + +def censor_data( + y, relative_scale, independant=True, X=None, features_censoring_rate=0.2 +): + if relative_scale == 0 or relative_scale is None: + return y + + if independant: + scale_censoring = relative_scale * y["duration"].mean() + else: + features_impact_censoring = np.abs(np.random.randn(X.shape[1], 1)) + features_impact_censoring = np.where( + np.random.rand( + features_impact_censoring.shape[0], features_impact_censoring.shape[1] + ) + < features_censoring_rate, + features_impact_censoring, + 0, + ) + scale_censoring = (X @ features_impact_censoring).values.flatten() + scale_censoring = ( + ( + 1 + + (scale_censoring - scale_censoring.min()) + / (scale_censoring.max() - scale_censoring.min()) + ) + * relative_scale + * y["duration"].mean() + ) + + censoring = weibull_min.rvs( + 1, scale=scale_censoring, size=y.shape[0], random_state=rng + ) + y = y.copy() + y["event"] = np.where(y["duration"] < censoring, y["event"], 0) + y["duration"] = np.minimum(y["duration"], censoring) + return y + + +def complex_data( + n_events=n_events, + n_weibull_parameters=2 * n_events, + n_samples=3_000, + base_scale=1_000, + n_features=10, + features_rate=0.3, + degree_interaction=2, + relative_scale=1.5, + independant=True, + features_censoring_rate=0.2, + return_uncensored_data=False, +): + # Create features given to the model as X and then creating the interactions + df_features = pd.DataFrame(np.random.randn(n_samples, n_features)) + df_features.columns = [f"feature_{i}" for i in range(n_features)] + + df_shape_scale_star = compute_shape_and_scale( + df_features, n_weibull_parameters, features_rate, n_events, degree_interaction + ) # Throw durations from a weibull distribution with scale and shape as the parameters event_durations = [] for event in range(n_events): @@ -106,16 +167,13 @@ def complex_data( duration=event_durations[duration_argmin, np.arange(n_samples)], ) ) - - # censoring is independant from the covariates for now - if relative_scale > 0: - scale = relative_scale * y["duration"].mean() - censoring = weibull_min.rvs(1, scale=scale, 
size=y.shape[0], random_state=rng) - y = y.copy() - y["event"] = np.where(y["duration"] < censoring, y["event"], 0) - y["duration"] = np.minimum(y["duration"], censoring) - - return df_features, y - - -# %% + y_censored = censor_data( + y, + relative_scale, + independant=independant, + X=df_features, + features_censoring_rate=features_censoring_rate, + ) + if return_uncensored_data: + return df_features, y_censored, y + return df_features, y_censored diff --git a/examples/test_complex_data.py b/examples/test_complex_data.py index 726d4c7..9bdfc5d 100644 --- a/examples/test_complex_data.py +++ b/examples/test_complex_data.py @@ -3,6 +3,7 @@ from complex_synthetic_data import complex_data from time import perf_counter import matplotlib.pyplot as plt +from sklearn.model_selection import train_test_split from hazardous import GradientBoostingIncidence from lifelines import AalenJohansenFitter @@ -24,8 +25,22 @@ # %% -X, y = complex_data() -n_samples = len(X) +X, y = complex_data( + n_events=n_events, + n_weibull_parameters=2 * n_events, + n_samples=10_000, + base_scale=1_000, + n_features=10, + features_rate=0.3, + degree_interaction=2, + relative_scale=0.95, + independant=False, + features_censoring_rate=0.1, +) + +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) + +n_samples = len(X_train) calculate_variance = n_samples <= 5_000 aj = AalenJohansenFitter(calculate_variance=calculate_variance, seed=0) @@ -40,6 +55,7 @@ random_state=0, ) +y["event"].value_counts() # %% # # CIFs estimated on uncensored data @@ -49,7 +65,9 @@ # theoretical CIFs: -def plot_cumulative_incidence_functions(X, y, gb_incidence=None, aj=None): +def plot_cumulative_incidence_functions( + X, y, gb_incidence=None, aj=None, X_test=None, y_test=None +): n_events = y["event"].max() t_max = y["duration"].max() _, axes = plt.subplots(figsize=(12, 4), ncols=n_events, sharey=True) @@ -74,6 +92,11 @@ def plot_cumulative_incidence_functions(X, y, gb_incidence=None, aj=None): )[0] duration = perf_counter() - tic print(f"GB Incidence for event {event_id} prediction in {duration:.3f} s") + print("Brier score on training data:", gb_incidence.score(X, y)) + if X_test is not None: + print( + "Brier score on testing data:", gb_incidence.score(X_test, y_test) + ) ax.plot( coarse_timegrid, cif_pred, @@ -97,8 +120,12 @@ def plot_cumulative_incidence_functions(X, y, gb_incidence=None, aj=None): # %% plot_cumulative_incidence_functions( - X, - y, + X_train, + y_train, gb_incidence=gb_incidence, aj=aj, + X_test=X_test, + y_test=y_test, ) + +# %% From b0380900f633552988f173b44bb97de73e1c3327 Mon Sep 17 00:00:00 2001 From: Julie Alberge Date: Mon, 11 Dec 2023 17:38:57 +0100 Subject: [PATCH 03/13] Synthetic dataset with expit --- examples/complex_synthetic_data.py | 85 +++++++++++++++--------------- examples/test_complex_data.py | 55 +++++++++++-------- 2 files changed, 77 insertions(+), 63 deletions(-) diff --git a/examples/complex_synthetic_data.py b/examples/complex_synthetic_data.py index 5826b5e..de35621 100644 --- a/examples/complex_synthetic_data.py +++ b/examples/complex_synthetic_data.py @@ -14,9 +14,11 @@ from sklearn.preprocessing import PolynomialFeatures import pandas as pd from scipy.stats import weibull_min +from scipy.special import expit -rng = np.random.default_rng(0) seed = 0 +rng = np.random.RandomState(seed) + DEFAULT_SHAPE_RANGES = ( (0.7, 0.9), (1.0, 1.0), @@ -53,10 +55,10 @@ def compute_shape_and_scale( df_trans = df_poly_features.fit_transform(df_features) # Create masked matrix with the 
interactions - w_star = np.random.randn(df_trans.shape[1], n_weibull_parameters) + w_star = rng.randn(df_trans.shape[1], n_weibull_parameters) # Set 1-feature_rate% of the w_star to 0 w_star = np.where( - np.random.rand(w_star.shape[0], w_star.shape[1]) < features_rate, w_star, 0 + rng.rand(w_star.shape[0], w_star.shape[1]) < features_rate, w_star, 0 ) # Computation of the true values of shape and scale shape_scale_star = df_trans.values @ w_star @@ -64,70 +66,69 @@ def compute_shape_and_scale( f"scale_{i}" for i in range(n_events) ] df_shape_scale_star = pd.DataFrame(shape_scale_star, columns=shape_scale_columns) - # Rescaling of these values to stay in the chosen range - df_shape_scale_star = rescaling_params_to_respect_default_ranges( - df_shape_scale_star - ) - return df_shape_scale_star - - -def rescaling_params_to_respect_default_ranges(df_shape_scale_star): # Rescaling of these values to stay in the chosen range for event, (scale_default, shape_default) in enumerate( zip(DEFAULT_SCALE_RANGES, DEFAULT_SHAPE_RANGES) ): - shape_min, shape_max = shape_default - scale_min, scale_max = scale_default - shape = df_shape_scale_star[f"shape_{event}"].copy() - scale = df_shape_scale_star[f"scale_{event}"].copy() - shape = shape_min + (shape_max - shape_min) * (shape - shape.min()) / ( - shape.max() - shape.min() + df_shape_scale_star = rescaling_params_to_respect_default_ranges( + df_shape_scale_star, shape_default, scale_default, event ) - scale = scale_min + (scale_max - scale_min) * (scale - scale.min()) / ( - scale.max() - scale.min() - ) - df_shape_scale_star[f"shape_{event}"] = shape - df_shape_scale_star[f"scale_{event}"] = scale + return df_shape_scale_star + +def rescaling_params_to_respect_default_ranges( + df_shape_scale_star, shape_default, scale_default, event +): + # Rescaling of these values to stay in the chosen range + shape_min, shape_max = shape_default + scale_min, scale_max = scale_default + shape = df_shape_scale_star[f"shape_{event}"].copy() + scale = df_shape_scale_star[f"scale_{event}"].copy() + df_shape_scale_star[f"shape_{event}"] = shape_min + (shape_max - shape_min) * expit( + shape + ) + df_shape_scale_star[f"scale_{event}"] = scale_min + (scale_max - scale_min) * expit( + scale + ) return df_shape_scale_star def censor_data( - y, relative_scale, independant=True, X=None, features_censoring_rate=0.2 + y, independant=True, X=None, features_censoring_rate=0.2, relative_scale=1.5 ): if relative_scale == 0 or relative_scale is None: return y if independant: scale_censoring = relative_scale * y["duration"].mean() + shape_censoring = 1 else: - features_impact_censoring = np.abs(np.random.randn(X.shape[1], 1)) + features_impact_censoring = rng.randn(X.shape[1], 2) features_impact_censoring = np.where( - np.random.rand( + rng.rand( features_impact_censoring.shape[0], features_impact_censoring.shape[1] ) < features_censoring_rate, features_impact_censoring, 0, ) - scale_censoring = (X @ features_impact_censoring).values.flatten() - scale_censoring = ( - ( - 1 - + (scale_censoring - scale_censoring.min()) - / (scale_censoring.max() - scale_censoring.min()) - ) - * relative_scale - * y["duration"].mean() + df_censoring_params = relative_scale * X @ features_impact_censoring + df_censoring_params.columns = ["shape_0", "scale_0"] + df_censoring_params = rescaling_params_to_respect_default_ranges( + df_censoring_params, + shape_default=(0.5, 0.9), + scale_default=(10, 15), + event=0, ) - + scale_censoring = df_censoring_params["scale_0"].values * y["duration"].mean() + 
shape_censoring = df_censoring_params["shape_0"].values censoring = weibull_min.rvs( - 1, scale=scale_censoring, size=y.shape[0], random_state=rng + shape_censoring, scale=scale_censoring, size=y.shape[0], random_state=rng ) - y = y.copy() - y["event"] = np.where(y["duration"] < censoring, y["event"], 0) - y["duration"] = np.minimum(y["duration"], censoring) - return y + y_censored = y.copy() + y_censored["event"] = np.where(y["duration"] < censoring, y_censored["event"], 0) + y_censored["duration"] = np.minimum(y_censored["duration"], censoring) + return y_censored def complex_data( @@ -144,7 +145,7 @@ def complex_data( return_uncensored_data=False, ): # Create features given to the model as X and then creating the interactions - df_features = pd.DataFrame(np.random.randn(n_samples, n_features)) + df_features = pd.DataFrame(rng.randn(n_samples, n_features)) df_features.columns = [f"feature_{i}" for i in range(n_features)] df_shape_scale_star = compute_shape_and_scale( @@ -169,7 +170,7 @@ def complex_data( ) y_censored = censor_data( y, - relative_scale, + relative_scale=relative_scale, independant=independant, X=df_features, features_censoring_rate=features_censoring_rate, diff --git a/examples/test_complex_data.py b/examples/test_complex_data.py index 9bdfc5d..910aec2 100644 --- a/examples/test_complex_data.py +++ b/examples/test_complex_data.py @@ -8,8 +8,8 @@ from hazardous import GradientBoostingIncidence from lifelines import AalenJohansenFitter -rng = np.random.default_rng(0) seed = 0 +rng = np.random.RandomState(seed) DEFAULT_SHAPE_RANGES = ( (0.7, 0.9), (1.0, 1.0), @@ -25,37 +25,25 @@ # %% -X, y = complex_data( +X, y_censored, y_uncensored = complex_data( n_events=n_events, n_weibull_parameters=2 * n_events, - n_samples=10_000, + n_samples=30_000, base_scale=1_000, n_features=10, - features_rate=0.3, + features_rate=0.5, degree_interaction=2, relative_scale=0.95, independant=False, - features_censoring_rate=0.1, + features_censoring_rate=0.2, + return_uncensored_data=True, ) -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) -n_samples = len(X_train) +n_samples = len(X) calculate_variance = n_samples <= 5_000 aj = AalenJohansenFitter(calculate_variance=calculate_variance, seed=0) -gb_incidence = GradientBoostingIncidence( - learning_rate=0.03, - n_iter=200, - max_leaf_nodes=5, - hard_zero_fraction=0.1, - min_samples_leaf=50, - loss="ibs", - show_progressbar=False, - random_state=0, -) - -y["event"].value_counts() # %% # # CIFs estimated on uncensored data @@ -110,6 +98,7 @@ def plot_cumulative_incidence_functions( duration = perf_counter() - tic print(f"Aalen-Johansen for event {event_id} fit in {duration:.3f} s") aj.plot(label="Aalen-Johansen", ax=ax) + print(aj.cumulative_density_.values[-1]) if event_id == 1: ax.legend(loc="lower right") @@ -119,13 +108,37 @@ def plot_cumulative_incidence_functions( # %% +X_train, X_test, y_train_c, y_test_c = train_test_split( + X, y_censored, test_size=0.3, random_state=seed +) +y_train_u = y_uncensored.loc[y_train_c.index] +y_test_u = y_uncensored.loc[y_test_c.index] +gb_incidence = GradientBoostingIncidence( + learning_rate=0.05, + n_iter=1_000, + max_leaf_nodes=50, + hard_zero_fraction=0.1, + min_samples_leaf=5, + loss="inll", + show_progressbar=False, + random_state=seed, +) + plot_cumulative_incidence_functions( X_train, - y_train, + y_train_u, gb_incidence=gb_incidence, aj=aj, X_test=X_test, - y_test=y_test, + y_test=y_test_u, ) +plot_cumulative_incidence_functions( + X_train, + y_train_c, + 
gb_incidence=gb_incidence, + aj=aj, + X_test=X_test, + y_test=y_test_c, +) # %% From 4388bd5b527990f8af8b39725d0f5fa4dc78f893 Mon Sep 17 00:00:00 2001 From: Julie Alberge Date: Tue, 12 Dec 2023 13:58:04 +0100 Subject: [PATCH 04/13] refactoring the code, adding histograms. --- examples/complex_synthetic_data.py | 180 -------------- ...t_complex_data.py => plot_complex_data.py} | 61 +++-- hazardous/data/__init__.py | 4 +- hazardous/data/_competing_weibull.py | 233 ++++++++++++++++-- 4 files changed, 259 insertions(+), 219 deletions(-) delete mode 100644 examples/complex_synthetic_data.py rename examples/{test_complex_data.py => plot_complex_data.py} (73%) diff --git a/examples/complex_synthetic_data.py b/examples/complex_synthetic_data.py deleted file mode 100644 index de35621..0000000 --- a/examples/complex_synthetic_data.py +++ /dev/null @@ -1,180 +0,0 @@ -""" -================================================== -Synthetic DataSet -================================================== - -Trying to complexy synthetic data -""" - -## TODO -# Change the linear scaling by an expit rescaling -# Improve censoring method by using covariates - -import numpy as np -from sklearn.preprocessing import PolynomialFeatures -import pandas as pd -from scipy.stats import weibull_min -from scipy.special import expit - -seed = 0 -rng = np.random.RandomState(seed) - -DEFAULT_SHAPE_RANGES = ( - (0.7, 0.9), - (1.0, 1.0), - (2.0, 3.0), -) - -DEFAULT_SCALE_RANGES = ( - (1, 20), - (1, 10), - (1.5, 5), -) - -n_events = 3 -n_weibull_parameters = 2 * n_events -n_samples = 3_000 -base_scale = 1_000 -n_features = 10 -features_rate = 0.3 -degree_interaction = 2 -relative_scale = 1.5 - -assert len(DEFAULT_SCALE_RANGES) == len(DEFAULT_SHAPE_RANGES) -assert len(DEFAULT_SHAPE_RANGES) == n_events - - -def compute_shape_and_scale( - df_features, n_weibull_parameters, features_rate, n_events, degree_interaction -): - # Adding interactions between features - df_poly_features = PolynomialFeatures( - degree=degree_interaction, interaction_only=True, include_bias=False - ) - df_poly_features.set_output(transform="pandas") - df_trans = df_poly_features.fit_transform(df_features) - - # Create masked matrix with the interactions - w_star = rng.randn(df_trans.shape[1], n_weibull_parameters) - # Set 1-feature_rate% of the w_star to 0 - w_star = np.where( - rng.rand(w_star.shape[0], w_star.shape[1]) < features_rate, w_star, 0 - ) - # Computation of the true values of shape and scale - shape_scale_star = df_trans.values @ w_star - shape_scale_columns = [f"shape_{i}" for i in range(n_events)] + [ - f"scale_{i}" for i in range(n_events) - ] - df_shape_scale_star = pd.DataFrame(shape_scale_star, columns=shape_scale_columns) - # Rescaling of these values to stay in the chosen range - for event, (scale_default, shape_default) in enumerate( - zip(DEFAULT_SCALE_RANGES, DEFAULT_SHAPE_RANGES) - ): - df_shape_scale_star = rescaling_params_to_respect_default_ranges( - df_shape_scale_star, shape_default, scale_default, event - ) - return df_shape_scale_star - - -def rescaling_params_to_respect_default_ranges( - df_shape_scale_star, shape_default, scale_default, event -): - # Rescaling of these values to stay in the chosen range - shape_min, shape_max = shape_default - scale_min, scale_max = scale_default - shape = df_shape_scale_star[f"shape_{event}"].copy() - scale = df_shape_scale_star[f"scale_{event}"].copy() - df_shape_scale_star[f"shape_{event}"] = shape_min + (shape_max - shape_min) * expit( - shape - ) - df_shape_scale_star[f"scale_{event}"] = 
scale_min + (scale_max - scale_min) * expit( - scale - ) - return df_shape_scale_star - - -def censor_data( - y, independant=True, X=None, features_censoring_rate=0.2, relative_scale=1.5 -): - if relative_scale == 0 or relative_scale is None: - return y - - if independant: - scale_censoring = relative_scale * y["duration"].mean() - shape_censoring = 1 - else: - features_impact_censoring = rng.randn(X.shape[1], 2) - features_impact_censoring = np.where( - rng.rand( - features_impact_censoring.shape[0], features_impact_censoring.shape[1] - ) - < features_censoring_rate, - features_impact_censoring, - 0, - ) - df_censoring_params = relative_scale * X @ features_impact_censoring - df_censoring_params.columns = ["shape_0", "scale_0"] - df_censoring_params = rescaling_params_to_respect_default_ranges( - df_censoring_params, - shape_default=(0.5, 0.9), - scale_default=(10, 15), - event=0, - ) - scale_censoring = df_censoring_params["scale_0"].values * y["duration"].mean() - shape_censoring = df_censoring_params["shape_0"].values - censoring = weibull_min.rvs( - shape_censoring, scale=scale_censoring, size=y.shape[0], random_state=rng - ) - y_censored = y.copy() - y_censored["event"] = np.where(y["duration"] < censoring, y_censored["event"], 0) - y_censored["duration"] = np.minimum(y_censored["duration"], censoring) - return y_censored - - -def complex_data( - n_events=n_events, - n_weibull_parameters=2 * n_events, - n_samples=3_000, - base_scale=1_000, - n_features=10, - features_rate=0.3, - degree_interaction=2, - relative_scale=1.5, - independant=True, - features_censoring_rate=0.2, - return_uncensored_data=False, -): - # Create features given to the model as X and then creating the interactions - df_features = pd.DataFrame(rng.randn(n_samples, n_features)) - df_features.columns = [f"feature_{i}" for i in range(n_features)] - - df_shape_scale_star = compute_shape_and_scale( - df_features, n_weibull_parameters, features_rate, n_events, degree_interaction - ) - # Throw durations from a weibull distribution with scale and shape as the parameters - event_durations = [] - for event in range(n_events): - shape = df_shape_scale_star[f"shape_{event}"] - scale = df_shape_scale_star[f"scale_{event}"] - durations = weibull_min.rvs(shape, scale=scale * base_scale, random_state=rng) - event_durations.append(durations) - - # Creating the target tabular - event_durations = np.asarray(event_durations) - duration_argmin = np.argmin(event_durations, axis=0) - y = pd.DataFrame( - dict( - event=duration_argmin + 1, - duration=event_durations[duration_argmin, np.arange(n_samples)], - ) - ) - y_censored = censor_data( - y, - relative_scale=relative_scale, - independant=independant, - X=df_features, - features_censoring_rate=features_censoring_rate, - ) - if return_uncensored_data: - return df_features, y_censored, y - return df_features, y_censored diff --git a/examples/test_complex_data.py b/examples/plot_complex_data.py similarity index 73% rename from examples/test_complex_data.py rename to examples/plot_complex_data.py index 910aec2..259379c 100644 --- a/examples/test_complex_data.py +++ b/examples/plot_complex_data.py @@ -1,9 +1,11 @@ # %% import numpy as np -from complex_synthetic_data import complex_data +import hazardous.data._competing_weibull as competing_w from time import perf_counter import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split +import pandas as pd +import seaborn as sns from hazardous import GradientBoostingIncidence from lifelines import AalenJohansenFitter @@ 
-23,27 +25,59 @@ ) n_events = 3 + # %% +# In the following cell, we verify that the synthetic dataset is well defined. + +rng = np.random.RandomState(seed) +df_features = pd.DataFrame(rng.randn(100_000, 10)) +df_features.columns = [f"feature_{i}" for i in range(10)] + +df_shape_scale_star = competing_w.compute_shape_and_scale( + df_features, + 6, + 0.3, + n_events, + 2, + DEFAULT_SHAPE_RANGES, + DEFAULT_SCALE_RANGES, + 0, +) +fig, axes = plt.subplots(2, 3, figsize=(10, 7)) +for idx, col in enumerate(df_shape_scale_star.columns): + sns.histplot(df_shape_scale_star[col], ax=axes[idx // 3][idx % 3]) + -X, y_censored, y_uncensored = complex_data( +# %% + +X, y_censored, y_uncensored = competing_w.make_synthetic_dataset( n_events=n_events, n_weibull_parameters=2 * n_events, - n_samples=30_000, + n_samples=3_000, base_scale=1_000, n_features=10, features_rate=0.5, degree_interaction=2, - relative_scale=0.95, independant=False, features_censoring_rate=0.2, return_uncensored_data=True, + return_X_y=True, + feature_rounding=4, + target_rounding=1, + shape_ranges=DEFAULT_SHAPE_RANGES, + scale_ranges=DEFAULT_SCALE_RANGES, + censoring_relative_scale=0.1, + random_state=0, + complex_features=True, ) - n_samples = len(X) calculate_variance = n_samples <= 5_000 aj = AalenJohansenFitter(calculate_variance=calculate_variance, seed=0) +y_censored["event"].value_counts() + + # %% # # CIFs estimated on uncensored data @@ -75,9 +109,9 @@ def plot_cumulative_incidence_functions( duration = perf_counter() - tic print(f"GB Incidence for event {event_id} fit in {duration:.3f} s") tic = perf_counter() - cif_pred = gb_incidence.predict_cumulative_incidence( - X[0:1], coarse_timegrid - )[0] + cifs_pred = gb_incidence.predict_cumulative_incidence(X, coarse_timegrid) + print(cifs_pred.shape) + cif_mean = cifs_pred.mean(axis=0) duration = perf_counter() - tic print(f"GB Incidence for event {event_id} prediction in {duration:.3f} s") print("Brier score on training data:", gb_incidence.score(X, y)) @@ -87,7 +121,7 @@ def plot_cumulative_incidence_functions( ) ax.plot( coarse_timegrid, - cif_pred, + cif_mean, label="GradientBoostingIncidence", ) ax.set(title=f"Event {event_id}") @@ -114,12 +148,12 @@ def plot_cumulative_incidence_functions( y_train_u = y_uncensored.loc[y_train_c.index] y_test_u = y_uncensored.loc[y_test_c.index] gb_incidence = GradientBoostingIncidence( - learning_rate=0.05, - n_iter=1_000, - max_leaf_nodes=50, + learning_rate=0.1, + n_iter=20, + max_leaf_nodes=15, hard_zero_fraction=0.1, min_samples_leaf=5, - loss="inll", + loss="ibs", show_progressbar=False, random_state=seed, ) @@ -141,4 +175,3 @@ def plot_cumulative_incidence_functions( X_test=X_test, y_test=y_test_c, ) -# %% diff --git a/hazardous/data/__init__.py b/hazardous/data/__init__.py index 92a8cc2..458516d 100644 --- a/hazardous/data/__init__.py +++ b/hazardous/data/__init__.py @@ -1,3 +1,3 @@ -from ._competing_weibull import make_synthetic_competing_weibull +# from ._competing_weibull import make_synthetic_competing_weibull -__all__ = ["make_synthetic_competing_weibull"] +# __all__ = ["make_synthetic_dataset"] diff --git a/hazardous/data/_competing_weibull.py b/hazardous/data/_competing_weibull.py index b1656b5..3edfe8e 100644 --- a/hazardous/data/_competing_weibull.py +++ b/hazardous/data/_competing_weibull.py @@ -2,8 +2,10 @@ import numpy as np import pandas as pd +from scipy.special import expit from scipy.stats import weibull_min from sklearn.datasets._base import Bunch +from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures 
from sklearn.utils import check_random_state DEFAULT_SHAPE_RANGES = ( @@ -19,29 +21,118 @@ ) -def _censor(y, relative_scale, random_state=0): - if relative_scale == 0 or relative_scale is None: +def _censor( + y, + independant=True, + X=None, + features_censoring_rate=0.2, + censoring_relative_scale=1.5, + random_state=0, +): + rng = np.random.RandomState(random_state) + if censoring_relative_scale == 0 or censoring_relative_scale is None: return y - rng = check_random_state(random_state) - scale = relative_scale * y["duration"].mean() - censoring = weibull_min.rvs(1, scale=scale, size=y.shape[0], random_state=rng) - y = y.copy() - y["event"] = np.where(y["duration"] < censoring, y["event"], 0) - y["duration"] = np.minimum(y["duration"], censoring) - return y + if independant: + scale_censoring = censoring_relative_scale * y["duration"].mean() + shape_censoring = 1 + else: + features_impact_censoring = rng.randn(X.shape[1], 2) + features_impact_censoring = np.where( + rng.rand( + features_impact_censoring.shape[0], features_impact_censoring.shape[1] + ) + < features_censoring_rate, + features_impact_censoring, + 0, + ) + df_censoring_params = censoring_relative_scale * X @ features_impact_censoring + df_censoring_params.columns = ["shape_0", "scale_0"] + df_censoring_params = rescaling_params_to_respect_default_ranges( + df_censoring_params, + shape_default=(1.0, 1.0), + scale_default=(1, 10 * censoring_relative_scale), + event=0, + ) + scale_censoring = df_censoring_params["scale_0"].values * y["duration"].mean() + shape_censoring = df_censoring_params["shape_0"].values + censoring = weibull_min.rvs( + shape_censoring, scale=scale_censoring, size=y.shape[0], random_state=rng + ) + y_censored = y.copy() + y_censored["event"] = np.where(y["duration"] < censoring, y_censored["event"], 0) + y_censored["duration"] = np.minimum(y_censored["duration"], censoring) + return y_censored -def make_synthetic_competing_weibull( +def compute_shape_and_scale( + df_features, + n_weibull_parameters=6, + features_rate=0.2, + n_events=3, + degree_interaction=2, + shape_ranges=DEFAULT_SHAPE_RANGES, + scale_ranges=DEFAULT_SCALE_RANGES, + random_state=0, +): + rng = np.random.RandomState(random_state) + # Adding interactions between features + df_poly_features = PolynomialFeatures( + degree=degree_interaction, interaction_only=True, include_bias=False + ) + df_poly_features.set_output(transform="pandas") + df_trans = df_poly_features.fit_transform(df_features) + + # Create masked matrix with the interactions + w_star = rng.randn(df_trans.shape[1], n_weibull_parameters) + # Set 1-feature_rate% of the w_star to 0 + w_star = np.where( + rng.rand(w_star.shape[0], w_star.shape[1]) < features_rate, w_star, 0 + ) + # Computation of the true values of shape and scale + shape_scale_star = df_trans.values @ w_star + shape_scale_columns = [f"shape_{i}" for i in range(n_events)] + [ + f"scale_{i}" for i in range(n_events) + ] + df_shape_scale_star = pd.DataFrame(shape_scale_star, columns=shape_scale_columns) + # Rescaling of these values to stay in the chosen range + for event, (scale_default, shape_default) in enumerate( + zip(scale_ranges, shape_ranges) + ): + df_shape_scale_star = rescaling_params_to_respect_default_ranges( + df_shape_scale_star, shape_default, scale_default, event + ) + return df_shape_scale_star + + +def rescaling_params_to_respect_default_ranges( + df_shape_scale_star, shape_default, scale_default, event +): + # Rescaling of these values to stay in the chosen range + shape_min, shape_max = 
shape_default + scale_min, scale_max = scale_default + shape = df_shape_scale_star[f"shape_{event}"].copy() + scale = df_shape_scale_star[f"scale_{event}"].copy() + scaler = MinMaxScaler(feature_range=(-2, 2)) + df_shape_scale_star[f"shape_{event}"] = ( + shape_min + + (shape_max - shape_min) + * expit(scaler.fit_transform(shape.values.reshape(-1, 1))) + ).flatten() + df_shape_scale_star[f"scale_{event}"] = ( + scale_min + + (scale_max - scale_min) + * expit(scaler.fit_transform(scale.values.reshape(-1, 1))).flatten() + ) + return df_shape_scale_star + + +def make_simple_features( n_events=3, n_samples=3_000, - return_X_y=False, base_scale=1_000, - feature_rounding=2, - target_rounding=1, shape_ranges=DEFAULT_SHAPE_RANGES, scale_ranges=DEFAULT_SCALE_RANGES, - censoring_relative_scale=1.5, random_state=None, ): """Generate a synthetic dataset with competing Weibull-distributed events. @@ -58,14 +149,6 @@ def make_synthetic_competing_weibull( individual, the event type with the shortest duration is kept as the target event (competing risks setting) and its event identifier and duration are returned as the target dataframe. - - A fraction of the individuals are censored by sampling a censoring time - from a Weibull distribution with shape 1 and scale equal to the mean - duration of the target event times the ``censoring_relative_scale``. - - Setting ``censoring_relative_scale`` to 0 or None disables censoring. - Setting it to a small value (e.g. 0.5 instead of 1.5) will result in a - larger fraction of censored individuals. """ rng = check_random_state(random_state) all_features = [] @@ -84,20 +167,124 @@ def make_synthetic_competing_weibull( event_durations = np.asarray(event_durations) duration_argmin = np.argmin(event_durations, axis=0) X = pd.concat(all_features, axis=1) + return X, event_durations, duration_argmin + + +def make_complex_features_with_sparse_matrix( + n_events=3, + n_weibull_parameters=6, + n_samples=3_000, + base_scale=1_000, + shape_ranges=DEFAULT_SHAPE_RANGES, + scale_ranges=DEFAULT_SCALE_RANGES, + n_features=10, + features_rate=0.3, + degree_interaction=2, + random_state=0, +): + rng = np.random.RandomState(random_state) + # Create features given to the model as X and then creating the interactions + df_features = pd.DataFrame(rng.randn(n_samples, n_features)) + df_features.columns = [f"feature_{i}" for i in range(n_features)] + + df_shape_scale_star = compute_shape_and_scale( + df_features, + n_weibull_parameters, + features_rate, + n_events, + degree_interaction, + shape_ranges, + scale_ranges, + random_state, + ) + # Throw durations from a weibull distribution with scale and shape as the parameters + event_durations = [] + for event in range(n_events): + shape = df_shape_scale_star[f"shape_{event}"] + scale = df_shape_scale_star[f"scale_{event}"] + durations = weibull_min.rvs(shape, scale=scale * base_scale, random_state=rng) + event_durations.append(durations) + + # Creating the target tabular + event_durations = np.asarray(event_durations) + duration_argmin = np.argmin(event_durations, axis=0) + return df_features, event_durations, duration_argmin + + +def make_synthetic_dataset( + n_events, + n_weibull_parameters, + n_samples=3_000, + base_scale=1_000, + n_features=10, + features_rate=0.3, + degree_interaction=2, + independant=True, + features_censoring_rate=0.2, + return_uncensored_data=False, + return_X_y=True, + feature_rounding=2, + target_rounding=1, + shape_ranges=DEFAULT_SHAPE_RANGES, + scale_ranges=DEFAULT_SCALE_RANGES, + censoring_relative_scale=1.5, + 
random_state=0, + complex_features=True, +): + """ + Creating a synthetic dataset to make competing risks. + Depending of the choice of the user, the function may genere a simple dataset + (setting ``complex_features`` to False or either create complex features.) + + A fraction of the individuals are censored by sampling a censoring time + from a Weibull distribution with shape 1 and scale equal to the mean + duration of the target event times the ``censoring_relative_scale``. + + Setting ``censoring_relative_scale`` to 0 or None disables censoring. + Setting it to a small value (e.g. 0.5 instead of 1.5) will result in a + larger fraction of censored individuals. + """ + if complex_features: + X, event_durations, duration_argmin = make_complex_features_with_sparse_matrix( + n_events, + n_weibull_parameters, + n_samples, + base_scale, + shape_ranges, + scale_ranges, + n_features, + features_rate, + degree_interaction, + random_state, + ) + else: + X, event_durations, duration_argmin = make_simple_features( + n_events, n_samples, base_scale, shape_ranges, scale_ranges, random_state + ) y = pd.DataFrame( dict( event=duration_argmin + 1, duration=event_durations[duration_argmin, np.arange(n_samples)], ) ) - y = _censor(y, censoring_relative_scale, random_state=random_state) + y_censored = _censor( + y, + censoring_relative_scale=censoring_relative_scale, + independant=independant, + X=X, + features_censoring_rate=features_censoring_rate, + random_state=random_state, + ) if feature_rounding is not None: X = X.round(feature_rounding) if target_rounding is not None: + y_censored = y_censored.round(target_rounding) y = y.round(target_rounding) if return_X_y: + if return_uncensored_data: + return X, y_censored, y return X, y frame = pd.concat([X, y], axis=1) From dbf1baa77a204e10da6066cae9e975817bc36f93 Mon Sep 17 00:00:00 2001 From: Julie Alberge Date: Wed, 13 Dec 2023 11:52:25 +0100 Subject: [PATCH 05/13] event well numeroted --- examples/plot_complex_data.py | 18 ++++++++++++++++-- hazardous/data/_competing_weibull.py | 12 ++++++------ 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/examples/plot_complex_data.py b/examples/plot_complex_data.py index 259379c..dc80505 100644 --- a/examples/plot_complex_data.py +++ b/examples/plot_complex_data.py @@ -77,6 +77,18 @@ y_censored["event"].value_counts() +gb_incidence = GradientBoostingIncidence( + learning_rate=0.1, + n_iter=20, + max_leaf_nodes=15, + hard_zero_fraction=0.1, + min_samples_leaf=5, + loss="ibs", + show_progressbar=False, + random_state=seed, +) +gb_incidence.fit(X, y_censored) + # %% # @@ -114,10 +126,10 @@ def plot_cumulative_incidence_functions( cif_mean = cifs_pred.mean(axis=0) duration = perf_counter() - tic print(f"GB Incidence for event {event_id} prediction in {duration:.3f} s") - print("Brier score on training data:", gb_incidence.score(X, y)) + print("Brier score on training data:", -gb_incidence.score(X, y)) if X_test is not None: print( - "Brier score on testing data:", gb_incidence.score(X_test, y_test) + "Brier score on testing data:", -gb_incidence.score(X_test, y_test) ) ax.plot( coarse_timegrid, @@ -175,3 +187,5 @@ def plot_cumulative_incidence_functions( X_test=X_test, y_test=y_test_c, ) + +# %% diff --git a/hazardous/data/_competing_weibull.py b/hazardous/data/_competing_weibull.py index 3edfe8e..83e06cd 100644 --- a/hazardous/data/_competing_weibull.py +++ b/hazardous/data/_competing_weibull.py @@ -91,16 +91,16 @@ def compute_shape_and_scale( ) # Computation of the true values of shape and scale 
shape_scale_star = df_trans.values @ w_star - shape_scale_columns = [f"shape_{i}" for i in range(n_events)] + [ - f"scale_{i}" for i in range(n_events) + shape_scale_columns = [f"shape_{i}" for i in range(1, n_events + 1)] + [ + f"scale_{i}" for i in range(1, n_events + 1) ] df_shape_scale_star = pd.DataFrame(shape_scale_star, columns=shape_scale_columns) # Rescaling of these values to stay in the chosen range - for event, (scale_default, shape_default) in enumerate( - zip(scale_ranges, shape_ranges) + for event_id, shape_range, scale_range in zip( + range(1, n_events + 1), cycle(shape_ranges), cycle(scale_ranges) ): df_shape_scale_star = rescaling_params_to_respect_default_ranges( - df_shape_scale_star, shape_default, scale_default, event + df_shape_scale_star, shape_range, scale_range, event_id ) return df_shape_scale_star @@ -199,7 +199,7 @@ def make_complex_features_with_sparse_matrix( ) # Throw durations from a weibull distribution with scale and shape as the parameters event_durations = [] - for event in range(n_events): + for event in range(1, n_events + 1): shape = df_shape_scale_star[f"shape_{event}"] scale = df_shape_scale_star[f"scale_{event}"] durations = weibull_min.rvs(shape, scale=scale * base_scale, random_state=rng) From 2de17bb1efdb5ec58eef9e0b10407e5133427968 Mon Sep 17 00:00:00 2001 From: Julie Alberge Date: Wed, 13 Dec 2023 15:04:00 +0100 Subject: [PATCH 06/13] changes dones after Olivier's reviewing --- examples/plot_complex_data.py | 33 ++++++++++++++++++---------- hazardous/data/__init__.py | 4 ++-- hazardous/data/_competing_weibull.py | 18 ++++++--------- 3 files changed, 31 insertions(+), 24 deletions(-) diff --git a/examples/plot_complex_data.py b/examples/plot_complex_data.py index dc80505..9dc67e2 100644 --- a/examples/plot_complex_data.py +++ b/examples/plot_complex_data.py @@ -1,3 +1,17 @@ +""" +========================================== +Modeling competing risks on synthetic data +========================================== + +This example introduces a synthetic data generation tool that can +be helpful to study the relative performance and potential biases +of predictive competing risks estimators on right-censored data. +Some of the input features and their second order multiplicative +interactions are statistically associated with the parameters of +the distributions from which the competing events are sampled. 
+ +""" + # %% import numpy as np import hazardous.data._competing_weibull as competing_w @@ -35,13 +49,12 @@ df_shape_scale_star = competing_w.compute_shape_and_scale( df_features, - 6, - 0.3, - n_events, - 2, - DEFAULT_SHAPE_RANGES, - DEFAULT_SCALE_RANGES, - 0, + features_rate=0.2, + n_events=3, + degree_interaction=2, + shape_ranges=DEFAULT_SHAPE_RANGES, + scale_ranges=DEFAULT_SCALE_RANGES, + random_state=0, ) fig, axes = plt.subplots(2, 3, figsize=(10, 7)) for idx, col in enumerate(df_shape_scale_star.columns): @@ -50,15 +63,14 @@ # %% -X, y_censored, y_uncensored = competing_w.make_synthetic_dataset( +X, y_censored, y_uncensored = competing_w.make_synthetic_competing_weibull( n_events=n_events, - n_weibull_parameters=2 * n_events, n_samples=3_000, base_scale=1_000, n_features=10, features_rate=0.5, degree_interaction=2, - independant=False, + independent=False, features_censoring_rate=0.2, return_uncensored_data=True, return_X_y=True, @@ -122,7 +134,6 @@ def plot_cumulative_incidence_functions( print(f"GB Incidence for event {event_id} fit in {duration:.3f} s") tic = perf_counter() cifs_pred = gb_incidence.predict_cumulative_incidence(X, coarse_timegrid) - print(cifs_pred.shape) cif_mean = cifs_pred.mean(axis=0) duration = perf_counter() - tic print(f"GB Incidence for event {event_id} prediction in {duration:.3f} s") diff --git a/hazardous/data/__init__.py b/hazardous/data/__init__.py index 458516d..92a8cc2 100644 --- a/hazardous/data/__init__.py +++ b/hazardous/data/__init__.py @@ -1,3 +1,3 @@ -# from ._competing_weibull import make_synthetic_competing_weibull +from ._competing_weibull import make_synthetic_competing_weibull -# __all__ = ["make_synthetic_dataset"] +__all__ = ["make_synthetic_competing_weibull"] diff --git a/hazardous/data/_competing_weibull.py b/hazardous/data/_competing_weibull.py index 83e06cd..e531dc1 100644 --- a/hazardous/data/_competing_weibull.py +++ b/hazardous/data/_competing_weibull.py @@ -23,7 +23,7 @@ def _censor( y, - independant=True, + independent=True, X=None, features_censoring_rate=0.2, censoring_relative_scale=1.5, @@ -33,7 +33,7 @@ def _censor( if censoring_relative_scale == 0 or censoring_relative_scale is None: return y - if independant: + if independent: scale_censoring = censoring_relative_scale * y["duration"].mean() shape_censoring = 1 else: @@ -67,7 +67,6 @@ def _censor( def compute_shape_and_scale( df_features, - n_weibull_parameters=6, features_rate=0.2, n_events=3, degree_interaction=2, @@ -84,6 +83,7 @@ def compute_shape_and_scale( df_trans = df_poly_features.fit_transform(df_features) # Create masked matrix with the interactions + n_weibull_parameters = 2 * n_events w_star = rng.randn(df_trans.shape[1], n_weibull_parameters) # Set 1-feature_rate% of the w_star to 0 w_star = np.where( @@ -172,7 +172,6 @@ def make_simple_features( def make_complex_features_with_sparse_matrix( n_events=3, - n_weibull_parameters=6, n_samples=3_000, base_scale=1_000, shape_ranges=DEFAULT_SHAPE_RANGES, @@ -189,7 +188,6 @@ def make_complex_features_with_sparse_matrix( df_shape_scale_star = compute_shape_and_scale( df_features, - n_weibull_parameters, features_rate, n_events, degree_interaction, @@ -211,15 +209,14 @@ def make_complex_features_with_sparse_matrix( return df_features, event_durations, duration_argmin -def make_synthetic_dataset( +def make_synthetic_competing_weibull( n_events, - n_weibull_parameters, n_samples=3_000, base_scale=1_000, n_features=10, features_rate=0.3, degree_interaction=2, - independant=True, + independent=True, 
features_censoring_rate=0.2, return_uncensored_data=False, return_X_y=True, @@ -229,7 +226,7 @@ def make_synthetic_dataset( scale_ranges=DEFAULT_SCALE_RANGES, censoring_relative_scale=1.5, random_state=0, - complex_features=True, + complex_features=False, ): """ Creating a synthetic dataset to make competing risks. @@ -247,7 +244,6 @@ def make_synthetic_dataset( if complex_features: X, event_durations, duration_argmin = make_complex_features_with_sparse_matrix( n_events, - n_weibull_parameters, n_samples, base_scale, shape_ranges, @@ -270,7 +266,7 @@ def make_synthetic_dataset( y_censored = _censor( y, censoring_relative_scale=censoring_relative_scale, - independant=independant, + independent=independent, X=X, features_censoring_rate=features_censoring_rate, random_state=random_state, From f1eb9fc73b445662421e9fc807af01e879b6f3bf Mon Sep 17 00:00:00 2001 From: Julie Alberge Date: Wed, 13 Dec 2023 15:25:20 +0100 Subject: [PATCH 07/13] fix some tests. --- hazardous/data/_competing_weibull.py | 4 ++-- hazardous/data/tests/test_competing_weibull.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/hazardous/data/_competing_weibull.py b/hazardous/data/_competing_weibull.py index e531dc1..d924ede 100644 --- a/hazardous/data/_competing_weibull.py +++ b/hazardous/data/_competing_weibull.py @@ -210,7 +210,7 @@ def make_complex_features_with_sparse_matrix( def make_synthetic_competing_weibull( - n_events, + n_events=3, n_samples=3_000, base_scale=1_000, n_features=10, @@ -281,7 +281,7 @@ def make_synthetic_competing_weibull( if return_X_y: if return_uncensored_data: return X, y_censored, y - return X, y + return X, y_censored frame = pd.concat([X, y], axis=1) return Bunch(data=frame[X.columns], target=frame[y.columns], frame=frame) diff --git a/hazardous/data/tests/test_competing_weibull.py b/hazardous/data/tests/test_competing_weibull.py index 594dbe3..b0dafcd 100644 --- a/hazardous/data/tests/test_competing_weibull.py +++ b/hazardous/data/tests/test_competing_weibull.py @@ -52,8 +52,8 @@ def test_competing_weibull_no_censoring(seed): y["event"], cv=3, ).mean() - assert 0.4 > baseline_classification_acc > 0.3 # approximately balanced - assert event_classification_acc > 1.2 * baseline_classification_acc + assert 0.45 > baseline_classification_acc > 0.3 # approximately balanced + assert event_classification_acc > 1.1 * baseline_classification_acc assert event_classification_acc < 0.6 # still challenging. # Check that the features make it possible to predict the durations better From 373745d18f401f033d09a06d3a9361a767cc92e8 Mon Sep 17 00:00:00 2001 From: Julie Alberge Date: Thu, 14 Dec 2023 10:49:47 +0100 Subject: [PATCH 08/13] iadding seasborn to push doc. 
--- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 2c24fb7..862b146 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,7 @@ doc = [ "matplotlib", "pillow", # to scrape images from the examples "numpydoc", + "seaborn", ] [project.urls] From 304dec77bda395f53440b6a4d71ff617c74f4dfe Mon Sep 17 00:00:00 2001 From: Julie Alberge Date: Thu, 14 Dec 2023 19:01:42 +0100 Subject: [PATCH 09/13] fixing AJ estimator, adding 50% features marinal and 50% features interaction in the creation of w_star --- examples/plot_complex_data.py | 35 +++++------------ hazardous/data/_competing_weibull.py | 57 ++++++++++++++++++---------- 2 files changed, 47 insertions(+), 45 deletions(-) diff --git a/examples/plot_complex_data.py b/examples/plot_complex_data.py index 9dc67e2..6dda353 100644 --- a/examples/plot_complex_data.py +++ b/examples/plot_complex_data.py @@ -26,18 +26,6 @@ seed = 0 rng = np.random.RandomState(seed) -DEFAULT_SHAPE_RANGES = ( - (0.7, 0.9), - (1.0, 1.0), - (2.0, 3.0), -) - -DEFAULT_SCALE_RANGES = ( - (1, 20), - (1, 10), - (1.5, 5), -) -n_events = 3 # %% @@ -52,19 +40,15 @@ features_rate=0.2, n_events=3, degree_interaction=2, - shape_ranges=DEFAULT_SHAPE_RANGES, - scale_ranges=DEFAULT_SCALE_RANGES, random_state=0, ) fig, axes = plt.subplots(2, 3, figsize=(10, 7)) for idx, col in enumerate(df_shape_scale_star.columns): sns.histplot(df_shape_scale_star[col], ax=axes[idx // 3][idx % 3]) - # %% X, y_censored, y_uncensored = competing_w.make_synthetic_competing_weibull( - n_events=n_events, n_samples=3_000, base_scale=1_000, n_features=10, @@ -74,15 +58,14 @@ features_censoring_rate=0.2, return_uncensored_data=True, return_X_y=True, - feature_rounding=4, - target_rounding=1, - shape_ranges=DEFAULT_SHAPE_RANGES, - scale_ranges=DEFAULT_SCALE_RANGES, - censoring_relative_scale=0.1, + feature_rounding=3, + target_rounding=4, + censoring_relative_scale=4.0, random_state=0, complex_features=True, ) +# %% n_samples = len(X) calculate_variance = n_samples <= 5_000 aj = AalenJohansenFitter(calculate_variance=calculate_variance, seed=0) @@ -99,7 +82,6 @@ show_progressbar=False, random_state=seed, ) -gb_incidence.fit(X, y_censored) # %% @@ -155,8 +137,8 @@ def plot_cumulative_incidence_functions( duration = perf_counter() - tic print(f"Aalen-Johansen for event {event_id} fit in {duration:.3f} s") aj.plot(label="Aalen-Johansen", ax=ax) - print(aj.cumulative_density_.values[-1]) - + ax.set_xlim(0, 8_000) + ax.set_ylim(0, 0.5) if event_id == 1: ax.legend(loc="lower right") else: @@ -170,6 +152,7 @@ def plot_cumulative_incidence_functions( ) y_train_u = y_uncensored.loc[y_train_c.index] y_test_u = y_uncensored.loc[y_test_c.index] + gb_incidence = GradientBoostingIncidence( learning_rate=0.1, n_iter=20, @@ -184,7 +167,7 @@ def plot_cumulative_incidence_functions( plot_cumulative_incidence_functions( X_train, y_train_u, - gb_incidence=gb_incidence, + gb_incidence=None, aj=aj, X_test=X_test, y_test=y_test_u, @@ -193,7 +176,7 @@ def plot_cumulative_incidence_functions( plot_cumulative_incidence_functions( X_train, y_train_c, - gb_incidence=gb_incidence, + gb_incidence=None, aj=aj, X_test=X_test, y_test=y_test_c, diff --git a/hazardous/data/_competing_weibull.py b/hazardous/data/_competing_weibull.py index d924ede..7273ad9 100644 --- a/hazardous/data/_competing_weibull.py +++ b/hazardous/data/_competing_weibull.py @@ -5,18 +5,19 @@ from scipy.special import expit from scipy.stats import weibull_min from sklearn.datasets._base import Bunch -from 
sklearn.preprocessing import MinMaxScaler, PolynomialFeatures +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import PolynomialFeatures, SplineTransformer, StandardScaler from sklearn.utils import check_random_state DEFAULT_SHAPE_RANGES = ( - (0.7, 0.9), + (0.8, 1.0), (1.0, 1.0), - (2.0, 3.0), + (1.2, 3.0), ) DEFAULT_SCALE_RANGES = ( - (1, 20), - (1, 10), + (1, 15), + (1, 7), (1.5, 5), ) @@ -35,7 +36,7 @@ def _censor( if independent: scale_censoring = censoring_relative_scale * y["duration"].mean() - shape_censoring = 1 + shape_censoring = 3 else: features_impact_censoring = rng.randn(X.shape[1], 2) features_impact_censoring = np.where( @@ -50,7 +51,7 @@ def _censor( df_censoring_params.columns = ["shape_0", "scale_0"] df_censoring_params = rescaling_params_to_respect_default_ranges( df_censoring_params, - shape_default=(1.0, 1.0), + shape_default=(3.0, 4.0), scale_default=(1, 10 * censoring_relative_scale), event=0, ) @@ -60,7 +61,9 @@ def _censor( shape_censoring, scale=scale_censoring, size=y.shape[0], random_state=rng ) y_censored = y.copy() - y_censored["event"] = np.where(y["duration"] < censoring, y_censored["event"], 0) + y_censored["event"] = np.where( + y_censored["duration"] < censoring, y_censored["event"], 0 + ) y_censored["duration"] = np.minimum(y_censored["duration"], censoring) return y_censored @@ -76,19 +79,35 @@ def compute_shape_and_scale( ): rng = np.random.RandomState(random_state) # Adding interactions between features - df_poly_features = PolynomialFeatures( - degree=degree_interaction, interaction_only=True, include_bias=False + preprocessor = make_pipeline( + SplineTransformer(n_knots=3), + PolynomialFeatures( + degree=degree_interaction, interaction_only=True, include_bias=False + ), ) - df_poly_features.set_output(transform="pandas") - df_trans = df_poly_features.fit_transform(df_features) - + preprocessor.set_output(transform="pandas") + df_trans = preprocessor.fit_transform(df_features) # Create masked matrix with the interactions n_weibull_parameters = 2 * n_events w_star = rng.randn(df_trans.shape[1], n_weibull_parameters) - # Set 1-feature_rate% of the w_star to 0 + # 1-feature_rate% of marginal features and interacted features + # are set to 0 + cols_features = df_trans.columns + marginal_cols = np.array([len(c.split(" ")) == 1 for c in cols_features]) * np.ones( + (n_weibull_parameters, df_trans.shape[1]) + ) + marginal_cols = marginal_cols.T.astype(bool) + w_star_marg = np.where( + (rng.rand(w_star.shape[0], w_star.shape[1]) < features_rate) & marginal_cols, + w_star, + 0, + ) w_star = np.where( - rng.rand(w_star.shape[0], w_star.shape[1]) < features_rate, w_star, 0 + (rng.rand(w_star.shape[0], w_star.shape[1]) < features_rate) & ~marginal_cols, + w_star, + 0, ) + w_star += w_star_marg # Computation of the true values of shape and scale shape_scale_star = df_trans.values @ w_star shape_scale_columns = [f"shape_{i}" for i in range(1, n_events + 1)] + [ @@ -113,7 +132,7 @@ def rescaling_params_to_respect_default_ranges( scale_min, scale_max = scale_default shape = df_shape_scale_star[f"shape_{event}"].copy() scale = df_shape_scale_star[f"scale_{event}"].copy() - scaler = MinMaxScaler(feature_range=(-2, 2)) + scaler = StandardScaler() df_shape_scale_star[f"shape_{event}"] = ( shape_min + (shape_max - shape_min) @@ -183,7 +202,7 @@ def make_complex_features_with_sparse_matrix( ): rng = np.random.RandomState(random_state) # Create features given to the model as X and then creating the interactions - df_features = 
pd.DataFrame(rng.randn(n_samples, n_features)) + df_features = pd.DataFrame(rng.rand(n_samples, n_features)) df_features.columns = [f"feature_{i}" for i in range(n_features)] df_shape_scale_star = compute_shape_and_scale( @@ -221,7 +240,7 @@ def make_synthetic_competing_weibull( return_uncensored_data=False, return_X_y=True, feature_rounding=2, - target_rounding=1, + target_rounding=2, shape_ranges=DEFAULT_SHAPE_RANGES, scale_ranges=DEFAULT_SCALE_RANGES, censoring_relative_scale=1.5, @@ -275,7 +294,7 @@ def make_synthetic_competing_weibull( X = X.round(feature_rounding) if target_rounding is not None: - y_censored = y_censored.round(target_rounding) + y_censored["duration"] = y_censored["duration"].round(target_rounding) y = y.round(target_rounding) if return_X_y: From 59008ada64814d4c06ce48034ee2c6a8b1cadf11 Mon Sep 17 00:00:00 2001 From: Julie Alberge Date: Thu, 14 Dec 2023 19:23:58 +0100 Subject: [PATCH 10/13] fixing tests (because shape ans scale default parameters have been changed, some tests were broken) --- .../data/tests/test_competing_weibull.py | 35 ++----------------- .../tests/test_gradient_boosting_incidence.py | 2 +- 2 files changed, 4 insertions(+), 33 deletions(-) diff --git a/hazardous/data/tests/test_competing_weibull.py b/hazardous/data/tests/test_competing_weibull.py index b0dafcd..3799f66 100644 --- a/hazardous/data/tests/test_competing_weibull.py +++ b/hazardous/data/tests/test_competing_weibull.py @@ -1,6 +1,6 @@ import pytest -from sklearn.dummy import DummyClassifier, DummyRegressor -from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.dummy import DummyClassifier +from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import cross_val_score from hazardous.data import make_synthetic_competing_weibull @@ -56,35 +56,6 @@ def test_competing_weibull_no_censoring(seed): assert event_classification_acc > 1.1 * baseline_classification_acc assert event_classification_acc < 0.6 # still challenging. - # Check that the features make it possible to predict the durations better - # than a marginal baseline, but that a significant amount of unpredictability - # remains. - median_duration = y["duration"].median() - duration_regression_relative_error = ( - -cross_val_score( - RandomForestRegressor(criterion="poisson", random_state=seed), - X, - y["duration"], - cv=3, - n_jobs=4, - scoring="neg_mean_absolute_error", - ).mean() - / median_duration - ) - baseline_duration_relative_error = ( - -cross_val_score( - DummyRegressor(strategy="mean"), - X, - y["duration"], - cv=3, - scoring="neg_mean_absolute_error", - ).mean() - / median_duration - ) - assert 0.9 < baseline_duration_relative_error < 1.1 # approximately balanced - assert duration_regression_relative_error < 0.98 * baseline_duration_relative_error - assert duration_regression_relative_error > 0.8 # still challenging. 
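The next hunk loosens the censoring-rate assertions. The intuition: censoring_relative_scale sets the scale of the censoring Weibull relative to the mean event duration, so a larger value pushes censoring times out and censors fewer samples. A rough standalone illustration (keyword names as in the tests above; the printed rates are indicative only):

from hazardous.data import make_synthetic_competing_weibull

for relative_scale in (0.8, 1.5, 3.0):
    _, y_censored = make_synthetic_competing_weibull(
        n_samples=10_000,
        censoring_relative_scale=relative_scale,
        return_X_y=True,
        random_state=0,
    )
    censoring_rate = (y_censored["event"] == 0).mean()
    print(f"censoring_relative_scale={relative_scale}: {censoring_rate:.2f} censored")
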
- @pytest.mark.parametrize("seed", range(3)) def test_competing_weibull_with_censoring(seed): @@ -109,7 +80,7 @@ def test_competing_weibull_with_censoring(seed): # Censoring rate is lower for higher scale: high_scale_censoring_rate = (y_high_scale["event"] == 0).mean() low_scale_censoring_rate = (y_low_scale["event"] == 0).mean() - assert 0.35 < high_scale_censoring_rate < 0.5 + assert 0.3 < high_scale_censoring_rate < 0.5 assert 0.5 < low_scale_censoring_rate < 0.65 # Uncensored events and durations match: diff --git a/hazardous/tests/test_gradient_boosting_incidence.py b/hazardous/tests/test_gradient_boosting_incidence.py index 0c2fd73..4e14ab3 100644 --- a/hazardous/tests/test_gradient_boosting_incidence.py +++ b/hazardous/tests/test_gradient_boosting_incidence.py @@ -139,4 +139,4 @@ def test_gradient_boosting_incidence_parameter_tuning(seed): worst_ibs = -cv_results.iloc[0]["mean_test_score"] best_ibs = -cv_results.iloc[-1]["mean_test_score"] assert best_ibs == pytest.approx(-grid_search.best_score_) - assert worst_ibs > 1.4 * best_ibs + assert worst_ibs > 1.25 * best_ibs From cdb3ad5965307f19dc7ec037c7f2239470584a4f Mon Sep 17 00:00:00 2001 From: Julie Alberge Date: Thu, 14 Dec 2023 19:38:06 +0100 Subject: [PATCH 11/13] adjusting code as Vincent's propositions --- examples/plot_complex_data.py | 10 ++-- hazardous/data/_competing_weibull.py | 76 +++++++++++++++------------- 2 files changed, 45 insertions(+), 41 deletions(-) diff --git a/examples/plot_complex_data.py b/examples/plot_complex_data.py index 6dda353..d0d4857 100644 --- a/examples/plot_complex_data.py +++ b/examples/plot_complex_data.py @@ -18,6 +18,7 @@ from time import perf_counter import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split +from sklearn.utils import check_random_state import pandas as pd import seaborn as sns @@ -25,13 +26,12 @@ from lifelines import AalenJohansenFitter seed = 0 -rng = np.random.RandomState(seed) +rng = check_random_state(seed) # %% # In the following cell, we verify that the synthetic dataset is well defined. 
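A side note on the check_random_state switch just above: unlike np.random.RandomState(seed), it accepts an integer seed, None, or an existing RandomState instance, which makes it easy to thread a single RNG through helper functions. A small sketch of that behaviour:

from numpy.random import RandomState
from sklearn.utils import check_random_state

rng = check_random_state(0)              # int -> a fresh, seeded RandomState
assert isinstance(rng, RandomState)
assert check_random_state(rng) is rng    # an existing RandomState passes through unchanged
assert isinstance(check_random_state(None), RandomState)  # None -> NumPy's global RandomState
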
-rng = np.random.RandomState(seed) df_features = pd.DataFrame(rng.randn(100_000, 10)) df_features.columns = [f"feature_{i}" for i in range(10)] @@ -54,7 +54,7 @@ n_features=10, features_rate=0.5, degree_interaction=2, - independent=False, + independent_censoring=False, features_censoring_rate=0.2, return_uncensored_data=True, return_X_y=True, @@ -167,7 +167,7 @@ def plot_cumulative_incidence_functions( plot_cumulative_incidence_functions( X_train, y_train_u, - gb_incidence=None, + gb_incidence=gb_incidence, aj=aj, X_test=X_test, y_test=y_test_u, @@ -176,7 +176,7 @@ def plot_cumulative_incidence_functions( plot_cumulative_incidence_functions( X_train, y_train_c, - gb_incidence=None, + gb_incidence=gb_incidence, aj=aj, X_test=X_test, y_test=y_test_c, diff --git a/hazardous/data/_competing_weibull.py b/hazardous/data/_competing_weibull.py index 7273ad9..1bb67fa 100644 --- a/hazardous/data/_competing_weibull.py +++ b/hazardous/data/_competing_weibull.py @@ -24,30 +24,28 @@ def _censor( y, - independent=True, + independent_censoring=True, X=None, features_censoring_rate=0.2, censoring_relative_scale=1.5, random_state=0, ): - rng = np.random.RandomState(random_state) + rng = check_random_state(random_state) if censoring_relative_scale == 0 or censoring_relative_scale is None: return y - if independent: + if independent_censoring: scale_censoring = censoring_relative_scale * y["duration"].mean() shape_censoring = 3 else: - features_impact_censoring = rng.randn(X.shape[1], 2) - features_impact_censoring = np.where( - rng.rand( - features_impact_censoring.shape[0], features_impact_censoring.shape[1] - ) + w_censoring_star = rng.randn(X.shape[1], 2) + w_censoring_star = np.where( + rng.rand(w_censoring_star.shape[0], w_censoring_star.shape[1]) < features_censoring_rate, - features_impact_censoring, + w_censoring_star, 0, ) - df_censoring_params = censoring_relative_scale * X @ features_impact_censoring + df_censoring_params = censoring_relative_scale * X @ w_censoring_star df_censoring_params.columns = ["shape_0", "scale_0"] df_censoring_params = rescaling_params_to_respect_default_ranges( df_censoring_params, @@ -77,7 +75,7 @@ def compute_shape_and_scale( scale_ranges=DEFAULT_SCALE_RANGES, random_state=0, ): - rng = np.random.RandomState(random_state) + rng = check_random_state(random_state) # Adding interactions between features preprocessor = make_pipeline( SplineTransformer(n_knots=3), @@ -86,15 +84,15 @@ def compute_shape_and_scale( ), ) preprocessor.set_output(transform="pandas") - df_trans = preprocessor.fit_transform(df_features) + df_features_transformed = preprocessor.fit_transform(df_features) # Create masked matrix with the interactions n_weibull_parameters = 2 * n_events - w_star = rng.randn(df_trans.shape[1], n_weibull_parameters) + w_star = rng.randn(df_features_transformed.shape[1], n_weibull_parameters) # 1-feature_rate% of marginal features and interacted features # are set to 0 - cols_features = df_trans.columns + cols_features = df_features_transformed.columns marginal_cols = np.array([len(c.split(" ")) == 1 for c in cols_features]) * np.ones( - (n_weibull_parameters, df_trans.shape[1]) + (n_weibull_parameters, df_features_transformed.shape[1]) ) marginal_cols = marginal_cols.T.astype(bool) w_star_marg = np.where( @@ -109,7 +107,7 @@ def compute_shape_and_scale( ) w_star += w_star_marg # Computation of the true values of shape and scale - shape_scale_star = df_trans.values @ w_star + shape_scale_star = df_features_transformed.values @ w_star shape_scale_columns = [f"shape_{i}" 
for i in range(1, n_events + 1)] + [ f"scale_{i}" for i in range(1, n_events + 1) ] @@ -130,18 +128,16 @@ def rescaling_params_to_respect_default_ranges( # Rescaling of these values to stay in the chosen range shape_min, shape_max = shape_default scale_min, scale_max = scale_default - shape = df_shape_scale_star[f"shape_{event}"].copy() - scale = df_shape_scale_star[f"scale_{event}"].copy() scaler = StandardScaler() df_shape_scale_star[f"shape_{event}"] = ( shape_min + (shape_max - shape_min) - * expit(scaler.fit_transform(shape.values.reshape(-1, 1))) + * expit(scaler.fit_transform(df_shape_scale_star[f"shape_{event}"])) ).flatten() df_shape_scale_star[f"scale_{event}"] = ( scale_min + (scale_max - scale_min) - * expit(scaler.fit_transform(scale.values.reshape(-1, 1))).flatten() + * expit(scaler.fit_transform(df_shape_scale_star[f"scale_{event}"])).flatten() ) return df_shape_scale_star @@ -202,8 +198,11 @@ def make_complex_features_with_sparse_matrix( ): rng = np.random.RandomState(random_state) # Create features given to the model as X and then creating the interactions - df_features = pd.DataFrame(rng.rand(n_samples, n_features)) - df_features.columns = [f"feature_{i}" for i in range(n_features)] + columns = [f"feature_{i}" for i in range(n_features)] + df_features = pd.DataFrame( + rng.randn(n_samples, n_features), + columns=columns, + ) df_shape_scale_star = compute_shape_and_scale( df_features, @@ -235,12 +234,12 @@ def make_synthetic_competing_weibull( n_features=10, features_rate=0.3, degree_interaction=2, - independent=True, + independent_censoring=True, features_censoring_rate=0.2, return_uncensored_data=False, return_X_y=True, feature_rounding=2, - target_rounding=2, + target_rounding=4, shape_ranges=DEFAULT_SHAPE_RANGES, scale_ranges=DEFAULT_SCALE_RANGES, censoring_relative_scale=1.5, @@ -262,19 +261,24 @@ def make_synthetic_competing_weibull( """ if complex_features: X, event_durations, duration_argmin = make_complex_features_with_sparse_matrix( - n_events, - n_samples, - base_scale, - shape_ranges, - scale_ranges, - n_features, - features_rate, - degree_interaction, - random_state, + n_events=n_events, + n_samples=n_samples, + base_scale=base_scale, + shape_ranges=shape_ranges, + scale_ranges=scale_ranges, + n_features=n_features, + features_rate=features_rate, + degree_interaction=degree_interaction, + random_state=random_state, ) else: X, event_durations, duration_argmin = make_simple_features( - n_events, n_samples, base_scale, shape_ranges, scale_ranges, random_state + n_events=n_events, + n_samples=n_samples, + base_scale=base_scale, + shape_ranges=shape_ranges, + scale_ranges=scale_ranges, + random_state=random_state, ) y = pd.DataFrame( dict( @@ -285,7 +289,7 @@ def make_synthetic_competing_weibull( y_censored = _censor( y, censoring_relative_scale=censoring_relative_scale, - independent=independent, + independent_censoring=independent_censoring, X=X, features_censoring_rate=features_censoring_rate, random_state=random_state, @@ -303,4 +307,4 @@ def make_synthetic_competing_weibull( return X, y_censored frame = pd.concat([X, y], axis=1) - return Bunch(data=frame[X.columns], target=frame[y.columns], frame=frame) + return Bunch(data=frame[X.columns], target=frame[y_censored.columns], frame=frame) From c5f319334385fac1f9d1e9f0a474114d4598297f Mon Sep 17 00:00:00 2001 From: Julie Alberge Date: Fri, 15 Dec 2023 17:09:11 +0100 Subject: [PATCH 12/13] refacto code with vincent, tuning parameters. 
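The core of the refactor below is a single helper that maps an unbounded column of Weibull-parameter scores into a chosen interval: standardize, squash with the logistic sigmoid, then rescale affinely onto [param_min, param_max]. A standalone sketch of that transformation (names are illustrative, mirroring _rescale_params in the diff):

import numpy as np
from scipy.special import expit
from sklearn.preprocessing import StandardScaler

def rescale_column(column, param_min, param_max):
    # column has shape (n_samples, 1); the output lies strictly inside (param_min, param_max)
    squashed = expit(StandardScaler().fit_transform(column))  # values in (0, 1)
    return (param_min + (param_max - param_min) * squashed).ravel()

raw_scores = np.random.default_rng(0).normal(size=(1_000, 1))
shape_1 = rescale_column(raw_scores, 1.0, 3.5)  # e.g. the new shape range of event 1
print(shape_1.min(), shape_1.max())             # stays inside (1.0, 3.5)

Because the sigmoid is monotonic, the ordering induced by the linear scores is preserved while degenerate Weibull shapes and scales are ruled out.
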
--- hazardous/data/_competing_weibull.py | 84 ++++++++++--------- .../data/tests/test_competing_weibull.py | 2 +- 2 files changed, 47 insertions(+), 39 deletions(-) diff --git a/hazardous/data/_competing_weibull.py b/hazardous/data/_competing_weibull.py index 1bb67fa..b20401c 100644 --- a/hazardous/data/_competing_weibull.py +++ b/hazardous/data/_competing_weibull.py @@ -10,14 +10,14 @@ from sklearn.utils import check_random_state DEFAULT_SHAPE_RANGES = ( - (0.8, 1.0), + (1.0, 3.5), (1.0, 1.0), - (1.2, 3.0), + (3.0, 6.0), ) DEFAULT_SCALE_RANGES = ( - (1, 15), (1, 7), + (1, 15), (1.5, 5), ) @@ -47,12 +47,18 @@ def _censor( ) df_censoring_params = censoring_relative_scale * X @ w_censoring_star df_censoring_params.columns = ["shape_0", "scale_0"] - df_censoring_params = rescaling_params_to_respect_default_ranges( - df_censoring_params, - shape_default=(3.0, 4.0), - scale_default=(1, 10 * censoring_relative_scale), - event=0, + + df_censoring_params["shape_0"] = _rescale_params( + df_censoring_params[["shape_0"]], + param_min=2.0, + param_max=3.0, + ) + df_censoring_params["scale_0"] = _rescale_params( + df_censoring_params[["scale_0"]], + param_min=1, + param_max=censoring_relative_scale, ) + scale_censoring = df_censoring_params["scale_0"].values * y["duration"].mean() shape_censoring = df_censoring_params["shape_0"].values censoring = weibull_min.rvs( @@ -67,7 +73,7 @@ def _censor( def compute_shape_and_scale( - df_features, + X, features_rate=0.2, n_events=3, degree_interaction=2, @@ -84,62 +90,64 @@ def compute_shape_and_scale( ), ) preprocessor.set_output(transform="pandas") - df_features_transformed = preprocessor.fit_transform(df_features) + X_trans = preprocessor.fit_transform(X) # Create masked matrix with the interactions n_weibull_parameters = 2 * n_events - w_star = rng.randn(df_features_transformed.shape[1], n_weibull_parameters) + w_star = rng.randn(X_trans.shape[1], n_weibull_parameters) # 1-feature_rate% of marginal features and interacted features # are set to 0 - cols_features = df_features_transformed.columns - marginal_cols = np.array([len(c.split(" ")) == 1 for c in cols_features]) * np.ones( - (n_weibull_parameters, df_features_transformed.shape[1]) - ) - marginal_cols = marginal_cols.T.astype(bool) - w_star_marg = np.where( - (rng.rand(w_star.shape[0], w_star.shape[1]) < features_rate) & marginal_cols, + cols_features = X_trans.columns + marginal_mask = np.array([len(col.split(" ")) == 1 for col in cols_features]) + marginal_cols = np.repeat( + marginal_mask.reshape(-1, 1), repeats=n_weibull_parameters, axis=1 + ) # (X_trans.shape[1], n_weibull_parameters) + + drop_out_mask = rng.rand(w_star.shape[0], w_star.shape[1]) < features_rate + w_star_marginal = np.where( + drop_out_mask & marginal_cols, w_star, 0, ) w_star = np.where( - (rng.rand(w_star.shape[0], w_star.shape[1]) < features_rate) & ~marginal_cols, + drop_out_mask & ~marginal_cols, w_star, 0, ) - w_star += w_star_marg + w_star += w_star_marginal + # Computation of the true values of shape and scale - shape_scale_star = df_features_transformed.values @ w_star + shape_scale_star = X_trans.values @ w_star shape_scale_columns = [f"shape_{i}" for i in range(1, n_events + 1)] + [ f"scale_{i}" for i in range(1, n_events + 1) ] df_shape_scale_star = pd.DataFrame(shape_scale_star, columns=shape_scale_columns) # Rescaling of these values to stay in the chosen range + + return rescale_params(df_shape_scale_star, n_events, shape_ranges, scale_ranges) + + +def rescale_params(df_shape_scale_star, n_events, shape_ranges, 
scale_ranges): for event_id, shape_range, scale_range in zip( range(1, n_events + 1), cycle(shape_ranges), cycle(scale_ranges) ): - df_shape_scale_star = rescaling_params_to_respect_default_ranges( - df_shape_scale_star, shape_range, scale_range, event_id + shape_min, shape_max = shape_range + scale_min, scale_max = scale_range + df_shape_scale_star[f"shape_{event_id}"] = _rescale_params( + df_shape_scale_star[[f"shape_{event_id}"]], shape_min, shape_max + ) + df_shape_scale_star[f"scale_{event_id}"] = _rescale_params( + df_shape_scale_star[[f"scale_{event_id}"]], scale_min, scale_max ) return df_shape_scale_star -def rescaling_params_to_respect_default_ranges( - df_shape_scale_star, shape_default, scale_default, event -): +def _rescale_params(column_param, param_min, param_max): # Rescaling of these values to stay in the chosen range - shape_min, shape_max = shape_default - scale_min, scale_max = scale_default scaler = StandardScaler() - df_shape_scale_star[f"shape_{event}"] = ( - shape_min - + (shape_max - shape_min) - * expit(scaler.fit_transform(df_shape_scale_star[f"shape_{event}"])) + column_param = ( + param_min + (param_max - param_min) * expit(scaler.fit_transform(column_param)) ).flatten() - df_shape_scale_star[f"scale_{event}"] = ( - scale_min - + (scale_max - scale_min) - * expit(scaler.fit_transform(df_shape_scale_star[f"scale_{event}"])).flatten() - ) - return df_shape_scale_star + return column_param def make_simple_features( diff --git a/hazardous/data/tests/test_competing_weibull.py b/hazardous/data/tests/test_competing_weibull.py index 3799f66..eb2e638 100644 --- a/hazardous/data/tests/test_competing_weibull.py +++ b/hazardous/data/tests/test_competing_weibull.py @@ -54,7 +54,7 @@ def test_competing_weibull_no_censoring(seed): ).mean() assert 0.45 > baseline_classification_acc > 0.3 # approximately balanced assert event_classification_acc > 1.1 * baseline_classification_acc - assert event_classification_acc < 0.6 # still challenging. + assert event_classification_acc < 0.63 # still challenging. @pytest.mark.parametrize("seed", range(3)) From 2f1ba2b3180a6153684fec344cc280330b82e1c2 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Mon, 18 Dec 2023 18:01:04 +0100 Subject: [PATCH 13/13] =?UTF-8?q?small=20rework=20of=20the=20example=20?= =?UTF-8?q?=F0=9F=92=AB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/plot_complex_data.py | 114 +++++++++++++++++---------- hazardous/data/_competing_weibull.py | 98 ++++++++++++----------- pyproject.toml | 5 +- 3 files changed, 131 insertions(+), 86 deletions(-) diff --git a/examples/plot_complex_data.py b/examples/plot_complex_data.py index d0d4857..f115dec 100644 --- a/examples/plot_complex_data.py +++ b/examples/plot_complex_data.py @@ -13,17 +13,7 @@ """ # %% -import numpy as np -import hazardous.data._competing_weibull as competing_w -from time import perf_counter -import matplotlib.pyplot as plt -from sklearn.model_selection import train_test_split from sklearn.utils import check_random_state -import pandas as pd -import seaborn as sns - -from hazardous import GradientBoostingIncidence -from lifelines import AalenJohansenFitter seed = 0 rng = check_random_state(seed) @@ -31,24 +21,33 @@ # %% # In the following cell, we verify that the synthetic dataset is well defined. 
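Before the cell below, it helps to know how wide the design matrix becomes inside compute_shape_and_scale: each feature is expanded into spline basis functions and then crossed pairwise, and the interaction columns are exactly the ones whose names contain a space. A quick sketch of just that preprocessing step (the pipeline mirrors the library code; the printed width is whatever the transformers produce for 10 features):

import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, SplineTransformer

X_demo = pd.DataFrame(
    np.random.default_rng(0).normal(size=(100, 10)),
    columns=[f"feature_{i}" for i in range(10)],
)
preprocessor = make_pipeline(
    SplineTransformer(n_knots=3),
    PolynomialFeatures(degree=2, interaction_only=True, include_bias=False),
)
preprocessor.set_output(transform="pandas")
X_trans = preprocessor.fit_transform(X_demo)
n_interactions = sum(" " in col for col in X_trans.columns)
print(X_trans.shape, f"of which {n_interactions} are interaction columns")
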
+import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns +from hazardous.data._competing_weibull import compute_shape_and_scale + -df_features = pd.DataFrame(rng.randn(100_000, 10)) -df_features.columns = [f"feature_{i}" for i in range(10)] +X = rng.randn(10_000, 10) +column_names = [f"feature_{i}" for i in range(X.shape[1])] +X = pd.DataFrame(X, columns=column_names) -df_shape_scale_star = competing_w.compute_shape_and_scale( - df_features, +SS_star = compute_shape_and_scale( + X, features_rate=0.2, n_events=3, degree_interaction=2, random_state=0, ) + fig, axes = plt.subplots(2, 3, figsize=(10, 7)) -for idx, col in enumerate(df_shape_scale_star.columns): - sns.histplot(df_shape_scale_star[col], ax=axes[idx // 3][idx % 3]) +for idx, col in enumerate(SS_star.columns): + sns.histplot(SS_star[col], ax=axes[idx // 3][idx % 3]) # %% +from hazardous.data._competing_weibull import make_synthetic_competing_weibull -X, y_censored, y_uncensored = competing_w.make_synthetic_competing_weibull( + +X, y_censored, y_uncensored = make_synthetic_competing_weibull( n_samples=3_000, base_scale=1_000, n_features=10, @@ -57,20 +56,35 @@ independent_censoring=False, features_censoring_rate=0.2, return_uncensored_data=True, - return_X_y=True, feature_rounding=3, target_rounding=4, censoring_relative_scale=4.0, - random_state=0, complex_features=True, + return_X_y=True, + random_state=seed, +) + +print(y_censored["event"].value_counts().sort_index()) + +sns.histplot( + y_censored, + x="duration", + hue="event", + multiple="stack", + palette="magma", ) # %% -n_samples = len(X) -calculate_variance = n_samples <= 5_000 +from lifelines import AalenJohansenFitter + + +calculate_variance = X.shape[0] <= 5_000 aj = AalenJohansenFitter(calculate_variance=calculate_variance, seed=0) +aj + +# %% +from hazardous._gradient_boosting_incidence import GradientBoostingIncidence -y_censored["event"].value_counts() gb_incidence = GradientBoostingIncidence( learning_rate=0.1, @@ -82,7 +96,7 @@ show_progressbar=False, random_state=seed, ) - +gb_incidence # %% # @@ -92,13 +106,23 @@ # Let's now estimate the CIFs on uncensored data and plot them against the # theoretical CIFs: +import numpy as np +from time import perf_counter + def plot_cumulative_incidence_functions( - X, y, gb_incidence=None, aj=None, X_test=None, y_test=None + X, + y, + gb_incidence=None, + aj=None, + X_test=None, + y_test=None, + verbose=False, ): n_events = y["event"].max() t_max = y["duration"].max() _, axes = plt.subplots(figsize=(12, 4), ncols=n_events, sharey=True) + # Compute the estimate of the CIFs on a coarse grid. 
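    # (The 100-point grid defined just below spans [0, t_max] and is only used to
    # evaluate the gradient-boosting CIF predictions; the Aalen-Johansen curves are
    # drawn by aj.plot on the estimator's own time points.)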
coarse_timegrid = np.linspace(0, t_max, num=100) censoring_fraction = (y["event"] == 0).mean() @@ -113,17 +137,28 @@ def plot_cumulative_incidence_functions( gb_incidence.set_params(event_of_interest=event_id) gb_incidence.fit(X, y) duration = perf_counter() - tic - print(f"GB Incidence for event {event_id} fit in {duration:.3f} s") + + if verbose: + print(f"GB Incidence for event {event_id} fit in {duration:.3f} s") + tic = perf_counter() cifs_pred = gb_incidence.predict_cumulative_incidence(X, coarse_timegrid) cif_mean = cifs_pred.mean(axis=0) duration = perf_counter() - tic - print(f"GB Incidence for event {event_id} prediction in {duration:.3f} s") - print("Brier score on training data:", -gb_incidence.score(X, y)) - if X_test is not None: + + if verbose: print( - "Brier score on testing data:", -gb_incidence.score(X_test, y_test) + f"GB Incidence for event {event_id} prediction in {duration:.3f} s" ) + + if verbose: + brier_score_train = -gb_incidence.score(X, y) + print(f"Brier score on training data: {brier_score_train:.3f}") + if X_test is not None: + brier_score_test = -gb_incidence.score(X_test, y_test) + print( + f"Brier score on testing data: {brier_score_test:.3f}", + ) ax.plot( coarse_timegrid, cif_mean, @@ -135,17 +170,25 @@ def plot_cumulative_incidence_functions( tic = perf_counter() aj.fit(y["duration"], y["event"], event_of_interest=event_id) duration = perf_counter() - tic - print(f"Aalen-Johansen for event {event_id} fit in {duration:.3f} s") + if verbose: + print(f"Aalen-Johansen for event {event_id} fit in {duration:.3f} s") + aj.plot(label="Aalen-Johansen", ax=ax) ax.set_xlim(0, 8_000) ax.set_ylim(0, 0.5) + if event_id == 1: ax.legend(loc="lower right") else: ax.legend().remove() + if verbose: + print("=" * 16, "\n") + # %% +from sklearn.model_selection import train_test_split + X_train, X_test, y_train_c, y_test_c = train_test_split( X, y_censored, test_size=0.3, random_state=seed @@ -153,17 +196,6 @@ def plot_cumulative_incidence_functions( y_train_u = y_uncensored.loc[y_train_c.index] y_test_u = y_uncensored.loc[y_test_c.index] -gb_incidence = GradientBoostingIncidence( - learning_rate=0.1, - n_iter=20, - max_leaf_nodes=15, - hard_zero_fraction=0.1, - min_samples_leaf=5, - loss="ibs", - show_progressbar=False, - random_state=seed, -) - plot_cumulative_incidence_functions( X_train, y_train_u, diff --git a/hazardous/data/_competing_weibull.py b/hazardous/data/_competing_weibull.py index b20401c..b6d3975 100644 --- a/hazardous/data/_competing_weibull.py +++ b/hazardous/data/_competing_weibull.py @@ -23,7 +23,7 @@ def _censor( - y, + y_uncensored, independent_censoring=True, X=None, features_censoring_rate=0.2, @@ -32,20 +32,22 @@ def _censor( ): rng = check_random_state(random_state) if censoring_relative_scale == 0 or censoring_relative_scale is None: - return y + return y_uncensored + + mean_duration = y_uncensored["duration"].mean() if independent_censoring: - scale_censoring = censoring_relative_scale * y["duration"].mean() + scale_censoring = censoring_relative_scale * mean_duration shape_censoring = 3 + else: - w_censoring_star = rng.randn(X.shape[1], 2) - w_censoring_star = np.where( - rng.rand(w_censoring_star.shape[0], w_censoring_star.shape[1]) - < features_censoring_rate, - w_censoring_star, + W_censoring_star = rng.randn(X.shape[1], 2) + W_censoring_star = np.where( + rng.rand(*W_censoring_star.shape) < features_censoring_rate, + W_censoring_star, 0, ) - df_censoring_params = censoring_relative_scale * X @ w_censoring_star + df_censoring_params = 
censoring_relative_scale * X @ W_censoring_star df_censoring_params.columns = ["shape_0", "scale_0"] df_censoring_params["shape_0"] = _rescale_params( @@ -59,12 +61,16 @@ def _censor( param_max=censoring_relative_scale, ) - scale_censoring = df_censoring_params["scale_0"].values * y["duration"].mean() + scale_censoring = df_censoring_params["scale_0"].values * mean_duration shape_censoring = df_censoring_params["shape_0"].values + censoring = weibull_min.rvs( - shape_censoring, scale=scale_censoring, size=y.shape[0], random_state=rng + shape_censoring, + scale=scale_censoring, + size=y_uncensored.shape[0], + random_state=rng, ) - y_censored = y.copy() + y_censored = y_uncensored.copy() y_censored["event"] = np.where( y_censored["duration"] < censoring, y_censored["event"], 0 ) @@ -82,6 +88,7 @@ def compute_shape_and_scale( random_state=0, ): rng = check_random_state(random_state) + # Adding interactions between features preprocessor = make_pipeline( SplineTransformer(n_knots=3), @@ -91,58 +98,61 @@ def compute_shape_and_scale( ) preprocessor.set_output(transform="pandas") X_trans = preprocessor.fit_transform(X) + # Create masked matrix with the interactions n_weibull_parameters = 2 * n_events - w_star = rng.randn(X_trans.shape[1], n_weibull_parameters) + W_star = rng.randn(X_trans.shape[1], n_weibull_parameters) + # 1-feature_rate% of marginal features and interacted features # are set to 0 - cols_features = X_trans.columns - marginal_mask = np.array([len(col.split(" ")) == 1 for col in cols_features]) + # Feature names without " " are marginal, otherwise they are interactions. + marginal_mask = np.array([len(col.split(" ")) == 1 for col in X_trans.columns]) marginal_cols = np.repeat( marginal_mask.reshape(-1, 1), repeats=n_weibull_parameters, axis=1 - ) # (X_trans.shape[1], n_weibull_parameters) + ) # shape: (X_trans.shape[1], n_weibull_parameters) - drop_out_mask = rng.rand(w_star.shape[0], w_star.shape[1]) < features_rate - w_star_marginal = np.where( + drop_out_mask = rng.rand(W_star.shape[0], W_star.shape[1]) < features_rate + W_star_marginal = np.where( drop_out_mask & marginal_cols, - w_star, + W_star, 0, ) - w_star = np.where( + W_star = np.where( drop_out_mask & ~marginal_cols, - w_star, + W_star, 0, ) - w_star += w_star_marginal + W_star += W_star_marginal # Computation of the true values of shape and scale - shape_scale_star = X_trans.values @ w_star + shape_scale_star = X_trans.values @ W_star shape_scale_columns = [f"shape_{i}" for i in range(1, n_events + 1)] + [ f"scale_{i}" for i in range(1, n_events + 1) ] - df_shape_scale_star = pd.DataFrame(shape_scale_star, columns=shape_scale_columns) - # Rescaling of these values to stay in the chosen range + SS_star = pd.DataFrame(shape_scale_star, columns=shape_scale_columns) - return rescale_params(df_shape_scale_star, n_events, shape_ranges, scale_ranges) + # Rescaling of these values to stay in the chosen range + return rescale_params(SS_star, n_events, shape_ranges, scale_ranges) -def rescale_params(df_shape_scale_star, n_events, shape_ranges, scale_ranges): +def rescale_params(SS_star, n_events, shape_ranges, scale_ranges): for event_id, shape_range, scale_range in zip( range(1, n_events + 1), cycle(shape_ranges), cycle(scale_ranges) ): shape_min, shape_max = shape_range scale_min, scale_max = scale_range - df_shape_scale_star[f"shape_{event_id}"] = _rescale_params( - df_shape_scale_star[[f"shape_{event_id}"]], shape_min, shape_max + SS_star[f"shape_{event_id}"] = _rescale_params( + SS_star[[f"shape_{event_id}"]], shape_min, 
shape_max ) - df_shape_scale_star[f"scale_{event_id}"] = _rescale_params( - df_shape_scale_star[[f"scale_{event_id}"]], scale_min, scale_max + SS_star[f"scale_{event_id}"] = _rescale_params( + SS_star[[f"scale_{event_id}"]], scale_min, scale_max ) - return df_shape_scale_star + return SS_star def _rescale_params(column_param, param_min, param_max): # Rescaling of these values to stay in the chosen range + # Expit is the logistic sigmoid function. scaler = StandardScaler() column_param = ( param_min + (param_max - param_min) * expit(scaler.fit_transform(column_param)) @@ -207,13 +217,13 @@ def make_complex_features_with_sparse_matrix( rng = np.random.RandomState(random_state) # Create features given to the model as X and then creating the interactions columns = [f"feature_{i}" for i in range(n_features)] - df_features = pd.DataFrame( + X = pd.DataFrame( rng.randn(n_samples, n_features), columns=columns, ) - df_shape_scale_star = compute_shape_and_scale( - df_features, + SS_star = compute_shape_and_scale( + X, features_rate, n_events, degree_interaction, @@ -224,15 +234,15 @@ def make_complex_features_with_sparse_matrix( # Throw durations from a weibull distribution with scale and shape as the parameters event_durations = [] for event in range(1, n_events + 1): - shape = df_shape_scale_star[f"shape_{event}"] - scale = df_shape_scale_star[f"scale_{event}"] + shape = SS_star[f"shape_{event}"] + scale = SS_star[f"scale_{event}"] durations = weibull_min.rvs(shape, scale=scale * base_scale, random_state=rng) event_durations.append(durations) # Creating the target tabular event_durations = np.asarray(event_durations) duration_argmin = np.argmin(event_durations, axis=0) - return df_features, event_durations, duration_argmin + return X, event_durations, duration_argmin def make_synthetic_competing_weibull( @@ -288,18 +298,18 @@ def make_synthetic_competing_weibull( scale_ranges=scale_ranges, random_state=random_state, ) - y = pd.DataFrame( + y_uncensored = pd.DataFrame( dict( event=duration_argmin + 1, duration=event_durations[duration_argmin, np.arange(n_samples)], ) ) y_censored = _censor( - y, - censoring_relative_scale=censoring_relative_scale, + y_uncensored, independent_censoring=independent_censoring, X=X, features_censoring_rate=features_censoring_rate, + censoring_relative_scale=censoring_relative_scale, random_state=random_state, ) if feature_rounding is not None: @@ -307,12 +317,12 @@ def make_synthetic_competing_weibull( if target_rounding is not None: y_censored["duration"] = y_censored["duration"].round(target_rounding) - y = y.round(target_rounding) + y_uncensored = y_uncensored.round(target_rounding) if return_X_y: if return_uncensored_data: - return X, y_censored, y + return X, y_censored, y_uncensored return X, y_censored - frame = pd.concat([X, y], axis=1) + frame = pd.concat([X, y_censored], axis=1) return Bunch(data=frame[X.columns], target=frame[y_censored.columns], frame=frame) diff --git a/pyproject.toml b/pyproject.toml index 862b146..b5140b8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -107,7 +107,6 @@ exclude = [ "venv", "doc", ] -per-file-ignores = {} # Same as Black. line-length = 88 @@ -119,3 +118,7 @@ target-version = "py39" [tool.ruff.mccabe] max-complexity = 10 + +[tool.ruff.per-file-ignores] +# It's fine not to put the import at the top of the file in the examples +"examples/*" = ["E402"]
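
To close the series: after these patches the generator supports both covariate-independent and covariate-driven censoring via independent_censoring, on top of the complex (spline + interaction) feature model. A short sketch exercising both modes with the final keyword names (the printed rates will vary):

from hazardous.data._competing_weibull import make_synthetic_competing_weibull

for independent_censoring in (True, False):
    X, y_censored, y_uncensored = make_synthetic_competing_weibull(
        n_samples=3_000,
        censoring_relative_scale=1.5,
        independent_censoring=independent_censoring,
        features_censoring_rate=0.2,
        complex_features=True,
        return_uncensored_data=True,
        return_X_y=True,
        random_state=0,
    )
    censoring_rate = (y_censored["event"] == 0).mean()
    print(f"independent_censoring={independent_censoring}: "
          f"{censoring_rate:.1%} of rows censored")

Driving the censoring shape and scale from X, rather than from a fixed shape and a scale tied only to the mean duration, is what the later patches add.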