diff --git a/examples/complex_synthetic_data.py b/examples/complex_synthetic_data.py
index 5826b5e..de35621 100644
--- a/examples/complex_synthetic_data.py
+++ b/examples/complex_synthetic_data.py
@@ -14,9 +14,11 @@
 from sklearn.preprocessing import PolynomialFeatures
 import pandas as pd
 from scipy.stats import weibull_min
+from scipy.special import expit
 
-rng = np.random.default_rng(0)
 seed = 0
+rng = np.random.RandomState(seed)
+
 DEFAULT_SHAPE_RANGES = (
     (0.7, 0.9),
     (1.0, 1.0),
@@ -53,10 +55,10 @@ def compute_shape_and_scale(
     df_trans = df_poly_features.fit_transform(df_features)
     # Create masked matrix with the interactions
-    w_star = np.random.randn(df_trans.shape[1], n_weibull_parameters)
+    w_star = rng.randn(df_trans.shape[1], n_weibull_parameters)
     # Set 1-feature_rate% of the w_star to 0
     w_star = np.where(
-        np.random.rand(w_star.shape[0], w_star.shape[1]) < features_rate, w_star, 0
+        rng.rand(w_star.shape[0], w_star.shape[1]) < features_rate, w_star, 0
     )
 
     # Computation of the true values of shape and scale
     shape_scale_star = df_trans.values @ w_star
@@ -64,70 +66,69 @@
         f"scale_{i}" for i in range(n_events)
     ]
     df_shape_scale_star = pd.DataFrame(shape_scale_star, columns=shape_scale_columns)
-    # Rescaling of these values to stay in the chosen range
-    df_shape_scale_star = rescaling_params_to_respect_default_ranges(
-        df_shape_scale_star
-    )
-    return df_shape_scale_star
-
-
-def rescaling_params_to_respect_default_ranges(df_shape_scale_star):
     # Rescaling of these values to stay in the chosen range
     for event, (scale_default, shape_default) in enumerate(
         zip(DEFAULT_SCALE_RANGES, DEFAULT_SHAPE_RANGES)
     ):
-        shape_min, shape_max = shape_default
-        scale_min, scale_max = scale_default
-        shape = df_shape_scale_star[f"shape_{event}"].copy()
-        scale = df_shape_scale_star[f"scale_{event}"].copy()
-        shape = shape_min + (shape_max - shape_min) * (shape - shape.min()) / (
-            shape.max() - shape.min()
+        df_shape_scale_star = rescaling_params_to_respect_default_ranges(
+            df_shape_scale_star, shape_default, scale_default, event
         )
-        scale = scale_min + (scale_max - scale_min) * (scale - scale.min()) / (
-            scale.max() - scale.min()
-        )
-        df_shape_scale_star[f"shape_{event}"] = shape
-        df_shape_scale_star[f"scale_{event}"] = scale
+    return df_shape_scale_star
+
+
+def rescaling_params_to_respect_default_ranges(
+    df_shape_scale_star, shape_default, scale_default, event
+):
+    # Rescaling of these values to stay in the chosen range
+    shape_min, shape_max = shape_default
+    scale_min, scale_max = scale_default
+    shape = df_shape_scale_star[f"shape_{event}"].copy()
+    scale = df_shape_scale_star[f"scale_{event}"].copy()
+    df_shape_scale_star[f"shape_{event}"] = shape_min + (shape_max - shape_min) * expit(
+        shape
+    )
+    df_shape_scale_star[f"scale_{event}"] = scale_min + (scale_max - scale_min) * expit(
+        scale
+    )
     return df_shape_scale_star
 
 
 def censor_data(
-    y, relative_scale, independant=True, X=None, features_censoring_rate=0.2
+    y, independant=True, X=None, features_censoring_rate=0.2, relative_scale=1.5
 ):
     if relative_scale == 0 or relative_scale is None:
         return y
 
     if independant:
         scale_censoring = relative_scale * y["duration"].mean()
+        shape_censoring = 1
     else:
-        features_impact_censoring = np.abs(np.random.randn(X.shape[1], 1))
+        features_impact_censoring = rng.randn(X.shape[1], 2)
         features_impact_censoring = np.where(
-            np.random.rand(
+            rng.rand(
                 features_impact_censoring.shape[0], features_impact_censoring.shape[1]
             )
             < features_censoring_rate,
             features_impact_censoring,
             0,
         )
-        scale_censoring = (X @ features_impact_censoring).values.flatten()
-        scale_censoring = (
-            (
-                1
-                + (scale_censoring - scale_censoring.min())
-                / (scale_censoring.max() - scale_censoring.min())
-            )
-            * relative_scale
-            * y["duration"].mean()
+        df_censoring_params = relative_scale * X @ features_impact_censoring
+        df_censoring_params.columns = ["shape_0", "scale_0"]
+        df_censoring_params = rescaling_params_to_respect_default_ranges(
+            df_censoring_params,
+            shape_default=(0.5, 0.9),
+            scale_default=(10, 15),
+            event=0,
         )
-
+        scale_censoring = df_censoring_params["scale_0"].values * y["duration"].mean()
+        shape_censoring = df_censoring_params["shape_0"].values
     censoring = weibull_min.rvs(
-        1, scale=scale_censoring, size=y.shape[0], random_state=rng
+        shape_censoring, scale=scale_censoring, size=y.shape[0], random_state=rng
    )
 
-    y = y.copy()
-    y["event"] = np.where(y["duration"] < censoring, y["event"], 0)
-    y["duration"] = np.minimum(y["duration"], censoring)
-    return y
+    y_censored = y.copy()
+    y_censored["event"] = np.where(y["duration"] < censoring, y_censored["event"], 0)
+    y_censored["duration"] = np.minimum(y_censored["duration"], censoring)
+    return y_censored
 
 
 def complex_data(
@@ -144,7 +145,7 @@
     return_uncensored_data=False,
 ):
     # Create features given to the model as X and then creating the interactions
-    df_features = pd.DataFrame(np.random.randn(n_samples, n_features))
+    df_features = pd.DataFrame(rng.randn(n_samples, n_features))
     df_features.columns = [f"feature_{i}" for i in range(n_features)]
 
     df_shape_scale_star = compute_shape_and_scale(
@@ -169,7 +170,7 @@ def complex_data(
     )
     y_censored = censor_data(
         y,
-        relative_scale,
+        relative_scale=relative_scale,
         independant=independant,
         X=df_features,
         features_censoring_rate=features_censoring_rate,
diff --git a/examples/test_complex_data.py b/examples/test_complex_data.py
index 9bdfc5d..910aec2 100644
--- a/examples/test_complex_data.py
+++ b/examples/test_complex_data.py
@@ -8,8 +8,8 @@
 from hazardous import GradientBoostingIncidence
 from lifelines import AalenJohansenFitter
 
-rng = np.random.default_rng(0)
 seed = 0
+rng = np.random.RandomState(seed)
 DEFAULT_SHAPE_RANGES = (
     (0.7, 0.9),
     (1.0, 1.0),
@@ -25,37 +25,25 @@
 
 # %%
 
-X, y = complex_data(
+X, y_censored, y_uncensored = complex_data(
     n_events=n_events,
     n_weibull_parameters=2 * n_events,
-    n_samples=10_000,
+    n_samples=30_000,
     base_scale=1_000,
     n_features=10,
-    features_rate=0.3,
+    features_rate=0.5,
     degree_interaction=2,
     relative_scale=0.95,
     independant=False,
-    features_censoring_rate=0.1,
+    features_censoring_rate=0.2,
+    return_uncensored_data=True,
 )
 
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
-n_samples = len(X_train)
+n_samples = len(X)
 calculate_variance = n_samples <= 5_000
 aj = AalenJohansenFitter(calculate_variance=calculate_variance, seed=0)
 
-gb_incidence = GradientBoostingIncidence(
-    learning_rate=0.03,
-    n_iter=200,
-    max_leaf_nodes=5,
-    hard_zero_fraction=0.1,
-    min_samples_leaf=50,
-    loss="ibs",
-    show_progressbar=False,
-    random_state=0,
-)
-
-y["event"].value_counts()
 
 # %%
 #
 # CIFs estimated on uncensored data
@@ -110,6 +98,7 @@ def plot_cumulative_incidence_functions(
         duration = perf_counter() - tic
         print(f"Aalen-Johansen for event {event_id} fit in {duration:.3f} s")
         aj.plot(label="Aalen-Johansen", ax=ax)
+        print(aj.cumulative_density_.values[-1])
 
         if event_id == 1:
             ax.legend(loc="lower right")
@@ -119,13 +108,37 @@ def plot_cumulative_incidence_functions(
 
 # %%
 
+X_train, X_test, y_train_c, y_test_c = train_test_split(
+    X, y_censored, test_size=0.3, random_state=seed
+)
+y_train_u = y_uncensored.loc[y_train_c.index]
+y_test_u = y_uncensored.loc[y_test_c.index]
+gb_incidence = GradientBoostingIncidence(
+    learning_rate=0.05,
+    n_iter=1_000,
+    max_leaf_nodes=50,
+    hard_zero_fraction=0.1,
+    min_samples_leaf=5,
+    loss="inll",
+    show_progressbar=False,
+    random_state=seed,
+)
+
 plot_cumulative_incidence_functions(
     X_train,
-    y_train,
+    y_train_u,
     gb_incidence=gb_incidence,
     aj=aj,
     X_test=X_test,
-    y_test=y_test,
+    y_test=y_test_u,
 )
 
+plot_cumulative_incidence_functions(
+    X_train,
+    y_train_c,
+    gb_incidence=gb_incidence,
+    aj=aj,
+    X_test=X_test,
+    y_test=y_test_c,
+)
 # %%
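For reference, a minimal usage sketch of the patched API (not part of the diff). It mirrors the call in examples/test_complex_data.py; the import path and n_events = 3 are assumptions, since the test script defines n_events outside the hunks shown above.

# Usage sketch (assumptions: run from the repository root with `examples/`
# importable; n_events=3 matches the three entries in DEFAULT_SHAPE_RANGES /
# DEFAULT_SCALE_RANGES but is not visible in the hunks above).
from examples.complex_synthetic_data import censor_data, complex_data

n_events = 3
X, y_censored, y_uncensored = complex_data(
    n_events=n_events,
    n_weibull_parameters=2 * n_events,
    n_samples=30_000,
    base_scale=1_000,
    n_features=10,
    features_rate=0.5,
    degree_interaction=2,
    relative_scale=0.95,
    independant=False,
    features_censoring_rate=0.2,
    return_uncensored_data=True,
)

# With independant=True, censor_data draws censoring times from a Weibull with
# shape 1 (i.e. exponential) and scale relative_scale * mean duration; with
# independant=False it derives per-sample shape and scale from X as in the patch.
y_recensored = censor_data(y_uncensored, relative_scale=1.5, independant=True)
print(y_censored["event"].value_counts())
print(y_recensored["event"].value_counts())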