
Commit b038090

Synthetic dataset with expit
juAlberge committed Dec 11, 2023
1 parent ab5d372 commit b038090
Showing 2 changed files with 77 additions and 63 deletions.
85 changes: 43 additions & 42 deletions examples/complex_synthetic_data.py
@@ -14,9 +14,11 @@
 from sklearn.preprocessing import PolynomialFeatures
 import pandas as pd
 from scipy.stats import weibull_min
+from scipy.special import expit

-rng = np.random.default_rng(0)
+seed = 0
+rng = np.random.RandomState(seed)

 DEFAULT_SHAPE_RANGES = (
     (0.7, 0.9),
     (1.0, 1.0),
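
Note: this hunk swaps np.random.default_rng for the legacy np.random.RandomState. The switch matters because a Generator returned by default_rng has no randn/rand methods, while RandomState does, and the updated code below calls rng.randn(...) and rng.rand(...). A minimal standalone sketch of the API difference (plain NumPy, nothing specific to this repo):

    import numpy as np

    rng = np.random.RandomState(0)
    normal = rng.randn(2, 3)  # legacy API: standard normal draws
    uniform = rng.rand(2, 3)  # legacy API: uniform draws on [0, 1)

    gen = np.random.default_rng(0)
    # gen.randn(2, 3) would raise AttributeError; the Generator equivalents are:
    normal = gen.standard_normal((2, 3))
    uniform = gen.random((2, 3))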
@@ -53,81 +55,80 @@ def compute_shape_and_scale(
     df_trans = df_poly_features.fit_transform(df_features)

     # Create masked matrix with the interactions
-    w_star = np.random.randn(df_trans.shape[1], n_weibull_parameters)
+    w_star = rng.randn(df_trans.shape[1], n_weibull_parameters)
     # Set 1-feature_rate% of the w_star to 0
     w_star = np.where(
-        np.random.rand(w_star.shape[0], w_star.shape[1]) < features_rate, w_star, 0
+        rng.rand(w_star.shape[0], w_star.shape[1]) < features_rate, w_star, 0
     )
     # Computation of the true values of shape and scale
     shape_scale_star = df_trans.values @ w_star
     shape_scale_columns = [f"shape_{i}" for i in range(n_events)] + [
         f"scale_{i}" for i in range(n_events)
     ]
     df_shape_scale_star = pd.DataFrame(shape_scale_star, columns=shape_scale_columns)
     # Rescaling of these values to stay in the chosen range
-    df_shape_scale_star = rescaling_params_to_respect_default_ranges(
-        df_shape_scale_star
-    )
-    return df_shape_scale_star
-
-
-def rescaling_params_to_respect_default_ranges(df_shape_scale_star):
-    # Rescaling of these values to stay in the chosen range
     for event, (scale_default, shape_default) in enumerate(
         zip(DEFAULT_SCALE_RANGES, DEFAULT_SHAPE_RANGES)
     ):
-        shape_min, shape_max = shape_default
-        scale_min, scale_max = scale_default
-        shape = df_shape_scale_star[f"shape_{event}"].copy()
-        scale = df_shape_scale_star[f"scale_{event}"].copy()
-        shape = shape_min + (shape_max - shape_min) * (shape - shape.min()) / (
-            shape.max() - shape.min()
+        df_shape_scale_star = rescaling_params_to_respect_default_ranges(
+            df_shape_scale_star, shape_default, scale_default, event
         )
-        scale = scale_min + (scale_max - scale_min) * (scale - scale.min()) / (
-            scale.max() - scale.min()
-        )
-        df_shape_scale_star[f"shape_{event}"] = shape
-        df_shape_scale_star[f"scale_{event}"] = scale
     return df_shape_scale_star


+def rescaling_params_to_respect_default_ranges(
+    df_shape_scale_star, shape_default, scale_default, event
+):
+    # Rescaling of these values to stay in the chosen range
+    shape_min, shape_max = shape_default
+    scale_min, scale_max = scale_default
+    shape = df_shape_scale_star[f"shape_{event}"].copy()
+    scale = df_shape_scale_star[f"scale_{event}"].copy()
+    df_shape_scale_star[f"shape_{event}"] = shape_min + (shape_max - shape_min) * expit(
+        shape
+    )
+    df_shape_scale_star[f"scale_{event}"] = scale_min + (scale_max - scale_min) * expit(
+        scale
+    )
+    return df_shape_scale_star


 def censor_data(
-    y, relative_scale, independant=True, X=None, features_censoring_rate=0.2
+    y, independant=True, X=None, features_censoring_rate=0.2, relative_scale=1.5
 ):
     if relative_scale == 0 or relative_scale is None:
         return y

     if independant:
         scale_censoring = relative_scale * y["duration"].mean()
         shape_censoring = 1
     else:
-        features_impact_censoring = np.abs(np.random.randn(X.shape[1], 1))
+        features_impact_censoring = rng.randn(X.shape[1], 2)
         features_impact_censoring = np.where(
-            np.random.rand(
+            rng.rand(
                 features_impact_censoring.shape[0], features_impact_censoring.shape[1]
             )
             < features_censoring_rate,
             features_impact_censoring,
             0,
         )
-        scale_censoring = (X @ features_impact_censoring).values.flatten()
-        scale_censoring = (
-            (
-                1
-                + (scale_censoring - scale_censoring.min())
-                / (scale_censoring.max() - scale_censoring.min())
-            )
-            * relative_scale
-            * y["duration"].mean()
+        df_censoring_params = relative_scale * X @ features_impact_censoring
+        df_censoring_params.columns = ["shape_0", "scale_0"]
+        df_censoring_params = rescaling_params_to_respect_default_ranges(
+            df_censoring_params,
+            shape_default=(0.5, 0.9),
+            scale_default=(10, 15),
+            event=0,
        )

+        scale_censoring = df_censoring_params["scale_0"].values * y["duration"].mean()
+        shape_censoring = df_censoring_params["shape_0"].values
     censoring = weibull_min.rvs(
-        1, scale=scale_censoring, size=y.shape[0], random_state=rng
+        shape_censoring, scale=scale_censoring, size=y.shape[0], random_state=rng
     )
-    y = y.copy()
-    y["event"] = np.where(y["duration"] < censoring, y["event"], 0)
-    y["duration"] = np.minimum(y["duration"], censoring)
-    return y
+    y_censored = y.copy()
+    y_censored["event"] = np.where(y["duration"] < censoring, y_censored["event"], 0)
+    y_censored["duration"] = np.minimum(y_censored["duration"], censoring)
+    return y_censored


 def complex_data(
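
Note: the heart of this commit is the rescaling change above. Raw linear predictors are now pushed through the logistic sigmoid, expit(x) = 1 / (1 + exp(-x)), before being mapped affinely into each parameter's target range, instead of min-max normalization. expit squashes every value into (0, 1) independently, so the rescaled shapes and scales no longer depend on the batch minimum and maximum, and the range endpoints are approached only asymptotically rather than always being hit exactly. A standalone sketch with toy values; the (0.7, 0.9) range is DEFAULT_SHAPE_RANGES[0] from this file:

    import numpy as np
    from scipy.special import expit

    raw = np.array([-3.0, -0.5, 0.0, 1.2, 4.0])  # toy linear predictors
    shape_min, shape_max = 0.7, 0.9

    # New behavior: each value is mapped independently into (shape_min, shape_max).
    shape_expit = shape_min + (shape_max - shape_min) * expit(raw)

    # Old behavior: min-max rescaling, sensitive to the batch extremes;
    # adding one outlier to `raw` would shift every rescaled value.
    shape_minmax = shape_min + (shape_max - shape_min) * (raw - raw.min()) / (
        raw.max() - raw.min()
    )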
@@ -144,7 +145,7 @@ def complex_data(
     return_uncensored_data=False,
 ):
     # Create features given to the model as X and then creating the interactions
-    df_features = pd.DataFrame(np.random.randn(n_samples, n_features))
+    df_features = pd.DataFrame(rng.randn(n_samples, n_features))
     df_features.columns = [f"feature_{i}" for i in range(n_features)]

     df_shape_scale_star = compute_shape_and_scale(
@@ -169,7 +170,7 @@
     )
     y_censored = censor_data(
         y,
-        relative_scale,
+        relative_scale=relative_scale,
         independant=independant,
         X=df_features,
         features_censoring_rate=features_censoring_rate,
55 changes: 34 additions & 21 deletions examples/test_complex_data.py
@@ -8,8 +8,8 @@
 from hazardous import GradientBoostingIncidence
 from lifelines import AalenJohansenFitter

-rng = np.random.default_rng(0)
+seed = 0
+rng = np.random.RandomState(seed)
 DEFAULT_SHAPE_RANGES = (
     (0.7, 0.9),
     (1.0, 1.0),
@@ -25,37 +25,25 @@

 # %%

-X, y = complex_data(
+X, y_censored, y_uncensored = complex_data(
     n_events=n_events,
     n_weibull_parameters=2 * n_events,
-    n_samples=10_000,
+    n_samples=30_000,
     base_scale=1_000,
     n_features=10,
-    features_rate=0.3,
+    features_rate=0.5,
     degree_interaction=2,
     relative_scale=0.95,
     independant=False,
-    features_censoring_rate=0.1,
+    features_censoring_rate=0.2,
+    return_uncensored_data=True,
 )

-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
-
-n_samples = len(X_train)
+n_samples = len(X)
 calculate_variance = n_samples <= 5_000
 aj = AalenJohansenFitter(calculate_variance=calculate_variance, seed=0)

-gb_incidence = GradientBoostingIncidence(
-    learning_rate=0.03,
-    n_iter=200,
-    max_leaf_nodes=5,
-    hard_zero_fraction=0.1,
-    min_samples_leaf=50,
-    loss="ibs",
-    show_progressbar=False,
-    random_state=0,
-)
-
-y["event"].value_counts()
 # %%
 #
 # CIFs estimated on uncensored data
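
Note: with return_uncensored_data=True, complex_data now returns both the censored and the uncensored event tables, and the rest of the example fits and evaluates on each in turn. A quick sanity check on the returned frames (using the event/duration columns this example already relies on; the exact censoring rate depends on the draw):

    # censor_data marks censored rows with event == 0.
    censoring_rate = (y_censored["event"] == 0).mean()
    print(f"observed censoring rate: {censoring_rate:.1%}")

    # Durations can only shrink under censoring.
    assert (y_censored["duration"] <= y_uncensored["duration"]).all()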
@@ -110,6 +98,7 @@ def plot_cumulative_incidence_functions(
         duration = perf_counter() - tic
         print(f"Aalen-Johansen for event {event_id} fit in {duration:.3f} s")
         aj.plot(label="Aalen-Johansen", ax=ax)
+        print(aj.cumulative_density_.values[-1])

         if event_id == 1:
             ax.legend(loc="lower right")
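
Note: the added print reports the terminal value of the fitted Aalen-Johansen curve. In lifelines, AalenJohansenFitter.cumulative_density_ is a one-column DataFrame indexed by event time, so its last row is the cumulative incidence estimate at the largest observed time, a convenient scalar to compare across the censored and uncensored fits. Equivalently, assuming aj has already been fit as above:

    terminal_cif = aj.cumulative_density_.iloc[-1, 0]
    print(f"terminal CIF estimate: {terminal_cif:.3f}")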
@@ -119,13 +108,37 @@

 # %%

+X_train, X_test, y_train_c, y_test_c = train_test_split(
+    X, y_censored, test_size=0.3, random_state=seed
+)
+y_train_u = y_uncensored.loc[y_train_c.index]
+y_test_u = y_uncensored.loc[y_test_c.index]
+gb_incidence = GradientBoostingIncidence(
+    learning_rate=0.05,
+    n_iter=1_000,
+    max_leaf_nodes=50,
+    hard_zero_fraction=0.1,
+    min_samples_leaf=5,
+    loss="inll",
+    show_progressbar=False,
+    random_state=seed,
+)
+
 plot_cumulative_incidence_functions(
     X_train,
-    y_train,
+    y_train_u,
     gb_incidence=gb_incidence,
     aj=aj,
     X_test=X_test,
-    y_test=y_test,
+    y_test=y_test_u,
 )

+plot_cumulative_incidence_functions(
+    X_train,
+    y_train_c,
+    gb_incidence=gb_incidence,
+    aj=aj,
+    X_test=X_test,
+    y_test=y_test_c,
+)
 # %%
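
Note: the GradientBoostingIncidence configuration that moved into this hunk trains with loss="inll" (presumably an integrated negative log-likelihood) where the configuration removed earlier used loss="ibs" (integrated Brier score), and it is a larger model (n_iter=1_000, max_leaf_nodes=50, min_samples_leaf=5). The two plot_cumulative_incidence_functions calls then contrast the same estimator trained on the uncensored targets (y_train_u) against the censored ones (y_train_c). A hypothetical side-by-side of the two losses, assuming the sklearn-style fit(X, y) interface that the plotting helper presumably uses internally:

    # Both loss strings appear in this diff; other parameters left at defaults.
    for loss in ("ibs", "inll"):
        model = GradientBoostingIncidence(
            loss=loss,
            show_progressbar=False,
            random_state=seed,
        )
        model.fit(X_train, y_train_c)  # y holds "event" and "duration" columns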
