From ce322dfcada5ad0429e4953a56d854684c596ad0 Mon Sep 17 00:00:00 2001 From: Julie Alberge Date: Tue, 5 Dec 2023 17:10:03 +0100 Subject: [PATCH 01/13] Creating a new synthetic dataset. For now, the censoring method does not depend of the covariates. --- examples/complex_synthetic_data.py | 121 +++++++++++++++++++++++++++ examples/test_complex_data.py | 104 +++++++++++++++++++++++ hazardous/data/_competing_weibull.py | 10 +-- 3 files changed, 230 insertions(+), 5 deletions(-) create mode 100644 examples/complex_synthetic_data.py create mode 100644 examples/test_complex_data.py diff --git a/examples/complex_synthetic_data.py b/examples/complex_synthetic_data.py new file mode 100644 index 0000000..e8e25df --- /dev/null +++ b/examples/complex_synthetic_data.py @@ -0,0 +1,121 @@ +""" +================================================== +Synthetic DataSet +================================================== + +Trying to complexy synthetic data +""" +# %% +import numpy as np +from sklearn.preprocessing import PolynomialFeatures +import pandas as pd +from scipy.stats import weibull_min + +rng = np.random.default_rng(0) +seed = 0 +DEFAULT_SHAPE_RANGES = ( + (0.7, 0.9), + (1.0, 1.0), + (2.0, 3.0), +) + +DEFAULT_SCALE_RANGES = ( + (1, 20), + (1, 10), + (1.5, 5), +) + +n_events = 3 +n_weibull_parameters = 2 * n_events +n_samples = 3_000 +base_scale = 1_000 +n_features = 10 +features_rate = 0.3 +degree_interaction = 2 +relative_scale = 1.5 + +assert len(DEFAULT_SCALE_RANGES) == len(DEFAULT_SHAPE_RANGES) +assert len(DEFAULT_SHAPE_RANGES) == n_events + + +def complex_data( + n_events=n_events, + n_weibull_parameters=2 * n_events, + n_samples=3_000, + base_scale=1_000, + n_features=10, + features_rate=0.3, + degree_interaction=2, + relative_scale=1.5, +): + # Create features given to the model as X and then creating the interactions + df_features = pd.DataFrame(np.random.randn(n_samples, n_features)) + df_features.columns = [f"feature_{i}" for i in range(n_features)] + + # Adding interactions between features + df_poly_features = PolynomialFeatures( + degree=degree_interaction, interaction_only=True, include_bias=False + ) + df_poly_features.set_output(transform="pandas") + df_trans = df_poly_features.fit_transform(df_features) + + # Create masked matrix with the interactions + w_star = np.random.randn(df_trans.shape[1], n_weibull_parameters) + # Set 1-feature_rate% of the w_star to 0 + w_star = np.where( + np.random.rand(w_star.shape[0], w_star.shape[1]) < features_rate, 0, w_star + ) + # Computation of the true values of shape and scale + shape_scale_star = df_trans.values @ w_star + shape_scale_columns = [f"shape_{i}" for i in range(n_events)] + [ + f"scale_{i}" for i in range(n_events) + ] + df_shape_scale_star = pd.DataFrame(shape_scale_star, columns=shape_scale_columns) + + # Rescaling of these values to stay in the chosen range + for event, (scale_default, shape_default) in enumerate( + zip(DEFAULT_SCALE_RANGES, DEFAULT_SHAPE_RANGES) + ): + shape_min, shape_max = shape_default + scale_min, scale_max = scale_default + shape = df_shape_scale_star[f"shape_{event}"].copy() + scale = df_shape_scale_star[f"scale_{event}"].copy() + shape = shape_min + (shape_max - shape_min) * (shape - shape.min()) / ( + shape.max() - shape.min() + ) + scale = scale_min + (scale_max - scale_min) * (scale - scale.min()) / ( + scale.max() - scale.min() + ) + df_shape_scale_star[f"shape_{event}"] = shape + df_shape_scale_star[f"scale_{event}"] = scale + + # Throw durations from a weibull distribution with scale and shape as 
the parameters + event_durations = [] + for event in range(n_events): + shape = df_shape_scale_star[f"shape_{event}"] + scale = df_shape_scale_star[f"scale_{event}"] + durations = weibull_min.rvs(shape, scale=scale * base_scale, random_state=rng) + event_durations.append(durations) + + # Creating the target tabular + event_durations = np.asarray(event_durations) + duration_argmin = np.argmin(event_durations, axis=0) + y = pd.DataFrame( + dict( + event=duration_argmin + 1, + duration=event_durations[duration_argmin, np.arange(n_samples)], + ) + ) + + # censoring is independant from the covariates for now + if relative_scale > 0: + scale = relative_scale * y["duration"].mean() + censoring = weibull_min.rvs(1, scale=scale, size=y.shape[0], random_state=rng) + y = y.copy() + y["event"] = np.where(y["duration"] < censoring, y["event"], 0) + y["duration"] = np.minimum(y["duration"], censoring) + + return df_features, y + + +# %% diff --git a/examples/test_complex_data.py b/examples/test_complex_data.py new file mode 100644 index 0000000..726d4c7 --- /dev/null +++ b/examples/test_complex_data.py @@ -0,0 +1,104 @@ +# %% +import numpy as np +from complex_synthetic_data import complex_data +from time import perf_counter +import matplotlib.pyplot as plt + +from hazardous import GradientBoostingIncidence +from lifelines import AalenJohansenFitter + +rng = np.random.default_rng(0) +seed = 0 +DEFAULT_SHAPE_RANGES = ( + (0.7, 0.9), + (1.0, 1.0), + (2.0, 3.0), +) + +DEFAULT_SCALE_RANGES = ( + (1, 20), + (1, 10), + (1.5, 5), +) +n_events = 3 + +# %% + +X, y = complex_data() +n_samples = len(X) +calculate_variance = n_samples <= 5_000 +aj = AalenJohansenFitter(calculate_variance=calculate_variance, seed=0) + +gb_incidence = GradientBoostingIncidence( + learning_rate=0.03, + n_iter=200, + max_leaf_nodes=5, + hard_zero_fraction=0.1, + min_samples_leaf=50, + loss="ibs", + show_progressbar=False, + random_state=0, +) + +# %% +# +# CIFs estimated on uncensored data +# --------------------------------- +# +# Let's now estimate the CIFs on uncensored data and plot them against the +# theoretical CIFs: + + +def plot_cumulative_incidence_functions(X, y, gb_incidence=None, aj=None): + n_events = y["event"].max() + t_max = y["duration"].max() + _, axes = plt.subplots(figsize=(12, 4), ncols=n_events, sharey=True) + # Compute the estimate of the CIFs on a coarse grid. 
+ coarse_timegrid = np.linspace(0, t_max, num=100) + censoring_fraction = (y["event"] == 0).mean() + plt.suptitle( + "Cause-specific cumulative incidence functions" + f" ({censoring_fraction:.1%} censoring)" + ) + + for event_id, ax in enumerate(axes, 1): + if gb_incidence is not None: + tic = perf_counter() + gb_incidence.set_params(event_of_interest=event_id) + gb_incidence.fit(X, y) + duration = perf_counter() - tic + print(f"GB Incidence for event {event_id} fit in {duration:.3f} s") + tic = perf_counter() + cif_pred = gb_incidence.predict_cumulative_incidence( + X[0:1], coarse_timegrid + )[0] + duration = perf_counter() - tic + print(f"GB Incidence for event {event_id} prediction in {duration:.3f} s") + ax.plot( + coarse_timegrid, + cif_pred, + label="GradientBoostingIncidence", + ) + ax.set(title=f"Event {event_id}") + + if aj is not None: + tic = perf_counter() + aj.fit(y["duration"], y["event"], event_of_interest=event_id) + duration = perf_counter() - tic + print(f"Aalen-Johansen for event {event_id} fit in {duration:.3f} s") + aj.plot(label="Aalen-Johansen", ax=ax) + + if event_id == 1: + ax.legend(loc="lower right") + else: + ax.legend().remove() + + +# %% + +plot_cumulative_incidence_functions( + X, + y, + gb_incidence=gb_incidence, + aj=aj, +) diff --git a/hazardous/data/_competing_weibull.py b/hazardous/data/_competing_weibull.py index 2b15f9c..b1656b5 100644 --- a/hazardous/data/_competing_weibull.py +++ b/hazardous/data/_competing_weibull.py @@ -7,9 +7,9 @@ from sklearn.utils import check_random_state DEFAULT_SHAPE_RANGES = ( - (0.4, 0.9), + (0.7, 0.9), (1.0, 1.0), - (1.2, 3), + (2.0, 3.0), ) DEFAULT_SCALE_RANGES = ( @@ -19,7 +19,7 @@ ) -def _censor(y, relative_scale, random_state=None): +def _censor(y, relative_scale, random_state=0): if relative_scale == 0 or relative_scale is None: return y @@ -34,7 +34,7 @@ def _censor(y, relative_scale, random_state=None): def make_synthetic_competing_weibull( n_events=3, - n_samples=3000, + n_samples=3_000, return_X_y=False, base_scale=1_000, feature_rounding=2, @@ -101,4 +101,4 @@ def make_synthetic_competing_weibull( return X, y frame = pd.concat([X, y], axis=1) - return Bunch(data=frame[X.columns], target=X[y.columns], frame=frame) + return Bunch(data=frame[X.columns], target=frame[y.columns], frame=frame) From ab5d372829c4bf531695d5a9929a5f5dc319ee92 Mon Sep 17 00:00:00 2001 From: Julie Alberge Date: Fri, 8 Dec 2023 16:05:33 +0100 Subject: [PATCH 02/13] adding censoring depending on the covariates --- examples/complex_synthetic_data.py | 114 ++++++++++++++++++++++------- examples/test_complex_data.py | 37 ++++++++-- 2 files changed, 118 insertions(+), 33 deletions(-) diff --git a/examples/complex_synthetic_data.py b/examples/complex_synthetic_data.py index e8e25df..5826b5e 100644 --- a/examples/complex_synthetic_data.py +++ b/examples/complex_synthetic_data.py @@ -5,7 +5,11 @@ Trying to complexy synthetic data """ -# %% + +## TODO +# Change the linear scaling by an expit rescaling +# Improve censoring method by using covariates + import numpy as np from sklearn.preprocessing import PolynomialFeatures import pandas as pd @@ -38,20 +42,9 @@ assert len(DEFAULT_SHAPE_RANGES) == n_events -def complex_data( - n_events=n_events, - n_weibull_parameters=2 * n_events, - n_samples=3_000, - base_scale=1_000, - n_features=10, - features_rate=0.3, - degree_interaction=2, - relative_scale=1.5, +def compute_shape_and_scale( + df_features, n_weibull_parameters, features_rate, n_events, degree_interaction ): - # Create features given to the model 
as X and then creating the interactions - df_features = pd.DataFrame(np.random.randn(n_samples, n_features)) - df_features.columns = [f"feature_{i}" for i in range(n_features)] - # Adding interactions between features df_poly_features = PolynomialFeatures( degree=degree_interaction, interaction_only=True, include_bias=False @@ -63,7 +56,7 @@ def complex_data( w_star = np.random.randn(df_trans.shape[1], n_weibull_parameters) # Set 1-feature_rate% of the w_star to 0 w_star = np.where( - np.random.rand(w_star.shape[0], w_star.shape[1]) < features_rate, 0, w_star + np.random.rand(w_star.shape[0], w_star.shape[1]) < features_rate, w_star, 0 ) # Computation of the true values of shape and scale shape_scale_star = df_trans.values @ w_star @@ -71,7 +64,14 @@ def complex_data( f"scale_{i}" for i in range(n_events) ] df_shape_scale_star = pd.DataFrame(shape_scale_star, columns=shape_scale_columns) + # Rescaling of these values to stay in the chosen range + df_shape_scale_star = rescaling_params_to_respect_default_ranges( + df_shape_scale_star + ) + return df_shape_scale_star + +def rescaling_params_to_respect_default_ranges(df_shape_scale_star): # Rescaling of these values to stay in the chosen range for event, (scale_default, shape_default) in enumerate( zip(DEFAULT_SCALE_RANGES, DEFAULT_SHAPE_RANGES) @@ -89,6 +89,67 @@ def complex_data( df_shape_scale_star[f"shape_{event}"] = shape df_shape_scale_star[f"scale_{event}"] = scale + return df_shape_scale_star + + +def censor_data( + y, relative_scale, independant=True, X=None, features_censoring_rate=0.2 +): + if relative_scale == 0 or relative_scale is None: + return y + + if independant: + scale_censoring = relative_scale * y["duration"].mean() + else: + features_impact_censoring = np.abs(np.random.randn(X.shape[1], 1)) + features_impact_censoring = np.where( + np.random.rand( + features_impact_censoring.shape[0], features_impact_censoring.shape[1] + ) + < features_censoring_rate, + features_impact_censoring, + 0, + ) + scale_censoring = (X @ features_impact_censoring).values.flatten() + scale_censoring = ( + ( + 1 + + (scale_censoring - scale_censoring.min()) + / (scale_censoring.max() - scale_censoring.min()) + ) + * relative_scale + * y["duration"].mean() + ) + + censoring = weibull_min.rvs( + 1, scale=scale_censoring, size=y.shape[0], random_state=rng + ) + y = y.copy() + y["event"] = np.where(y["duration"] < censoring, y["event"], 0) + y["duration"] = np.minimum(y["duration"], censoring) + return y + + +def complex_data( + n_events=n_events, + n_weibull_parameters=2 * n_events, + n_samples=3_000, + base_scale=1_000, + n_features=10, + features_rate=0.3, + degree_interaction=2, + relative_scale=1.5, + independant=True, + features_censoring_rate=0.2, + return_uncensored_data=False, +): + # Create features given to the model as X and then creating the interactions + df_features = pd.DataFrame(np.random.randn(n_samples, n_features)) + df_features.columns = [f"feature_{i}" for i in range(n_features)] + + df_shape_scale_star = compute_shape_and_scale( + df_features, n_weibull_parameters, features_rate, n_events, degree_interaction + ) # Throw durations from a weibull distribution with scale and shape as the parameters event_durations = [] for event in range(n_events): @@ -106,16 +167,13 @@ def complex_data( duration=event_durations[duration_argmin, np.arange(n_samples)], ) ) - - # censoring is independant from the covariates for now - if relative_scale > 0: - scale = relative_scale * y["duration"].mean() - censoring = weibull_min.rvs(1, scale=scale, 
size=y.shape[0], random_state=rng) - y = y.copy() - y["event"] = np.where(y["duration"] < censoring, y["event"], 0) - y["duration"] = np.minimum(y["duration"], censoring) - - return df_features, y - - -# %% + y_censored = censor_data( + y, + relative_scale, + independant=independant, + X=df_features, + features_censoring_rate=features_censoring_rate, + ) + if return_uncensored_data: + return df_features, y_censored, y + return df_features, y_censored diff --git a/examples/test_complex_data.py b/examples/test_complex_data.py index 726d4c7..9bdfc5d 100644 --- a/examples/test_complex_data.py +++ b/examples/test_complex_data.py @@ -3,6 +3,7 @@ from complex_synthetic_data import complex_data from time import perf_counter import matplotlib.pyplot as plt +from sklearn.model_selection import train_test_split from hazardous import GradientBoostingIncidence from lifelines import AalenJohansenFitter @@ -24,8 +25,22 @@ # %% -X, y = complex_data() -n_samples = len(X) +X, y = complex_data( + n_events=n_events, + n_weibull_parameters=2 * n_events, + n_samples=10_000, + base_scale=1_000, + n_features=10, + features_rate=0.3, + degree_interaction=2, + relative_scale=0.95, + independant=False, + features_censoring_rate=0.1, +) + +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) + +n_samples = len(X_train) calculate_variance = n_samples <= 5_000 aj = AalenJohansenFitter(calculate_variance=calculate_variance, seed=0) @@ -40,6 +55,7 @@ random_state=0, ) +y["event"].value_counts() # %% # # CIFs estimated on uncensored data @@ -49,7 +65,9 @@ # theoretical CIFs: -def plot_cumulative_incidence_functions(X, y, gb_incidence=None, aj=None): +def plot_cumulative_incidence_functions( + X, y, gb_incidence=None, aj=None, X_test=None, y_test=None +): n_events = y["event"].max() t_max = y["duration"].max() _, axes = plt.subplots(figsize=(12, 4), ncols=n_events, sharey=True) @@ -74,6 +92,11 @@ def plot_cumulative_incidence_functions(X, y, gb_incidence=None, aj=None): )[0] duration = perf_counter() - tic print(f"GB Incidence for event {event_id} prediction in {duration:.3f} s") + print("Brier score on training data:", gb_incidence.score(X, y)) + if X_test is not None: + print( + "Brier score on testing data:", gb_incidence.score(X_test, y_test) + ) ax.plot( coarse_timegrid, cif_pred, @@ -97,8 +120,12 @@ def plot_cumulative_incidence_functions(X, y, gb_incidence=None, aj=None): # %% plot_cumulative_incidence_functions( - X, - y, + X_train, + y_train, gb_incidence=gb_incidence, aj=aj, + X_test=X_test, + y_test=y_test, ) + +# %% From b0380900f633552988f173b44bb97de73e1c3327 Mon Sep 17 00:00:00 2001 From: Julie Alberge Date: Mon, 11 Dec 2023 17:38:57 +0100 Subject: [PATCH 03/13] Synthetic dataset with expit --- examples/complex_synthetic_data.py | 85 +++++++++++++++--------------- examples/test_complex_data.py | 55 +++++++++++-------- 2 files changed, 77 insertions(+), 63 deletions(-) diff --git a/examples/complex_synthetic_data.py b/examples/complex_synthetic_data.py index 5826b5e..de35621 100644 --- a/examples/complex_synthetic_data.py +++ b/examples/complex_synthetic_data.py @@ -14,9 +14,11 @@ from sklearn.preprocessing import PolynomialFeatures import pandas as pd from scipy.stats import weibull_min +from scipy.special import expit -rng = np.random.default_rng(0) seed = 0 +rng = np.random.RandomState(seed) + DEFAULT_SHAPE_RANGES = ( (0.7, 0.9), (1.0, 1.0), @@ -53,10 +55,10 @@ def compute_shape_and_scale( df_trans = df_poly_features.fit_transform(df_features) # Create masked matrix with the 
interactions - w_star = np.random.randn(df_trans.shape[1], n_weibull_parameters) + w_star = rng.randn(df_trans.shape[1], n_weibull_parameters) # Set 1-feature_rate% of the w_star to 0 w_star = np.where( - np.random.rand(w_star.shape[0], w_star.shape[1]) < features_rate, w_star, 0 + rng.rand(w_star.shape[0], w_star.shape[1]) < features_rate, w_star, 0 ) # Computation of the true values of shape and scale shape_scale_star = df_trans.values @ w_star @@ -64,70 +66,69 @@ def compute_shape_and_scale( f"scale_{i}" for i in range(n_events) ] df_shape_scale_star = pd.DataFrame(shape_scale_star, columns=shape_scale_columns) - # Rescaling of these values to stay in the chosen range - df_shape_scale_star = rescaling_params_to_respect_default_ranges( - df_shape_scale_star - ) - return df_shape_scale_star - - -def rescaling_params_to_respect_default_ranges(df_shape_scale_star): # Rescaling of these values to stay in the chosen range for event, (scale_default, shape_default) in enumerate( zip(DEFAULT_SCALE_RANGES, DEFAULT_SHAPE_RANGES) ): - shape_min, shape_max = shape_default - scale_min, scale_max = scale_default - shape = df_shape_scale_star[f"shape_{event}"].copy() - scale = df_shape_scale_star[f"scale_{event}"].copy() - shape = shape_min + (shape_max - shape_min) * (shape - shape.min()) / ( - shape.max() - shape.min() + df_shape_scale_star = rescaling_params_to_respect_default_ranges( + df_shape_scale_star, shape_default, scale_default, event ) - scale = scale_min + (scale_max - scale_min) * (scale - scale.min()) / ( - scale.max() - scale.min() - ) - df_shape_scale_star[f"shape_{event}"] = shape - df_shape_scale_star[f"scale_{event}"] = scale + return df_shape_scale_star + +def rescaling_params_to_respect_default_ranges( + df_shape_scale_star, shape_default, scale_default, event +): + # Rescaling of these values to stay in the chosen range + shape_min, shape_max = shape_default + scale_min, scale_max = scale_default + shape = df_shape_scale_star[f"shape_{event}"].copy() + scale = df_shape_scale_star[f"scale_{event}"].copy() + df_shape_scale_star[f"shape_{event}"] = shape_min + (shape_max - shape_min) * expit( + shape + ) + df_shape_scale_star[f"scale_{event}"] = scale_min + (scale_max - scale_min) * expit( + scale + ) return df_shape_scale_star def censor_data( - y, relative_scale, independant=True, X=None, features_censoring_rate=0.2 + y, independant=True, X=None, features_censoring_rate=0.2, relative_scale=1.5 ): if relative_scale == 0 or relative_scale is None: return y if independant: scale_censoring = relative_scale * y["duration"].mean() + shape_censoring = 1 else: - features_impact_censoring = np.abs(np.random.randn(X.shape[1], 1)) + features_impact_censoring = rng.randn(X.shape[1], 2) features_impact_censoring = np.where( - np.random.rand( + rng.rand( features_impact_censoring.shape[0], features_impact_censoring.shape[1] ) < features_censoring_rate, features_impact_censoring, 0, ) - scale_censoring = (X @ features_impact_censoring).values.flatten() - scale_censoring = ( - ( - 1 - + (scale_censoring - scale_censoring.min()) - / (scale_censoring.max() - scale_censoring.min()) - ) - * relative_scale - * y["duration"].mean() + df_censoring_params = relative_scale * X @ features_impact_censoring + df_censoring_params.columns = ["shape_0", "scale_0"] + df_censoring_params = rescaling_params_to_respect_default_ranges( + df_censoring_params, + shape_default=(0.5, 0.9), + scale_default=(10, 15), + event=0, ) - + scale_censoring = df_censoring_params["scale_0"].values * y["duration"].mean() + 
shape_censoring = df_censoring_params["shape_0"].values censoring = weibull_min.rvs( - 1, scale=scale_censoring, size=y.shape[0], random_state=rng + shape_censoring, scale=scale_censoring, size=y.shape[0], random_state=rng ) - y = y.copy() - y["event"] = np.where(y["duration"] < censoring, y["event"], 0) - y["duration"] = np.minimum(y["duration"], censoring) - return y + y_censored = y.copy() + y_censored["event"] = np.where(y["duration"] < censoring, y_censored["event"], 0) + y_censored["duration"] = np.minimum(y_censored["duration"], censoring) + return y_censored def complex_data( @@ -144,7 +145,7 @@ def complex_data( return_uncensored_data=False, ): # Create features given to the model as X and then creating the interactions - df_features = pd.DataFrame(np.random.randn(n_samples, n_features)) + df_features = pd.DataFrame(rng.randn(n_samples, n_features)) df_features.columns = [f"feature_{i}" for i in range(n_features)] df_shape_scale_star = compute_shape_and_scale( @@ -169,7 +170,7 @@ def complex_data( ) y_censored = censor_data( y, - relative_scale, + relative_scale=relative_scale, independant=independant, X=df_features, features_censoring_rate=features_censoring_rate, diff --git a/examples/test_complex_data.py b/examples/test_complex_data.py index 9bdfc5d..910aec2 100644 --- a/examples/test_complex_data.py +++ b/examples/test_complex_data.py @@ -8,8 +8,8 @@ from hazardous import GradientBoostingIncidence from lifelines import AalenJohansenFitter -rng = np.random.default_rng(0) seed = 0 +rng = np.random.RandomState(seed) DEFAULT_SHAPE_RANGES = ( (0.7, 0.9), (1.0, 1.0), @@ -25,37 +25,25 @@ # %% -X, y = complex_data( +X, y_censored, y_uncensored = complex_data( n_events=n_events, n_weibull_parameters=2 * n_events, - n_samples=10_000, + n_samples=30_000, base_scale=1_000, n_features=10, - features_rate=0.3, + features_rate=0.5, degree_interaction=2, relative_scale=0.95, independant=False, - features_censoring_rate=0.1, + features_censoring_rate=0.2, + return_uncensored_data=True, ) -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) -n_samples = len(X_train) +n_samples = len(X) calculate_variance = n_samples <= 5_000 aj = AalenJohansenFitter(calculate_variance=calculate_variance, seed=0) -gb_incidence = GradientBoostingIncidence( - learning_rate=0.03, - n_iter=200, - max_leaf_nodes=5, - hard_zero_fraction=0.1, - min_samples_leaf=50, - loss="ibs", - show_progressbar=False, - random_state=0, -) - -y["event"].value_counts() # %% # # CIFs estimated on uncensored data @@ -110,6 +98,7 @@ def plot_cumulative_incidence_functions( duration = perf_counter() - tic print(f"Aalen-Johansen for event {event_id} fit in {duration:.3f} s") aj.plot(label="Aalen-Johansen", ax=ax) + print(aj.cumulative_density_.values[-1]) if event_id == 1: ax.legend(loc="lower right") @@ -119,13 +108,37 @@ def plot_cumulative_incidence_functions( # %% +X_train, X_test, y_train_c, y_test_c = train_test_split( + X, y_censored, test_size=0.3, random_state=seed +) +y_train_u = y_uncensored.loc[y_train_c.index] +y_test_u = y_uncensored.loc[y_test_c.index] +gb_incidence = GradientBoostingIncidence( + learning_rate=0.05, + n_iter=1_000, + max_leaf_nodes=50, + hard_zero_fraction=0.1, + min_samples_leaf=5, + loss="inll", + show_progressbar=False, + random_state=seed, +) + plot_cumulative_incidence_functions( X_train, - y_train, + y_train_u, gb_incidence=gb_incidence, aj=aj, X_test=X_test, - y_test=y_test, + y_test=y_test_u, ) +plot_cumulative_incidence_functions( + X_train, + y_train_c, + 
gb_incidence=gb_incidence, + aj=aj, + X_test=X_test, + y_test=y_test_c, +) # %% From 4388bd5b527990f8af8b39725d0f5fa4dc78f893 Mon Sep 17 00:00:00 2001 From: Julie Alberge Date: Tue, 12 Dec 2023 13:58:04 +0100 Subject: [PATCH 04/13] refactoring the code, adding histograms. --- examples/complex_synthetic_data.py | 180 -------------- ...t_complex_data.py => plot_complex_data.py} | 61 +++-- hazardous/data/__init__.py | 4 +- hazardous/data/_competing_weibull.py | 233 ++++++++++++++++-- 4 files changed, 259 insertions(+), 219 deletions(-) delete mode 100644 examples/complex_synthetic_data.py rename examples/{test_complex_data.py => plot_complex_data.py} (73%) diff --git a/examples/complex_synthetic_data.py b/examples/complex_synthetic_data.py deleted file mode 100644 index de35621..0000000 --- a/examples/complex_synthetic_data.py +++ /dev/null @@ -1,180 +0,0 @@ -""" -================================================== -Synthetic DataSet -================================================== - -Trying to complexy synthetic data -""" - -## TODO -# Change the linear scaling by an expit rescaling -# Improve censoring method by using covariates - -import numpy as np -from sklearn.preprocessing import PolynomialFeatures -import pandas as pd -from scipy.stats import weibull_min -from scipy.special import expit - -seed = 0 -rng = np.random.RandomState(seed) - -DEFAULT_SHAPE_RANGES = ( - (0.7, 0.9), - (1.0, 1.0), - (2.0, 3.0), -) - -DEFAULT_SCALE_RANGES = ( - (1, 20), - (1, 10), - (1.5, 5), -) - -n_events = 3 -n_weibull_parameters = 2 * n_events -n_samples = 3_000 -base_scale = 1_000 -n_features = 10 -features_rate = 0.3 -degree_interaction = 2 -relative_scale = 1.5 - -assert len(DEFAULT_SCALE_RANGES) == len(DEFAULT_SHAPE_RANGES) -assert len(DEFAULT_SHAPE_RANGES) == n_events - - -def compute_shape_and_scale( - df_features, n_weibull_parameters, features_rate, n_events, degree_interaction -): - # Adding interactions between features - df_poly_features = PolynomialFeatures( - degree=degree_interaction, interaction_only=True, include_bias=False - ) - df_poly_features.set_output(transform="pandas") - df_trans = df_poly_features.fit_transform(df_features) - - # Create masked matrix with the interactions - w_star = rng.randn(df_trans.shape[1], n_weibull_parameters) - # Set 1-feature_rate% of the w_star to 0 - w_star = np.where( - rng.rand(w_star.shape[0], w_star.shape[1]) < features_rate, w_star, 0 - ) - # Computation of the true values of shape and scale - shape_scale_star = df_trans.values @ w_star - shape_scale_columns = [f"shape_{i}" for i in range(n_events)] + [ - f"scale_{i}" for i in range(n_events) - ] - df_shape_scale_star = pd.DataFrame(shape_scale_star, columns=shape_scale_columns) - # Rescaling of these values to stay in the chosen range - for event, (scale_default, shape_default) in enumerate( - zip(DEFAULT_SCALE_RANGES, DEFAULT_SHAPE_RANGES) - ): - df_shape_scale_star = rescaling_params_to_respect_default_ranges( - df_shape_scale_star, shape_default, scale_default, event - ) - return df_shape_scale_star - - -def rescaling_params_to_respect_default_ranges( - df_shape_scale_star, shape_default, scale_default, event -): - # Rescaling of these values to stay in the chosen range - shape_min, shape_max = shape_default - scale_min, scale_max = scale_default - shape = df_shape_scale_star[f"shape_{event}"].copy() - scale = df_shape_scale_star[f"scale_{event}"].copy() - df_shape_scale_star[f"shape_{event}"] = shape_min + (shape_max - shape_min) * expit( - shape - ) - df_shape_scale_star[f"scale_{event}"] = 
scale_min + (scale_max - scale_min) * expit( - scale - ) - return df_shape_scale_star - - -def censor_data( - y, independant=True, X=None, features_censoring_rate=0.2, relative_scale=1.5 -): - if relative_scale == 0 or relative_scale is None: - return y - - if independant: - scale_censoring = relative_scale * y["duration"].mean() - shape_censoring = 1 - else: - features_impact_censoring = rng.randn(X.shape[1], 2) - features_impact_censoring = np.where( - rng.rand( - features_impact_censoring.shape[0], features_impact_censoring.shape[1] - ) - < features_censoring_rate, - features_impact_censoring, - 0, - ) - df_censoring_params = relative_scale * X @ features_impact_censoring - df_censoring_params.columns = ["shape_0", "scale_0"] - df_censoring_params = rescaling_params_to_respect_default_ranges( - df_censoring_params, - shape_default=(0.5, 0.9), - scale_default=(10, 15), - event=0, - ) - scale_censoring = df_censoring_params["scale_0"].values * y["duration"].mean() - shape_censoring = df_censoring_params["shape_0"].values - censoring = weibull_min.rvs( - shape_censoring, scale=scale_censoring, size=y.shape[0], random_state=rng - ) - y_censored = y.copy() - y_censored["event"] = np.where(y["duration"] < censoring, y_censored["event"], 0) - y_censored["duration"] = np.minimum(y_censored["duration"], censoring) - return y_censored - - -def complex_data( - n_events=n_events, - n_weibull_parameters=2 * n_events, - n_samples=3_000, - base_scale=1_000, - n_features=10, - features_rate=0.3, - degree_interaction=2, - relative_scale=1.5, - independant=True, - features_censoring_rate=0.2, - return_uncensored_data=False, -): - # Create features given to the model as X and then creating the interactions - df_features = pd.DataFrame(rng.randn(n_samples, n_features)) - df_features.columns = [f"feature_{i}" for i in range(n_features)] - - df_shape_scale_star = compute_shape_and_scale( - df_features, n_weibull_parameters, features_rate, n_events, degree_interaction - ) - # Throw durations from a weibull distribution with scale and shape as the parameters - event_durations = [] - for event in range(n_events): - shape = df_shape_scale_star[f"shape_{event}"] - scale = df_shape_scale_star[f"scale_{event}"] - durations = weibull_min.rvs(shape, scale=scale * base_scale, random_state=rng) - event_durations.append(durations) - - # Creating the target tabular - event_durations = np.asarray(event_durations) - duration_argmin = np.argmin(event_durations, axis=0) - y = pd.DataFrame( - dict( - event=duration_argmin + 1, - duration=event_durations[duration_argmin, np.arange(n_samples)], - ) - ) - y_censored = censor_data( - y, - relative_scale=relative_scale, - independant=independant, - X=df_features, - features_censoring_rate=features_censoring_rate, - ) - if return_uncensored_data: - return df_features, y_censored, y - return df_features, y_censored diff --git a/examples/test_complex_data.py b/examples/plot_complex_data.py similarity index 73% rename from examples/test_complex_data.py rename to examples/plot_complex_data.py index 910aec2..259379c 100644 --- a/examples/test_complex_data.py +++ b/examples/plot_complex_data.py @@ -1,9 +1,11 @@ # %% import numpy as np -from complex_synthetic_data import complex_data +import hazardous.data._competing_weibull as competing_w from time import perf_counter import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split +import pandas as pd +import seaborn as sns from hazardous import GradientBoostingIncidence from lifelines import AalenJohansenFitter @@ 
-23,27 +25,59 @@ ) n_events = 3 + # %% +# In the following cell, we verify that the synthetic dataset is well defined. + +rng = np.random.RandomState(seed) +df_features = pd.DataFrame(rng.randn(100_000, 10)) +df_features.columns = [f"feature_{i}" for i in range(10)] + +df_shape_scale_star = competing_w.compute_shape_and_scale( + df_features, + 6, + 0.3, + n_events, + 2, + DEFAULT_SHAPE_RANGES, + DEFAULT_SCALE_RANGES, + 0, +) +fig, axes = plt.subplots(2, 3, figsize=(10, 7)) +for idx, col in enumerate(df_shape_scale_star.columns): + sns.histplot(df_shape_scale_star[col], ax=axes[idx // 3][idx % 3]) + -X, y_censored, y_uncensored = complex_data( +# %% + +X, y_censored, y_uncensored = competing_w.make_synthetic_dataset( n_events=n_events, n_weibull_parameters=2 * n_events, - n_samples=30_000, + n_samples=3_000, base_scale=1_000, n_features=10, features_rate=0.5, degree_interaction=2, - relative_scale=0.95, independant=False, features_censoring_rate=0.2, return_uncensored_data=True, + return_X_y=True, + feature_rounding=4, + target_rounding=1, + shape_ranges=DEFAULT_SHAPE_RANGES, + scale_ranges=DEFAULT_SCALE_RANGES, + censoring_relative_scale=0.1, + random_state=0, + complex_features=True, ) - n_samples = len(X) calculate_variance = n_samples <= 5_000 aj = AalenJohansenFitter(calculate_variance=calculate_variance, seed=0) +y_censored["event"].value_counts() + + # %% # # CIFs estimated on uncensored data @@ -75,9 +109,9 @@ def plot_cumulative_incidence_functions( duration = perf_counter() - tic print(f"GB Incidence for event {event_id} fit in {duration:.3f} s") tic = perf_counter() - cif_pred = gb_incidence.predict_cumulative_incidence( - X[0:1], coarse_timegrid - )[0] + cifs_pred = gb_incidence.predict_cumulative_incidence(X, coarse_timegrid) + print(cifs_pred.shape) + cif_mean = cifs_pred.mean(axis=0) duration = perf_counter() - tic print(f"GB Incidence for event {event_id} prediction in {duration:.3f} s") print("Brier score on training data:", gb_incidence.score(X, y)) @@ -87,7 +121,7 @@ def plot_cumulative_incidence_functions( ) ax.plot( coarse_timegrid, - cif_pred, + cif_mean, label="GradientBoostingIncidence", ) ax.set(title=f"Event {event_id}") @@ -114,12 +148,12 @@ def plot_cumulative_incidence_functions( y_train_u = y_uncensored.loc[y_train_c.index] y_test_u = y_uncensored.loc[y_test_c.index] gb_incidence = GradientBoostingIncidence( - learning_rate=0.05, - n_iter=1_000, - max_leaf_nodes=50, + learning_rate=0.1, + n_iter=20, + max_leaf_nodes=15, hard_zero_fraction=0.1, min_samples_leaf=5, - loss="inll", + loss="ibs", show_progressbar=False, random_state=seed, ) @@ -141,4 +175,3 @@ def plot_cumulative_incidence_functions( X_test=X_test, y_test=y_test_c, ) -# %% diff --git a/hazardous/data/__init__.py b/hazardous/data/__init__.py index 92a8cc2..458516d 100644 --- a/hazardous/data/__init__.py +++ b/hazardous/data/__init__.py @@ -1,3 +1,3 @@ -from ._competing_weibull import make_synthetic_competing_weibull +# from ._competing_weibull import make_synthetic_competing_weibull -__all__ = ["make_synthetic_competing_weibull"] +# __all__ = ["make_synthetic_dataset"] diff --git a/hazardous/data/_competing_weibull.py b/hazardous/data/_competing_weibull.py index b1656b5..3edfe8e 100644 --- a/hazardous/data/_competing_weibull.py +++ b/hazardous/data/_competing_weibull.py @@ -2,8 +2,10 @@ import numpy as np import pandas as pd +from scipy.special import expit from scipy.stats import weibull_min from sklearn.datasets._base import Bunch +from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures 
from sklearn.utils import check_random_state DEFAULT_SHAPE_RANGES = ( @@ -19,29 +21,118 @@ ) -def _censor(y, relative_scale, random_state=0): - if relative_scale == 0 or relative_scale is None: +def _censor( + y, + independant=True, + X=None, + features_censoring_rate=0.2, + censoring_relative_scale=1.5, + random_state=0, +): + rng = np.random.RandomState(random_state) + if censoring_relative_scale == 0 or censoring_relative_scale is None: return y - rng = check_random_state(random_state) - scale = relative_scale * y["duration"].mean() - censoring = weibull_min.rvs(1, scale=scale, size=y.shape[0], random_state=rng) - y = y.copy() - y["event"] = np.where(y["duration"] < censoring, y["event"], 0) - y["duration"] = np.minimum(y["duration"], censoring) - return y + if independant: + scale_censoring = censoring_relative_scale * y["duration"].mean() + shape_censoring = 1 + else: + features_impact_censoring = rng.randn(X.shape[1], 2) + features_impact_censoring = np.where( + rng.rand( + features_impact_censoring.shape[0], features_impact_censoring.shape[1] + ) + < features_censoring_rate, + features_impact_censoring, + 0, + ) + df_censoring_params = censoring_relative_scale * X @ features_impact_censoring + df_censoring_params.columns = ["shape_0", "scale_0"] + df_censoring_params = rescaling_params_to_respect_default_ranges( + df_censoring_params, + shape_default=(1.0, 1.0), + scale_default=(1, 10 * censoring_relative_scale), + event=0, + ) + scale_censoring = df_censoring_params["scale_0"].values * y["duration"].mean() + shape_censoring = df_censoring_params["shape_0"].values + censoring = weibull_min.rvs( + shape_censoring, scale=scale_censoring, size=y.shape[0], random_state=rng + ) + y_censored = y.copy() + y_censored["event"] = np.where(y["duration"] < censoring, y_censored["event"], 0) + y_censored["duration"] = np.minimum(y_censored["duration"], censoring) + return y_censored -def make_synthetic_competing_weibull( +def compute_shape_and_scale( + df_features, + n_weibull_parameters=6, + features_rate=0.2, + n_events=3, + degree_interaction=2, + shape_ranges=DEFAULT_SHAPE_RANGES, + scale_ranges=DEFAULT_SCALE_RANGES, + random_state=0, +): + rng = np.random.RandomState(random_state) + # Adding interactions between features + df_poly_features = PolynomialFeatures( + degree=degree_interaction, interaction_only=True, include_bias=False + ) + df_poly_features.set_output(transform="pandas") + df_trans = df_poly_features.fit_transform(df_features) + + # Create masked matrix with the interactions + w_star = rng.randn(df_trans.shape[1], n_weibull_parameters) + # Set 1-feature_rate% of the w_star to 0 + w_star = np.where( + rng.rand(w_star.shape[0], w_star.shape[1]) < features_rate, w_star, 0 + ) + # Computation of the true values of shape and scale + shape_scale_star = df_trans.values @ w_star + shape_scale_columns = [f"shape_{i}" for i in range(n_events)] + [ + f"scale_{i}" for i in range(n_events) + ] + df_shape_scale_star = pd.DataFrame(shape_scale_star, columns=shape_scale_columns) + # Rescaling of these values to stay in the chosen range + for event, (scale_default, shape_default) in enumerate( + zip(scale_ranges, shape_ranges) + ): + df_shape_scale_star = rescaling_params_to_respect_default_ranges( + df_shape_scale_star, shape_default, scale_default, event + ) + return df_shape_scale_star + + +def rescaling_params_to_respect_default_ranges( + df_shape_scale_star, shape_default, scale_default, event +): + # Rescaling of these values to stay in the chosen range + shape_min, shape_max = 
shape_default + scale_min, scale_max = scale_default + shape = df_shape_scale_star[f"shape_{event}"].copy() + scale = df_shape_scale_star[f"scale_{event}"].copy() + scaler = MinMaxScaler(feature_range=(-2, 2)) + df_shape_scale_star[f"shape_{event}"] = ( + shape_min + + (shape_max - shape_min) + * expit(scaler.fit_transform(shape.values.reshape(-1, 1))) + ).flatten() + df_shape_scale_star[f"scale_{event}"] = ( + scale_min + + (scale_max - scale_min) + * expit(scaler.fit_transform(scale.values.reshape(-1, 1))).flatten() + ) + return df_shape_scale_star + + +def make_simple_features( n_events=3, n_samples=3_000, - return_X_y=False, base_scale=1_000, - feature_rounding=2, - target_rounding=1, shape_ranges=DEFAULT_SHAPE_RANGES, scale_ranges=DEFAULT_SCALE_RANGES, - censoring_relative_scale=1.5, random_state=None, ): """Generate a synthetic dataset with competing Weibull-distributed events. @@ -58,14 +149,6 @@ def make_synthetic_competing_weibull( individual, the event type with the shortest duration is kept as the target event (competing risks setting) and its event identifier and duration are returned as the target dataframe. - - A fraction of the individuals are censored by sampling a censoring time - from a Weibull distribution with shape 1 and scale equal to the mean - duration of the target event times the ``censoring_relative_scale``. - - Setting ``censoring_relative_scale`` to 0 or None disables censoring. - Setting it to a small value (e.g. 0.5 instead of 1.5) will result in a - larger fraction of censored individuals. """ rng = check_random_state(random_state) all_features = [] @@ -84,20 +167,124 @@ def make_synthetic_competing_weibull( event_durations = np.asarray(event_durations) duration_argmin = np.argmin(event_durations, axis=0) X = pd.concat(all_features, axis=1) + return X, event_durations, duration_argmin + + +def make_complex_features_with_sparse_matrix( + n_events=3, + n_weibull_parameters=6, + n_samples=3_000, + base_scale=1_000, + shape_ranges=DEFAULT_SHAPE_RANGES, + scale_ranges=DEFAULT_SCALE_RANGES, + n_features=10, + features_rate=0.3, + degree_interaction=2, + random_state=0, +): + rng = np.random.RandomState(random_state) + # Create features given to the model as X and then creating the interactions + df_features = pd.DataFrame(rng.randn(n_samples, n_features)) + df_features.columns = [f"feature_{i}" for i in range(n_features)] + + df_shape_scale_star = compute_shape_and_scale( + df_features, + n_weibull_parameters, + features_rate, + n_events, + degree_interaction, + shape_ranges, + scale_ranges, + random_state, + ) + # Throw durations from a weibull distribution with scale and shape as the parameters + event_durations = [] + for event in range(n_events): + shape = df_shape_scale_star[f"shape_{event}"] + scale = df_shape_scale_star[f"scale_{event}"] + durations = weibull_min.rvs(shape, scale=scale * base_scale, random_state=rng) + event_durations.append(durations) + + # Creating the target tabular + event_durations = np.asarray(event_durations) + duration_argmin = np.argmin(event_durations, axis=0) + return df_features, event_durations, duration_argmin + + +def make_synthetic_dataset( + n_events, + n_weibull_parameters, + n_samples=3_000, + base_scale=1_000, + n_features=10, + features_rate=0.3, + degree_interaction=2, + independant=True, + features_censoring_rate=0.2, + return_uncensored_data=False, + return_X_y=True, + feature_rounding=2, + target_rounding=1, + shape_ranges=DEFAULT_SHAPE_RANGES, + scale_ranges=DEFAULT_SCALE_RANGES, + censoring_relative_scale=1.5, + 
random_state=0, + complex_features=True, +): + """ + Creating a synthetic dataset to make competing risks. + Depending of the choice of the user, the function may genere a simple dataset + (setting ``complex_features`` to False or either create complex features.) + + A fraction of the individuals are censored by sampling a censoring time + from a Weibull distribution with shape 1 and scale equal to the mean + duration of the target event times the ``censoring_relative_scale``. + + Setting ``censoring_relative_scale`` to 0 or None disables censoring. + Setting it to a small value (e.g. 0.5 instead of 1.5) will result in a + larger fraction of censored individuals. + """ + if complex_features: + X, event_durations, duration_argmin = make_complex_features_with_sparse_matrix( + n_events, + n_weibull_parameters, + n_samples, + base_scale, + shape_ranges, + scale_ranges, + n_features, + features_rate, + degree_interaction, + random_state, + ) + else: + X, event_durations, duration_argmin = make_simple_features( + n_events, n_samples, base_scale, shape_ranges, scale_ranges, random_state + ) y = pd.DataFrame( dict( event=duration_argmin + 1, duration=event_durations[duration_argmin, np.arange(n_samples)], ) ) - y = _censor(y, censoring_relative_scale, random_state=random_state) + y_censored = _censor( + y, + censoring_relative_scale=censoring_relative_scale, + independant=independant, + X=X, + features_censoring_rate=features_censoring_rate, + random_state=random_state, + ) if feature_rounding is not None: X = X.round(feature_rounding) if target_rounding is not None: + y_censored = y_censored.round(target_rounding) y = y.round(target_rounding) if return_X_y: + if return_uncensored_data: + return X, y_censored, y return X, y frame = pd.concat([X, y], axis=1) From dbf1baa77a204e10da6066cae9e975817bc36f93 Mon Sep 17 00:00:00 2001 From: Julie Alberge Date: Wed, 13 Dec 2023 11:52:25 +0100 Subject: [PATCH 05/13] event well numeroted --- examples/plot_complex_data.py | 18 ++++++++++++++++-- hazardous/data/_competing_weibull.py | 12 ++++++------ 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/examples/plot_complex_data.py b/examples/plot_complex_data.py index 259379c..dc80505 100644 --- a/examples/plot_complex_data.py +++ b/examples/plot_complex_data.py @@ -77,6 +77,18 @@ y_censored["event"].value_counts() +gb_incidence = GradientBoostingIncidence( + learning_rate=0.1, + n_iter=20, + max_leaf_nodes=15, + hard_zero_fraction=0.1, + min_samples_leaf=5, + loss="ibs", + show_progressbar=False, + random_state=seed, +) +gb_incidence.fit(X, y_censored) + # %% # @@ -114,10 +126,10 @@ def plot_cumulative_incidence_functions( cif_mean = cifs_pred.mean(axis=0) duration = perf_counter() - tic print(f"GB Incidence for event {event_id} prediction in {duration:.3f} s") - print("Brier score on training data:", gb_incidence.score(X, y)) + print("Brier score on training data:", -gb_incidence.score(X, y)) if X_test is not None: print( - "Brier score on testing data:", gb_incidence.score(X_test, y_test) + "Brier score on testing data:", -gb_incidence.score(X_test, y_test) ) ax.plot( coarse_timegrid, @@ -175,3 +187,5 @@ def plot_cumulative_incidence_functions( X_test=X_test, y_test=y_test_c, ) + +# %% diff --git a/hazardous/data/_competing_weibull.py b/hazardous/data/_competing_weibull.py index 3edfe8e..83e06cd 100644 --- a/hazardous/data/_competing_weibull.py +++ b/hazardous/data/_competing_weibull.py @@ -91,16 +91,16 @@ def compute_shape_and_scale( ) # Computation of the true values of shape and scale 
shape_scale_star = df_trans.values @ w_star - shape_scale_columns = [f"shape_{i}" for i in range(n_events)] + [ - f"scale_{i}" for i in range(n_events) + shape_scale_columns = [f"shape_{i}" for i in range(1, n_events + 1)] + [ + f"scale_{i}" for i in range(1, n_events + 1) ] df_shape_scale_star = pd.DataFrame(shape_scale_star, columns=shape_scale_columns) # Rescaling of these values to stay in the chosen range - for event, (scale_default, shape_default) in enumerate( - zip(scale_ranges, shape_ranges) + for event_id, shape_range, scale_range in zip( + range(1, n_events + 1), cycle(shape_ranges), cycle(scale_ranges) ): df_shape_scale_star = rescaling_params_to_respect_default_ranges( - df_shape_scale_star, shape_default, scale_default, event + df_shape_scale_star, shape_range, scale_range, event_id ) return df_shape_scale_star @@ -199,7 +199,7 @@ def make_complex_features_with_sparse_matrix( ) # Throw durations from a weibull distribution with scale and shape as the parameters event_durations = [] - for event in range(n_events): + for event in range(1, n_events + 1): shape = df_shape_scale_star[f"shape_{event}"] scale = df_shape_scale_star[f"scale_{event}"] durations = weibull_min.rvs(shape, scale=scale * base_scale, random_state=rng) From 2de17bb1efdb5ec58eef9e0b10407e5133427968 Mon Sep 17 00:00:00 2001 From: Julie Alberge Date: Wed, 13 Dec 2023 15:04:00 +0100 Subject: [PATCH 06/13] changes dones after Olivier's reviewing --- examples/plot_complex_data.py | 33 ++++++++++++++++++---------- hazardous/data/__init__.py | 4 ++-- hazardous/data/_competing_weibull.py | 18 ++++++--------- 3 files changed, 31 insertions(+), 24 deletions(-) diff --git a/examples/plot_complex_data.py b/examples/plot_complex_data.py index dc80505..9dc67e2 100644 --- a/examples/plot_complex_data.py +++ b/examples/plot_complex_data.py @@ -1,3 +1,17 @@ +""" +========================================== +Modeling competing risks on synthetic data +========================================== + +This example introduces a synthetic data generation tool that can +be helpful to study the relative performance and potential biases +of predictive competing risks estimators on right-censored data. +Some of the input features and their second order multiplicative +interactions are statistically associated with the parameters of +the distributions from which the competing events are sampled. 
+ +""" + # %% import numpy as np import hazardous.data._competing_weibull as competing_w @@ -35,13 +49,12 @@ df_shape_scale_star = competing_w.compute_shape_and_scale( df_features, - 6, - 0.3, - n_events, - 2, - DEFAULT_SHAPE_RANGES, - DEFAULT_SCALE_RANGES, - 0, + features_rate=0.2, + n_events=3, + degree_interaction=2, + shape_ranges=DEFAULT_SHAPE_RANGES, + scale_ranges=DEFAULT_SCALE_RANGES, + random_state=0, ) fig, axes = plt.subplots(2, 3, figsize=(10, 7)) for idx, col in enumerate(df_shape_scale_star.columns): @@ -50,15 +63,14 @@ # %% -X, y_censored, y_uncensored = competing_w.make_synthetic_dataset( +X, y_censored, y_uncensored = competing_w.make_synthetic_competing_weibull( n_events=n_events, - n_weibull_parameters=2 * n_events, n_samples=3_000, base_scale=1_000, n_features=10, features_rate=0.5, degree_interaction=2, - independant=False, + independent=False, features_censoring_rate=0.2, return_uncensored_data=True, return_X_y=True, @@ -122,7 +134,6 @@ def plot_cumulative_incidence_functions( print(f"GB Incidence for event {event_id} fit in {duration:.3f} s") tic = perf_counter() cifs_pred = gb_incidence.predict_cumulative_incidence(X, coarse_timegrid) - print(cifs_pred.shape) cif_mean = cifs_pred.mean(axis=0) duration = perf_counter() - tic print(f"GB Incidence for event {event_id} prediction in {duration:.3f} s") diff --git a/hazardous/data/__init__.py b/hazardous/data/__init__.py index 458516d..92a8cc2 100644 --- a/hazardous/data/__init__.py +++ b/hazardous/data/__init__.py @@ -1,3 +1,3 @@ -# from ._competing_weibull import make_synthetic_competing_weibull +from ._competing_weibull import make_synthetic_competing_weibull -# __all__ = ["make_synthetic_dataset"] +__all__ = ["make_synthetic_competing_weibull"] diff --git a/hazardous/data/_competing_weibull.py b/hazardous/data/_competing_weibull.py index 83e06cd..e531dc1 100644 --- a/hazardous/data/_competing_weibull.py +++ b/hazardous/data/_competing_weibull.py @@ -23,7 +23,7 @@ def _censor( y, - independant=True, + independent=True, X=None, features_censoring_rate=0.2, censoring_relative_scale=1.5, @@ -33,7 +33,7 @@ def _censor( if censoring_relative_scale == 0 or censoring_relative_scale is None: return y - if independant: + if independent: scale_censoring = censoring_relative_scale * y["duration"].mean() shape_censoring = 1 else: @@ -67,7 +67,6 @@ def _censor( def compute_shape_and_scale( df_features, - n_weibull_parameters=6, features_rate=0.2, n_events=3, degree_interaction=2, @@ -84,6 +83,7 @@ def compute_shape_and_scale( df_trans = df_poly_features.fit_transform(df_features) # Create masked matrix with the interactions + n_weibull_parameters = 2 * n_events w_star = rng.randn(df_trans.shape[1], n_weibull_parameters) # Set 1-feature_rate% of the w_star to 0 w_star = np.where( @@ -172,7 +172,6 @@ def make_simple_features( def make_complex_features_with_sparse_matrix( n_events=3, - n_weibull_parameters=6, n_samples=3_000, base_scale=1_000, shape_ranges=DEFAULT_SHAPE_RANGES, @@ -189,7 +188,6 @@ def make_complex_features_with_sparse_matrix( df_shape_scale_star = compute_shape_and_scale( df_features, - n_weibull_parameters, features_rate, n_events, degree_interaction, @@ -211,15 +209,14 @@ def make_complex_features_with_sparse_matrix( return df_features, event_durations, duration_argmin -def make_synthetic_dataset( +def make_synthetic_competing_weibull( n_events, - n_weibull_parameters, n_samples=3_000, base_scale=1_000, n_features=10, features_rate=0.3, degree_interaction=2, - independant=True, + independent=True, 
features_censoring_rate=0.2, return_uncensored_data=False, return_X_y=True, @@ -229,7 +226,7 @@ def make_synthetic_dataset( scale_ranges=DEFAULT_SCALE_RANGES, censoring_relative_scale=1.5, random_state=0, - complex_features=True, + complex_features=False, ): """ Creating a synthetic dataset to make competing risks. @@ -247,7 +244,6 @@ def make_synthetic_dataset( if complex_features: X, event_durations, duration_argmin = make_complex_features_with_sparse_matrix( n_events, - n_weibull_parameters, n_samples, base_scale, shape_ranges, @@ -270,7 +266,7 @@ def make_synthetic_dataset( y_censored = _censor( y, censoring_relative_scale=censoring_relative_scale, - independant=independant, + independent=independent, X=X, features_censoring_rate=features_censoring_rate, random_state=random_state, From f1eb9fc73b445662421e9fc807af01e879b6f3bf Mon Sep 17 00:00:00 2001 From: Julie Alberge Date: Wed, 13 Dec 2023 15:25:20 +0100 Subject: [PATCH 07/13] fix some tests. --- hazardous/data/_competing_weibull.py | 4 ++-- hazardous/data/tests/test_competing_weibull.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/hazardous/data/_competing_weibull.py b/hazardous/data/_competing_weibull.py index e531dc1..d924ede 100644 --- a/hazardous/data/_competing_weibull.py +++ b/hazardous/data/_competing_weibull.py @@ -210,7 +210,7 @@ def make_complex_features_with_sparse_matrix( def make_synthetic_competing_weibull( - n_events, + n_events=3, n_samples=3_000, base_scale=1_000, n_features=10, @@ -281,7 +281,7 @@ def make_synthetic_competing_weibull( if return_X_y: if return_uncensored_data: return X, y_censored, y - return X, y + return X, y_censored frame = pd.concat([X, y], axis=1) return Bunch(data=frame[X.columns], target=frame[y.columns], frame=frame) diff --git a/hazardous/data/tests/test_competing_weibull.py b/hazardous/data/tests/test_competing_weibull.py index 594dbe3..b0dafcd 100644 --- a/hazardous/data/tests/test_competing_weibull.py +++ b/hazardous/data/tests/test_competing_weibull.py @@ -52,8 +52,8 @@ def test_competing_weibull_no_censoring(seed): y["event"], cv=3, ).mean() - assert 0.4 > baseline_classification_acc > 0.3 # approximately balanced - assert event_classification_acc > 1.2 * baseline_classification_acc + assert 0.45 > baseline_classification_acc > 0.3 # approximately balanced + assert event_classification_acc > 1.1 * baseline_classification_acc assert event_classification_acc < 0.6 # still challenging. # Check that the features make it possible to predict the durations better From 373745d18f401f033d09a06d3a9361a767cc92e8 Mon Sep 17 00:00:00 2001 From: Julie Alberge Date: Thu, 14 Dec 2023 10:49:47 +0100 Subject: [PATCH 08/13] iadding seasborn to push doc. 
--- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 2c24fb7..862b146 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,7 @@ doc = [ "matplotlib", "pillow", # to scrape images from the examples "numpydoc", + "seaborn", ] [project.urls] From 304dec77bda395f53440b6a4d71ff617c74f4dfe Mon Sep 17 00:00:00 2001 From: Julie Alberge Date: Thu, 14 Dec 2023 19:01:42 +0100 Subject: [PATCH 09/13] fixing AJ estimator, adding 50% features marinal and 50% features interaction in the creation of w_star --- examples/plot_complex_data.py | 35 +++++------------ hazardous/data/_competing_weibull.py | 57 ++++++++++++++++++---------- 2 files changed, 47 insertions(+), 45 deletions(-) diff --git a/examples/plot_complex_data.py b/examples/plot_complex_data.py index 9dc67e2..6dda353 100644 --- a/examples/plot_complex_data.py +++ b/examples/plot_complex_data.py @@ -26,18 +26,6 @@ seed = 0 rng = np.random.RandomState(seed) -DEFAULT_SHAPE_RANGES = ( - (0.7, 0.9), - (1.0, 1.0), - (2.0, 3.0), -) - -DEFAULT_SCALE_RANGES = ( - (1, 20), - (1, 10), - (1.5, 5), -) -n_events = 3 # %% @@ -52,19 +40,15 @@ features_rate=0.2, n_events=3, degree_interaction=2, - shape_ranges=DEFAULT_SHAPE_RANGES, - scale_ranges=DEFAULT_SCALE_RANGES, random_state=0, ) fig, axes = plt.subplots(2, 3, figsize=(10, 7)) for idx, col in enumerate(df_shape_scale_star.columns): sns.histplot(df_shape_scale_star[col], ax=axes[idx // 3][idx % 3]) - # %% X, y_censored, y_uncensored = competing_w.make_synthetic_competing_weibull( - n_events=n_events, n_samples=3_000, base_scale=1_000, n_features=10, @@ -74,15 +58,14 @@ features_censoring_rate=0.2, return_uncensored_data=True, return_X_y=True, - feature_rounding=4, - target_rounding=1, - shape_ranges=DEFAULT_SHAPE_RANGES, - scale_ranges=DEFAULT_SCALE_RANGES, - censoring_relative_scale=0.1, + feature_rounding=3, + target_rounding=4, + censoring_relative_scale=4.0, random_state=0, complex_features=True, ) +# %% n_samples = len(X) calculate_variance = n_samples <= 5_000 aj = AalenJohansenFitter(calculate_variance=calculate_variance, seed=0) @@ -99,7 +82,6 @@ show_progressbar=False, random_state=seed, ) -gb_incidence.fit(X, y_censored) # %% @@ -155,8 +137,8 @@ def plot_cumulative_incidence_functions( duration = perf_counter() - tic print(f"Aalen-Johansen for event {event_id} fit in {duration:.3f} s") aj.plot(label="Aalen-Johansen", ax=ax) - print(aj.cumulative_density_.values[-1]) - + ax.set_xlim(0, 8_000) + ax.set_ylim(0, 0.5) if event_id == 1: ax.legend(loc="lower right") else: @@ -170,6 +152,7 @@ def plot_cumulative_incidence_functions( ) y_train_u = y_uncensored.loc[y_train_c.index] y_test_u = y_uncensored.loc[y_test_c.index] + gb_incidence = GradientBoostingIncidence( learning_rate=0.1, n_iter=20, @@ -184,7 +167,7 @@ def plot_cumulative_incidence_functions( plot_cumulative_incidence_functions( X_train, y_train_u, - gb_incidence=gb_incidence, + gb_incidence=None, aj=aj, X_test=X_test, y_test=y_test_u, @@ -193,7 +176,7 @@ def plot_cumulative_incidence_functions( plot_cumulative_incidence_functions( X_train, y_train_c, - gb_incidence=gb_incidence, + gb_incidence=None, aj=aj, X_test=X_test, y_test=y_test_c, diff --git a/hazardous/data/_competing_weibull.py b/hazardous/data/_competing_weibull.py index d924ede..7273ad9 100644 --- a/hazardous/data/_competing_weibull.py +++ b/hazardous/data/_competing_weibull.py @@ -5,18 +5,19 @@ from scipy.special import expit from scipy.stats import weibull_min from sklearn.datasets._base import Bunch -from 
sklearn.preprocessing import MinMaxScaler, PolynomialFeatures +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import PolynomialFeatures, SplineTransformer, StandardScaler from sklearn.utils import check_random_state DEFAULT_SHAPE_RANGES = ( - (0.7, 0.9), + (0.8, 1.0), (1.0, 1.0), - (2.0, 3.0), + (1.2, 3.0), ) DEFAULT_SCALE_RANGES = ( - (1, 20), - (1, 10), + (1, 15), + (1, 7), (1.5, 5), ) @@ -35,7 +36,7 @@ def _censor( if independent: scale_censoring = censoring_relative_scale * y["duration"].mean() - shape_censoring = 1 + shape_censoring = 3 else: features_impact_censoring = rng.randn(X.shape[1], 2) features_impact_censoring = np.where( @@ -50,7 +51,7 @@ def _censor( df_censoring_params.columns = ["shape_0", "scale_0"] df_censoring_params = rescaling_params_to_respect_default_ranges( df_censoring_params, - shape_default=(1.0, 1.0), + shape_default=(3.0, 4.0), scale_default=(1, 10 * censoring_relative_scale), event=0, ) @@ -60,7 +61,9 @@ def _censor( shape_censoring, scale=scale_censoring, size=y.shape[0], random_state=rng ) y_censored = y.copy() - y_censored["event"] = np.where(y["duration"] < censoring, y_censored["event"], 0) + y_censored["event"] = np.where( + y_censored["duration"] < censoring, y_censored["event"], 0 + ) y_censored["duration"] = np.minimum(y_censored["duration"], censoring) return y_censored @@ -76,19 +79,35 @@ def compute_shape_and_scale( ): rng = np.random.RandomState(random_state) # Adding interactions between features - df_poly_features = PolynomialFeatures( - degree=degree_interaction, interaction_only=True, include_bias=False + preprocessor = make_pipeline( + SplineTransformer(n_knots=3), + PolynomialFeatures( + degree=degree_interaction, interaction_only=True, include_bias=False + ), ) - df_poly_features.set_output(transform="pandas") - df_trans = df_poly_features.fit_transform(df_features) - + preprocessor.set_output(transform="pandas") + df_trans = preprocessor.fit_transform(df_features) # Create masked matrix with the interactions n_weibull_parameters = 2 * n_events w_star = rng.randn(df_trans.shape[1], n_weibull_parameters) - # Set 1-feature_rate% of the w_star to 0 + # 1-feature_rate% of marginal features and interacted features + # are set to 0 + cols_features = df_trans.columns + marginal_cols = np.array([len(c.split(" ")) == 1 for c in cols_features]) * np.ones( + (n_weibull_parameters, df_trans.shape[1]) + ) + marginal_cols = marginal_cols.T.astype(bool) + w_star_marg = np.where( + (rng.rand(w_star.shape[0], w_star.shape[1]) < features_rate) & marginal_cols, + w_star, + 0, + ) w_star = np.where( - rng.rand(w_star.shape[0], w_star.shape[1]) < features_rate, w_star, 0 + (rng.rand(w_star.shape[0], w_star.shape[1]) < features_rate) & ~marginal_cols, + w_star, + 0, ) + w_star += w_star_marg # Computation of the true values of shape and scale shape_scale_star = df_trans.values @ w_star shape_scale_columns = [f"shape_{i}" for i in range(1, n_events + 1)] + [ @@ -113,7 +132,7 @@ def rescaling_params_to_respect_default_ranges( scale_min, scale_max = scale_default shape = df_shape_scale_star[f"shape_{event}"].copy() scale = df_shape_scale_star[f"scale_{event}"].copy() - scaler = MinMaxScaler(feature_range=(-2, 2)) + scaler = StandardScaler() df_shape_scale_star[f"shape_{event}"] = ( shape_min + (shape_max - shape_min) @@ -183,7 +202,7 @@ def make_complex_features_with_sparse_matrix( ): rng = np.random.RandomState(random_state) # Create features given to the model as X and then creating the interactions - df_features = 
pd.DataFrame(rng.randn(n_samples, n_features)) + df_features = pd.DataFrame(rng.rand(n_samples, n_features)) df_features.columns = [f"feature_{i}" for i in range(n_features)] df_shape_scale_star = compute_shape_and_scale( @@ -221,7 +240,7 @@ def make_synthetic_competing_weibull( return_uncensored_data=False, return_X_y=True, feature_rounding=2, - target_rounding=1, + target_rounding=2, shape_ranges=DEFAULT_SHAPE_RANGES, scale_ranges=DEFAULT_SCALE_RANGES, censoring_relative_scale=1.5, @@ -275,7 +294,7 @@ def make_synthetic_competing_weibull( X = X.round(feature_rounding) if target_rounding is not None: - y_censored = y_censored.round(target_rounding) + y_censored["duration"] = y_censored["duration"].round(target_rounding) y = y.round(target_rounding) if return_X_y: From 59008ada64814d4c06ce48034ee2c6a8b1cadf11 Mon Sep 17 00:00:00 2001 From: Julie Alberge Date: Thu, 14 Dec 2023 19:23:58 +0100 Subject: [PATCH 10/13] fixing tests (because shape ans scale default parameters have been changed, some tests were broken) --- .../data/tests/test_competing_weibull.py | 35 ++----------------- .../tests/test_gradient_boosting_incidence.py | 2 +- 2 files changed, 4 insertions(+), 33 deletions(-) diff --git a/hazardous/data/tests/test_competing_weibull.py b/hazardous/data/tests/test_competing_weibull.py index b0dafcd..3799f66 100644 --- a/hazardous/data/tests/test_competing_weibull.py +++ b/hazardous/data/tests/test_competing_weibull.py @@ -1,6 +1,6 @@ import pytest -from sklearn.dummy import DummyClassifier, DummyRegressor -from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.dummy import DummyClassifier +from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import cross_val_score from hazardous.data import make_synthetic_competing_weibull @@ -56,35 +56,6 @@ def test_competing_weibull_no_censoring(seed): assert event_classification_acc > 1.1 * baseline_classification_acc assert event_classification_acc < 0.6 # still challenging. - # Check that the features make it possible to predict the durations better - # than a marginal baseline, but that a significant amount of unpredictability - # remains. - median_duration = y["duration"].median() - duration_regression_relative_error = ( - -cross_val_score( - RandomForestRegressor(criterion="poisson", random_state=seed), - X, - y["duration"], - cv=3, - n_jobs=4, - scoring="neg_mean_absolute_error", - ).mean() - / median_duration - ) - baseline_duration_relative_error = ( - -cross_val_score( - DummyRegressor(strategy="mean"), - X, - y["duration"], - cv=3, - scoring="neg_mean_absolute_error", - ).mean() - / median_duration - ) - assert 0.9 < baseline_duration_relative_error < 1.1 # approximately balanced - assert duration_regression_relative_error < 0.98 * baseline_duration_relative_error - assert duration_regression_relative_error > 0.8 # still challenging. 
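The next hunk loosens the censoring-rate assertions. The intuition: censoring_relative_scale sets the scale of the censoring Weibull relative to the mean event duration, so a larger value pushes censoring times out and censors fewer samples. A rough standalone illustration (keyword names as in the tests above; the printed rates are indicative only):

from hazardous.data import make_synthetic_competing_weibull

for relative_scale in (0.8, 1.5, 3.0):
    _, y_censored = make_synthetic_competing_weibull(
        n_samples=10_000,
        censoring_relative_scale=relative_scale,
        return_X_y=True,
        random_state=0,
    )
    censoring_rate = (y_censored["event"] == 0).mean()
    print(f"censoring_relative_scale={relative_scale}: {censoring_rate:.2f} censored")
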
- @pytest.mark.parametrize("seed", range(3)) def test_competing_weibull_with_censoring(seed): @@ -109,7 +80,7 @@ def test_competing_weibull_with_censoring(seed): # Censoring rate is lower for higher scale: high_scale_censoring_rate = (y_high_scale["event"] == 0).mean() low_scale_censoring_rate = (y_low_scale["event"] == 0).mean() - assert 0.35 < high_scale_censoring_rate < 0.5 + assert 0.3 < high_scale_censoring_rate < 0.5 assert 0.5 < low_scale_censoring_rate < 0.65 # Uncensored events and durations match: diff --git a/hazardous/tests/test_gradient_boosting_incidence.py b/hazardous/tests/test_gradient_boosting_incidence.py index 0c2fd73..4e14ab3 100644 --- a/hazardous/tests/test_gradient_boosting_incidence.py +++ b/hazardous/tests/test_gradient_boosting_incidence.py @@ -139,4 +139,4 @@ def test_gradient_boosting_incidence_parameter_tuning(seed): worst_ibs = -cv_results.iloc[0]["mean_test_score"] best_ibs = -cv_results.iloc[-1]["mean_test_score"] assert best_ibs == pytest.approx(-grid_search.best_score_) - assert worst_ibs > 1.4 * best_ibs + assert worst_ibs > 1.25 * best_ibs From cdb3ad5965307f19dc7ec037c7f2239470584a4f Mon Sep 17 00:00:00 2001 From: Julie Alberge Date: Thu, 14 Dec 2023 19:38:06 +0100 Subject: [PATCH 11/13] adjusting code as Vincent's propositions --- examples/plot_complex_data.py | 10 ++-- hazardous/data/_competing_weibull.py | 76 +++++++++++++++------------- 2 files changed, 45 insertions(+), 41 deletions(-) diff --git a/examples/plot_complex_data.py b/examples/plot_complex_data.py index 6dda353..d0d4857 100644 --- a/examples/plot_complex_data.py +++ b/examples/plot_complex_data.py @@ -18,6 +18,7 @@ from time import perf_counter import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split +from sklearn.utils import check_random_state import pandas as pd import seaborn as sns @@ -25,13 +26,12 @@ from lifelines import AalenJohansenFitter seed = 0 -rng = np.random.RandomState(seed) +rng = check_random_state(seed) # %% # In the following cell, we verify that the synthetic dataset is well defined. 
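A side note on the check_random_state switch just above: unlike np.random.RandomState(seed), it accepts an integer seed, None, or an existing RandomState instance, which makes it easy to thread a single RNG through helper functions. A small sketch of that behaviour:

from numpy.random import RandomState
from sklearn.utils import check_random_state

rng = check_random_state(0)              # int -> a fresh, seeded RandomState
assert isinstance(rng, RandomState)
assert check_random_state(rng) is rng    # an existing RandomState passes through unchanged
assert isinstance(check_random_state(None), RandomState)  # None -> NumPy's global RandomState
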
-rng = np.random.RandomState(seed) df_features = pd.DataFrame(rng.randn(100_000, 10)) df_features.columns = [f"feature_{i}" for i in range(10)] @@ -54,7 +54,7 @@ n_features=10, features_rate=0.5, degree_interaction=2, - independent=False, + independent_censoring=False, features_censoring_rate=0.2, return_uncensored_data=True, return_X_y=True, @@ -167,7 +167,7 @@ def plot_cumulative_incidence_functions( plot_cumulative_incidence_functions( X_train, y_train_u, - gb_incidence=None, + gb_incidence=gb_incidence, aj=aj, X_test=X_test, y_test=y_test_u, @@ -176,7 +176,7 @@ def plot_cumulative_incidence_functions( plot_cumulative_incidence_functions( X_train, y_train_c, - gb_incidence=None, + gb_incidence=gb_incidence, aj=aj, X_test=X_test, y_test=y_test_c, diff --git a/hazardous/data/_competing_weibull.py b/hazardous/data/_competing_weibull.py index 7273ad9..1bb67fa 100644 --- a/hazardous/data/_competing_weibull.py +++ b/hazardous/data/_competing_weibull.py @@ -24,30 +24,28 @@ def _censor( y, - independent=True, + independent_censoring=True, X=None, features_censoring_rate=0.2, censoring_relative_scale=1.5, random_state=0, ): - rng = np.random.RandomState(random_state) + rng = check_random_state(random_state) if censoring_relative_scale == 0 or censoring_relative_scale is None: return y - if independent: + if independent_censoring: scale_censoring = censoring_relative_scale * y["duration"].mean() shape_censoring = 3 else: - features_impact_censoring = rng.randn(X.shape[1], 2) - features_impact_censoring = np.where( - rng.rand( - features_impact_censoring.shape[0], features_impact_censoring.shape[1] - ) + w_censoring_star = rng.randn(X.shape[1], 2) + w_censoring_star = np.where( + rng.rand(w_censoring_star.shape[0], w_censoring_star.shape[1]) < features_censoring_rate, - features_impact_censoring, + w_censoring_star, 0, ) - df_censoring_params = censoring_relative_scale * X @ features_impact_censoring + df_censoring_params = censoring_relative_scale * X @ w_censoring_star df_censoring_params.columns = ["shape_0", "scale_0"] df_censoring_params = rescaling_params_to_respect_default_ranges( df_censoring_params, @@ -77,7 +75,7 @@ def compute_shape_and_scale( scale_ranges=DEFAULT_SCALE_RANGES, random_state=0, ): - rng = np.random.RandomState(random_state) + rng = check_random_state(random_state) # Adding interactions between features preprocessor = make_pipeline( SplineTransformer(n_knots=3), @@ -86,15 +84,15 @@ def compute_shape_and_scale( ), ) preprocessor.set_output(transform="pandas") - df_trans = preprocessor.fit_transform(df_features) + df_features_transformed = preprocessor.fit_transform(df_features) # Create masked matrix with the interactions n_weibull_parameters = 2 * n_events - w_star = rng.randn(df_trans.shape[1], n_weibull_parameters) + w_star = rng.randn(df_features_transformed.shape[1], n_weibull_parameters) # 1-feature_rate% of marginal features and interacted features # are set to 0 - cols_features = df_trans.columns + cols_features = df_features_transformed.columns marginal_cols = np.array([len(c.split(" ")) == 1 for c in cols_features]) * np.ones( - (n_weibull_parameters, df_trans.shape[1]) + (n_weibull_parameters, df_features_transformed.shape[1]) ) marginal_cols = marginal_cols.T.astype(bool) w_star_marg = np.where( @@ -109,7 +107,7 @@ def compute_shape_and_scale( ) w_star += w_star_marg # Computation of the true values of shape and scale - shape_scale_star = df_trans.values @ w_star + shape_scale_star = df_features_transformed.values @ w_star shape_scale_columns = [f"shape_{i}" 
for i in range(1, n_events + 1)] + [ f"scale_{i}" for i in range(1, n_events + 1) ] @@ -130,18 +128,16 @@ def rescaling_params_to_respect_default_ranges( # Rescaling of these values to stay in the chosen range shape_min, shape_max = shape_default scale_min, scale_max = scale_default - shape = df_shape_scale_star[f"shape_{event}"].copy() - scale = df_shape_scale_star[f"scale_{event}"].copy() scaler = StandardScaler() df_shape_scale_star[f"shape_{event}"] = ( shape_min + (shape_max - shape_min) - * expit(scaler.fit_transform(shape.values.reshape(-1, 1))) + * expit(scaler.fit_transform(df_shape_scale_star[f"shape_{event}"])) ).flatten() df_shape_scale_star[f"scale_{event}"] = ( scale_min + (scale_max - scale_min) - * expit(scaler.fit_transform(scale.values.reshape(-1, 1))).flatten() + * expit(scaler.fit_transform(df_shape_scale_star[f"scale_{event}"])).flatten() ) return df_shape_scale_star @@ -202,8 +198,11 @@ def make_complex_features_with_sparse_matrix( ): rng = np.random.RandomState(random_state) # Create features given to the model as X and then creating the interactions - df_features = pd.DataFrame(rng.rand(n_samples, n_features)) - df_features.columns = [f"feature_{i}" for i in range(n_features)] + columns = [f"feature_{i}" for i in range(n_features)] + df_features = pd.DataFrame( + rng.randn(n_samples, n_features), + columns=columns, + ) df_shape_scale_star = compute_shape_and_scale( df_features, @@ -235,12 +234,12 @@ def make_synthetic_competing_weibull( n_features=10, features_rate=0.3, degree_interaction=2, - independent=True, + independent_censoring=True, features_censoring_rate=0.2, return_uncensored_data=False, return_X_y=True, feature_rounding=2, - target_rounding=2, + target_rounding=4, shape_ranges=DEFAULT_SHAPE_RANGES, scale_ranges=DEFAULT_SCALE_RANGES, censoring_relative_scale=1.5, @@ -262,19 +261,24 @@ def make_synthetic_competing_weibull( """ if complex_features: X, event_durations, duration_argmin = make_complex_features_with_sparse_matrix( - n_events, - n_samples, - base_scale, - shape_ranges, - scale_ranges, - n_features, - features_rate, - degree_interaction, - random_state, + n_events=n_events, + n_samples=n_samples, + base_scale=base_scale, + shape_ranges=shape_ranges, + scale_ranges=scale_ranges, + n_features=n_features, + features_rate=features_rate, + degree_interaction=degree_interaction, + random_state=random_state, ) else: X, event_durations, duration_argmin = make_simple_features( - n_events, n_samples, base_scale, shape_ranges, scale_ranges, random_state + n_events=n_events, + n_samples=n_samples, + base_scale=base_scale, + shape_ranges=shape_ranges, + scale_ranges=scale_ranges, + random_state=random_state, ) y = pd.DataFrame( dict( @@ -285,7 +289,7 @@ def make_synthetic_competing_weibull( y_censored = _censor( y, censoring_relative_scale=censoring_relative_scale, - independent=independent, + independent_censoring=independent_censoring, X=X, features_censoring_rate=features_censoring_rate, random_state=random_state, @@ -303,4 +307,4 @@ def make_synthetic_competing_weibull( return X, y_censored frame = pd.concat([X, y], axis=1) - return Bunch(data=frame[X.columns], target=frame[y.columns], frame=frame) + return Bunch(data=frame[X.columns], target=frame[y_censored.columns], frame=frame) From c5f319334385fac1f9d1e9f0a474114d4598297f Mon Sep 17 00:00:00 2001 From: Julie Alberge Date: Fri, 15 Dec 2023 17:09:11 +0100 Subject: [PATCH 12/13] refacto code with vincent, tuning parameters. 
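The core of the refactor below is a single helper that maps an unbounded column of Weibull-parameter scores into a chosen interval: standardize, squash with the logistic sigmoid, then rescale affinely onto [param_min, param_max]. A standalone sketch of that transformation (names are illustrative, mirroring _rescale_params in the diff):

import numpy as np
from scipy.special import expit
from sklearn.preprocessing import StandardScaler

def rescale_column(column, param_min, param_max):
    # column has shape (n_samples, 1); the output lies strictly inside (param_min, param_max)
    squashed = expit(StandardScaler().fit_transform(column))  # values in (0, 1)
    return (param_min + (param_max - param_min) * squashed).ravel()

raw_scores = np.random.default_rng(0).normal(size=(1_000, 1))
shape_1 = rescale_column(raw_scores, 1.0, 3.5)  # e.g. the new shape range of event 1
print(shape_1.min(), shape_1.max())             # stays inside (1.0, 3.5)

Because the sigmoid is monotonic, the ordering induced by the linear scores is preserved while degenerate Weibull shapes and scales are ruled out.
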
--- hazardous/data/_competing_weibull.py | 84 ++++++++++--------- .../data/tests/test_competing_weibull.py | 2 +- 2 files changed, 47 insertions(+), 39 deletions(-) diff --git a/hazardous/data/_competing_weibull.py b/hazardous/data/_competing_weibull.py index 1bb67fa..b20401c 100644 --- a/hazardous/data/_competing_weibull.py +++ b/hazardous/data/_competing_weibull.py @@ -10,14 +10,14 @@ from sklearn.utils import check_random_state DEFAULT_SHAPE_RANGES = ( - (0.8, 1.0), + (1.0, 3.5), (1.0, 1.0), - (1.2, 3.0), + (3.0, 6.0), ) DEFAULT_SCALE_RANGES = ( - (1, 15), (1, 7), + (1, 15), (1.5, 5), ) @@ -47,12 +47,18 @@ def _censor( ) df_censoring_params = censoring_relative_scale * X @ w_censoring_star df_censoring_params.columns = ["shape_0", "scale_0"] - df_censoring_params = rescaling_params_to_respect_default_ranges( - df_censoring_params, - shape_default=(3.0, 4.0), - scale_default=(1, 10 * censoring_relative_scale), - event=0, + + df_censoring_params["shape_0"] = _rescale_params( + df_censoring_params[["shape_0"]], + param_min=2.0, + param_max=3.0, + ) + df_censoring_params["scale_0"] = _rescale_params( + df_censoring_params[["scale_0"]], + param_min=1, + param_max=censoring_relative_scale, ) + scale_censoring = df_censoring_params["scale_0"].values * y["duration"].mean() shape_censoring = df_censoring_params["shape_0"].values censoring = weibull_min.rvs( @@ -67,7 +73,7 @@ def _censor( def compute_shape_and_scale( - df_features, + X, features_rate=0.2, n_events=3, degree_interaction=2, @@ -84,62 +90,64 @@ def compute_shape_and_scale( ), ) preprocessor.set_output(transform="pandas") - df_features_transformed = preprocessor.fit_transform(df_features) + X_trans = preprocessor.fit_transform(X) # Create masked matrix with the interactions n_weibull_parameters = 2 * n_events - w_star = rng.randn(df_features_transformed.shape[1], n_weibull_parameters) + w_star = rng.randn(X_trans.shape[1], n_weibull_parameters) # 1-feature_rate% of marginal features and interacted features # are set to 0 - cols_features = df_features_transformed.columns - marginal_cols = np.array([len(c.split(" ")) == 1 for c in cols_features]) * np.ones( - (n_weibull_parameters, df_features_transformed.shape[1]) - ) - marginal_cols = marginal_cols.T.astype(bool) - w_star_marg = np.where( - (rng.rand(w_star.shape[0], w_star.shape[1]) < features_rate) & marginal_cols, + cols_features = X_trans.columns + marginal_mask = np.array([len(col.split(" ")) == 1 for col in cols_features]) + marginal_cols = np.repeat( + marginal_mask.reshape(-1, 1), repeats=n_weibull_parameters, axis=1 + ) # (X_trans.shape[1], n_weibull_parameters) + + drop_out_mask = rng.rand(w_star.shape[0], w_star.shape[1]) < features_rate + w_star_marginal = np.where( + drop_out_mask & marginal_cols, w_star, 0, ) w_star = np.where( - (rng.rand(w_star.shape[0], w_star.shape[1]) < features_rate) & ~marginal_cols, + drop_out_mask & ~marginal_cols, w_star, 0, ) - w_star += w_star_marg + w_star += w_star_marginal + # Computation of the true values of shape and scale - shape_scale_star = df_features_transformed.values @ w_star + shape_scale_star = X_trans.values @ w_star shape_scale_columns = [f"shape_{i}" for i in range(1, n_events + 1)] + [ f"scale_{i}" for i in range(1, n_events + 1) ] df_shape_scale_star = pd.DataFrame(shape_scale_star, columns=shape_scale_columns) # Rescaling of these values to stay in the chosen range + + return rescale_params(df_shape_scale_star, n_events, shape_ranges, scale_ranges) + + +def rescale_params(df_shape_scale_star, n_events, shape_ranges, 
scale_ranges): for event_id, shape_range, scale_range in zip( range(1, n_events + 1), cycle(shape_ranges), cycle(scale_ranges) ): - df_shape_scale_star = rescaling_params_to_respect_default_ranges( - df_shape_scale_star, shape_range, scale_range, event_id + shape_min, shape_max = shape_range + scale_min, scale_max = scale_range + df_shape_scale_star[f"shape_{event_id}"] = _rescale_params( + df_shape_scale_star[[f"shape_{event_id}"]], shape_min, shape_max + ) + df_shape_scale_star[f"scale_{event_id}"] = _rescale_params( + df_shape_scale_star[[f"scale_{event_id}"]], scale_min, scale_max ) return df_shape_scale_star -def rescaling_params_to_respect_default_ranges( - df_shape_scale_star, shape_default, scale_default, event -): +def _rescale_params(column_param, param_min, param_max): # Rescaling of these values to stay in the chosen range - shape_min, shape_max = shape_default - scale_min, scale_max = scale_default scaler = StandardScaler() - df_shape_scale_star[f"shape_{event}"] = ( - shape_min - + (shape_max - shape_min) - * expit(scaler.fit_transform(df_shape_scale_star[f"shape_{event}"])) + column_param = ( + param_min + (param_max - param_min) * expit(scaler.fit_transform(column_param)) ).flatten() - df_shape_scale_star[f"scale_{event}"] = ( - scale_min - + (scale_max - scale_min) - * expit(scaler.fit_transform(df_shape_scale_star[f"scale_{event}"])).flatten() - ) - return df_shape_scale_star + return column_param def make_simple_features( diff --git a/hazardous/data/tests/test_competing_weibull.py b/hazardous/data/tests/test_competing_weibull.py index 3799f66..eb2e638 100644 --- a/hazardous/data/tests/test_competing_weibull.py +++ b/hazardous/data/tests/test_competing_weibull.py @@ -54,7 +54,7 @@ def test_competing_weibull_no_censoring(seed): ).mean() assert 0.45 > baseline_classification_acc > 0.3 # approximately balanced assert event_classification_acc > 1.1 * baseline_classification_acc - assert event_classification_acc < 0.6 # still challenging. + assert event_classification_acc < 0.63 # still challenging. @pytest.mark.parametrize("seed", range(3)) From 2f1ba2b3180a6153684fec344cc280330b82e1c2 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Mon, 18 Dec 2023 18:01:04 +0100 Subject: [PATCH 13/13] =?UTF-8?q?small=20rework=20of=20the=20example=20?= =?UTF-8?q?=F0=9F=92=AB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/plot_complex_data.py | 114 +++++++++++++++++---------- hazardous/data/_competing_weibull.py | 98 ++++++++++++----------- pyproject.toml | 5 +- 3 files changed, 131 insertions(+), 86 deletions(-) diff --git a/examples/plot_complex_data.py b/examples/plot_complex_data.py index d0d4857..f115dec 100644 --- a/examples/plot_complex_data.py +++ b/examples/plot_complex_data.py @@ -13,17 +13,7 @@ """ # %% -import numpy as np -import hazardous.data._competing_weibull as competing_w -from time import perf_counter -import matplotlib.pyplot as plt -from sklearn.model_selection import train_test_split from sklearn.utils import check_random_state -import pandas as pd -import seaborn as sns - -from hazardous import GradientBoostingIncidence -from lifelines import AalenJohansenFitter seed = 0 rng = check_random_state(seed) @@ -31,24 +21,33 @@ # %% # In the following cell, we verify that the synthetic dataset is well defined. 
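Before the cell below, it helps to know how wide the design matrix becomes inside compute_shape_and_scale: each feature is expanded into spline basis functions and then crossed pairwise, and the interaction columns are exactly the ones whose names contain a space. A quick sketch of just that preprocessing step (the pipeline mirrors the library code; the printed width is whatever the transformers produce for 10 features):

import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, SplineTransformer

X_demo = pd.DataFrame(
    np.random.default_rng(0).normal(size=(100, 10)),
    columns=[f"feature_{i}" for i in range(10)],
)
preprocessor = make_pipeline(
    SplineTransformer(n_knots=3),
    PolynomialFeatures(degree=2, interaction_only=True, include_bias=False),
)
preprocessor.set_output(transform="pandas")
X_trans = preprocessor.fit_transform(X_demo)
n_interactions = sum(" " in col for col in X_trans.columns)
print(X_trans.shape, f"of which {n_interactions} are interaction columns")
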
+import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns +from hazardous.data._competing_weibull import compute_shape_and_scale + -df_features = pd.DataFrame(rng.randn(100_000, 10)) -df_features.columns = [f"feature_{i}" for i in range(10)] +X = rng.randn(10_000, 10) +column_names = [f"feature_{i}" for i in range(X.shape[1])] +X = pd.DataFrame(X, columns=column_names) -df_shape_scale_star = competing_w.compute_shape_and_scale( - df_features, +SS_star = compute_shape_and_scale( + X, features_rate=0.2, n_events=3, degree_interaction=2, random_state=0, ) + fig, axes = plt.subplots(2, 3, figsize=(10, 7)) -for idx, col in enumerate(df_shape_scale_star.columns): - sns.histplot(df_shape_scale_star[col], ax=axes[idx // 3][idx % 3]) +for idx, col in enumerate(SS_star.columns): + sns.histplot(SS_star[col], ax=axes[idx // 3][idx % 3]) # %% +from hazardous.data._competing_weibull import make_synthetic_competing_weibull -X, y_censored, y_uncensored = competing_w.make_synthetic_competing_weibull( + +X, y_censored, y_uncensored = make_synthetic_competing_weibull( n_samples=3_000, base_scale=1_000, n_features=10, @@ -57,20 +56,35 @@ independent_censoring=False, features_censoring_rate=0.2, return_uncensored_data=True, - return_X_y=True, feature_rounding=3, target_rounding=4, censoring_relative_scale=4.0, - random_state=0, complex_features=True, + return_X_y=True, + random_state=seed, +) + +print(y_censored["event"].value_counts().sort_index()) + +sns.histplot( + y_censored, + x="duration", + hue="event", + multiple="stack", + palette="magma", ) # %% -n_samples = len(X) -calculate_variance = n_samples <= 5_000 +from lifelines import AalenJohansenFitter + + +calculate_variance = X.shape[0] <= 5_000 aj = AalenJohansenFitter(calculate_variance=calculate_variance, seed=0) +aj + +# %% +from hazardous._gradient_boosting_incidence import GradientBoostingIncidence -y_censored["event"].value_counts() gb_incidence = GradientBoostingIncidence( learning_rate=0.1, @@ -82,7 +96,7 @@ show_progressbar=False, random_state=seed, ) - +gb_incidence # %% # @@ -92,13 +106,23 @@ # Let's now estimate the CIFs on uncensored data and plot them against the # theoretical CIFs: +import numpy as np +from time import perf_counter + def plot_cumulative_incidence_functions( - X, y, gb_incidence=None, aj=None, X_test=None, y_test=None + X, + y, + gb_incidence=None, + aj=None, + X_test=None, + y_test=None, + verbose=False, ): n_events = y["event"].max() t_max = y["duration"].max() _, axes = plt.subplots(figsize=(12, 4), ncols=n_events, sharey=True) + # Compute the estimate of the CIFs on a coarse grid. 
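    # (The 100-point grid defined just below spans [0, t_max] and is only used to
    # evaluate the gradient-boosting CIF predictions; the Aalen-Johansen curves are
    # drawn by aj.plot on the estimator's own time points.)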
coarse_timegrid = np.linspace(0, t_max, num=100) censoring_fraction = (y["event"] == 0).mean() @@ -113,17 +137,28 @@ def plot_cumulative_incidence_functions( gb_incidence.set_params(event_of_interest=event_id) gb_incidence.fit(X, y) duration = perf_counter() - tic - print(f"GB Incidence for event {event_id} fit in {duration:.3f} s") + + if verbose: + print(f"GB Incidence for event {event_id} fit in {duration:.3f} s") + tic = perf_counter() cifs_pred = gb_incidence.predict_cumulative_incidence(X, coarse_timegrid) cif_mean = cifs_pred.mean(axis=0) duration = perf_counter() - tic - print(f"GB Incidence for event {event_id} prediction in {duration:.3f} s") - print("Brier score on training data:", -gb_incidence.score(X, y)) - if X_test is not None: + + if verbose: print( - "Brier score on testing data:", -gb_incidence.score(X_test, y_test) + f"GB Incidence for event {event_id} prediction in {duration:.3f} s" ) + + if verbose: + brier_score_train = -gb_incidence.score(X, y) + print(f"Brier score on training data: {brier_score_train:.3f}") + if X_test is not None: + brier_score_test = -gb_incidence.score(X_test, y_test) + print( + f"Brier score on testing data: {brier_score_test:.3f}", + ) ax.plot( coarse_timegrid, cif_mean, @@ -135,17 +170,25 @@ def plot_cumulative_incidence_functions( tic = perf_counter() aj.fit(y["duration"], y["event"], event_of_interest=event_id) duration = perf_counter() - tic - print(f"Aalen-Johansen for event {event_id} fit in {duration:.3f} s") + if verbose: + print(f"Aalen-Johansen for event {event_id} fit in {duration:.3f} s") + aj.plot(label="Aalen-Johansen", ax=ax) ax.set_xlim(0, 8_000) ax.set_ylim(0, 0.5) + if event_id == 1: ax.legend(loc="lower right") else: ax.legend().remove() + if verbose: + print("=" * 16, "\n") + # %% +from sklearn.model_selection import train_test_split + X_train, X_test, y_train_c, y_test_c = train_test_split( X, y_censored, test_size=0.3, random_state=seed @@ -153,17 +196,6 @@ def plot_cumulative_incidence_functions( y_train_u = y_uncensored.loc[y_train_c.index] y_test_u = y_uncensored.loc[y_test_c.index] -gb_incidence = GradientBoostingIncidence( - learning_rate=0.1, - n_iter=20, - max_leaf_nodes=15, - hard_zero_fraction=0.1, - min_samples_leaf=5, - loss="ibs", - show_progressbar=False, - random_state=seed, -) - plot_cumulative_incidence_functions( X_train, y_train_u, diff --git a/hazardous/data/_competing_weibull.py b/hazardous/data/_competing_weibull.py index b20401c..b6d3975 100644 --- a/hazardous/data/_competing_weibull.py +++ b/hazardous/data/_competing_weibull.py @@ -23,7 +23,7 @@ def _censor( - y, + y_uncensored, independent_censoring=True, X=None, features_censoring_rate=0.2, @@ -32,20 +32,22 @@ def _censor( ): rng = check_random_state(random_state) if censoring_relative_scale == 0 or censoring_relative_scale is None: - return y + return y_uncensored + + mean_duration = y_uncensored["duration"].mean() if independent_censoring: - scale_censoring = censoring_relative_scale * y["duration"].mean() + scale_censoring = censoring_relative_scale * mean_duration shape_censoring = 3 + else: - w_censoring_star = rng.randn(X.shape[1], 2) - w_censoring_star = np.where( - rng.rand(w_censoring_star.shape[0], w_censoring_star.shape[1]) - < features_censoring_rate, - w_censoring_star, + W_censoring_star = rng.randn(X.shape[1], 2) + W_censoring_star = np.where( + rng.rand(*W_censoring_star.shape) < features_censoring_rate, + W_censoring_star, 0, ) - df_censoring_params = censoring_relative_scale * X @ w_censoring_star + df_censoring_params = 
censoring_relative_scale * X @ W_censoring_star df_censoring_params.columns = ["shape_0", "scale_0"] df_censoring_params["shape_0"] = _rescale_params( @@ -59,12 +61,16 @@ def _censor( param_max=censoring_relative_scale, ) - scale_censoring = df_censoring_params["scale_0"].values * y["duration"].mean() + scale_censoring = df_censoring_params["scale_0"].values * mean_duration shape_censoring = df_censoring_params["shape_0"].values + censoring = weibull_min.rvs( - shape_censoring, scale=scale_censoring, size=y.shape[0], random_state=rng + shape_censoring, + scale=scale_censoring, + size=y_uncensored.shape[0], + random_state=rng, ) - y_censored = y.copy() + y_censored = y_uncensored.copy() y_censored["event"] = np.where( y_censored["duration"] < censoring, y_censored["event"], 0 ) @@ -82,6 +88,7 @@ def compute_shape_and_scale( random_state=0, ): rng = check_random_state(random_state) + # Adding interactions between features preprocessor = make_pipeline( SplineTransformer(n_knots=3), @@ -91,58 +98,61 @@ def compute_shape_and_scale( ) preprocessor.set_output(transform="pandas") X_trans = preprocessor.fit_transform(X) + # Create masked matrix with the interactions n_weibull_parameters = 2 * n_events - w_star = rng.randn(X_trans.shape[1], n_weibull_parameters) + W_star = rng.randn(X_trans.shape[1], n_weibull_parameters) + # 1-feature_rate% of marginal features and interacted features # are set to 0 - cols_features = X_trans.columns - marginal_mask = np.array([len(col.split(" ")) == 1 for col in cols_features]) + # Feature names without " " are marginal, otherwise they are interactions. + marginal_mask = np.array([len(col.split(" ")) == 1 for col in X_trans.columns]) marginal_cols = np.repeat( marginal_mask.reshape(-1, 1), repeats=n_weibull_parameters, axis=1 - ) # (X_trans.shape[1], n_weibull_parameters) + ) # shape: (X_trans.shape[1], n_weibull_parameters) - drop_out_mask = rng.rand(w_star.shape[0], w_star.shape[1]) < features_rate - w_star_marginal = np.where( + drop_out_mask = rng.rand(W_star.shape[0], W_star.shape[1]) < features_rate + W_star_marginal = np.where( drop_out_mask & marginal_cols, - w_star, + W_star, 0, ) - w_star = np.where( + W_star = np.where( drop_out_mask & ~marginal_cols, - w_star, + W_star, 0, ) - w_star += w_star_marginal + W_star += W_star_marginal # Computation of the true values of shape and scale - shape_scale_star = X_trans.values @ w_star + shape_scale_star = X_trans.values @ W_star shape_scale_columns = [f"shape_{i}" for i in range(1, n_events + 1)] + [ f"scale_{i}" for i in range(1, n_events + 1) ] - df_shape_scale_star = pd.DataFrame(shape_scale_star, columns=shape_scale_columns) - # Rescaling of these values to stay in the chosen range + SS_star = pd.DataFrame(shape_scale_star, columns=shape_scale_columns) - return rescale_params(df_shape_scale_star, n_events, shape_ranges, scale_ranges) + # Rescaling of these values to stay in the chosen range + return rescale_params(SS_star, n_events, shape_ranges, scale_ranges) -def rescale_params(df_shape_scale_star, n_events, shape_ranges, scale_ranges): +def rescale_params(SS_star, n_events, shape_ranges, scale_ranges): for event_id, shape_range, scale_range in zip( range(1, n_events + 1), cycle(shape_ranges), cycle(scale_ranges) ): shape_min, shape_max = shape_range scale_min, scale_max = scale_range - df_shape_scale_star[f"shape_{event_id}"] = _rescale_params( - df_shape_scale_star[[f"shape_{event_id}"]], shape_min, shape_max + SS_star[f"shape_{event_id}"] = _rescale_params( + SS_star[[f"shape_{event_id}"]], shape_min, 
shape_max ) - df_shape_scale_star[f"scale_{event_id}"] = _rescale_params( - df_shape_scale_star[[f"scale_{event_id}"]], scale_min, scale_max + SS_star[f"scale_{event_id}"] = _rescale_params( + SS_star[[f"scale_{event_id}"]], scale_min, scale_max ) - return df_shape_scale_star + return SS_star def _rescale_params(column_param, param_min, param_max): # Rescaling of these values to stay in the chosen range + # Expit is the logistic sigmoid function. scaler = StandardScaler() column_param = ( param_min + (param_max - param_min) * expit(scaler.fit_transform(column_param)) @@ -207,13 +217,13 @@ def make_complex_features_with_sparse_matrix( rng = np.random.RandomState(random_state) # Create features given to the model as X and then creating the interactions columns = [f"feature_{i}" for i in range(n_features)] - df_features = pd.DataFrame( + X = pd.DataFrame( rng.randn(n_samples, n_features), columns=columns, ) - df_shape_scale_star = compute_shape_and_scale( - df_features, + SS_star = compute_shape_and_scale( + X, features_rate, n_events, degree_interaction, @@ -224,15 +234,15 @@ def make_complex_features_with_sparse_matrix( # Throw durations from a weibull distribution with scale and shape as the parameters event_durations = [] for event in range(1, n_events + 1): - shape = df_shape_scale_star[f"shape_{event}"] - scale = df_shape_scale_star[f"scale_{event}"] + shape = SS_star[f"shape_{event}"] + scale = SS_star[f"scale_{event}"] durations = weibull_min.rvs(shape, scale=scale * base_scale, random_state=rng) event_durations.append(durations) # Creating the target tabular event_durations = np.asarray(event_durations) duration_argmin = np.argmin(event_durations, axis=0) - return df_features, event_durations, duration_argmin + return X, event_durations, duration_argmin def make_synthetic_competing_weibull( @@ -288,18 +298,18 @@ def make_synthetic_competing_weibull( scale_ranges=scale_ranges, random_state=random_state, ) - y = pd.DataFrame( + y_uncensored = pd.DataFrame( dict( event=duration_argmin + 1, duration=event_durations[duration_argmin, np.arange(n_samples)], ) ) y_censored = _censor( - y, - censoring_relative_scale=censoring_relative_scale, + y_uncensored, independent_censoring=independent_censoring, X=X, features_censoring_rate=features_censoring_rate, + censoring_relative_scale=censoring_relative_scale, random_state=random_state, ) if feature_rounding is not None: @@ -307,12 +317,12 @@ def make_synthetic_competing_weibull( if target_rounding is not None: y_censored["duration"] = y_censored["duration"].round(target_rounding) - y = y.round(target_rounding) + y_uncensored = y_uncensored.round(target_rounding) if return_X_y: if return_uncensored_data: - return X, y_censored, y + return X, y_censored, y_uncensored return X, y_censored - frame = pd.concat([X, y], axis=1) + frame = pd.concat([X, y_censored], axis=1) return Bunch(data=frame[X.columns], target=frame[y_censored.columns], frame=frame) diff --git a/pyproject.toml b/pyproject.toml index 862b146..b5140b8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -107,7 +107,6 @@ exclude = [ "venv", "doc", ] -per-file-ignores = {} # Same as Black. line-length = 88 @@ -119,3 +118,7 @@ target-version = "py39" [tool.ruff.mccabe] max-complexity = 10 + +[tool.ruff.per-file-ignores] +# It's fine not to put the import at the top of the file in the examples +"examples/*" = ["E402"]
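
To close the series: after these patches the generator supports both covariate-independent and covariate-driven censoring via independent_censoring, on top of the complex (spline + interaction) feature model. A short sketch exercising both modes with the final keyword names (the printed rates will vary):

from hazardous.data._competing_weibull import make_synthetic_competing_weibull

for independent_censoring in (True, False):
    X, y_censored, y_uncensored = make_synthetic_competing_weibull(
        n_samples=3_000,
        censoring_relative_scale=1.5,
        independent_censoring=independent_censoring,
        features_censoring_rate=0.2,
        complex_features=True,
        return_uncensored_data=True,
        return_X_y=True,
        random_state=0,
    )
    censoring_rate = (y_censored["event"] == 0).mean()
    print(f"independent_censoring={independent_censoring}: "
          f"{censoring_rate:.1%} of rows censored")

Driving the censoring shape and scale from X, rather than from a fixed shape and a scale tied only to the mean duration, is what the later patches add.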