Synthetic dataset for ICML (#26)
* Creating a new synthetic dataset. For now, the censoring mechanism does not depend on the covariates.

* adding censoring that depends on the covariates

* Synthetic dataset with expit

* refactoring the code, adding histograms.

* events correctly numbered

* changes done after Olivier's review

* fix some tests.

* adding seaborn to push the doc.

* fixing the AJ estimator,
using 50% marginal features and 50% feature interactions in the creation of w_star

* fixing tests (some tests broke because the shape and scale default parameters changed)

* adjusting code following Vincent's suggestions

* refactoring code with Vincent, tuning parameters.

* small rework of the example 💫

---------

Co-authored-by: Vincent M <[email protected]>
juAlberge and Vincent-Maladiere authored Dec 18, 2023
1 parent bac9af1 commit a80546a
Showing 5 changed files with 486 additions and 70 deletions.
217 changes: 217 additions & 0 deletions examples/plot_complex_data.py
@@ -0,0 +1,217 @@
"""
==========================================
Modeling competing risks on synthetic data
==========================================
This example introduces a synthetic data generation tool that can
be helpful to study the relative performance and potential biases
of predictive competing risks estimators on right-censored data.
Some of the input features and their second order multiplicative
interactions are statistically associated with the parameters of
the distributions from which the competing events are sampled.
"""

# %%
from sklearn.utils import check_random_state

# check_random_state turns the seed into a shared numpy RandomState
# instance, reused across the cells below for reproducibility.
seed = 0
rng = check_random_state(seed)


# %%
# In the following cell, we check that the synthetic data generator is well
# behaved by inspecting the distributions of the Weibull shape and scale
# parameters it derives from the input features.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from hazardous.data._competing_weibull import compute_shape_and_scale


X = rng.randn(10_000, 10)
column_names = [f"feature_{i}" for i in range(X.shape[1])]
X = pd.DataFrame(X, columns=column_names)

SS_star = compute_shape_and_scale(
    X,
    features_rate=0.2,
    n_events=3,
    degree_interaction=2,
    random_state=0,
)
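
# SS_star is expected to hold one shape and one scale column per event
# (n_events=3 above, hence the six histograms in the 2x3 grid below).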

fig, axes = plt.subplots(2, 3, figsize=(10, 7))
for idx, col in enumerate(SS_star.columns):
    sns.histplot(SS_star[col], ax=axes[idx // 3][idx % 3])
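
# Purely cosmetic: avoid overlapping labels in the 2x3 grid of histograms.
fig.tight_layout()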

# %%
from hazardous.data._competing_weibull import make_synthetic_competing_weibull


X, y_censored, y_uncensored = make_synthetic_competing_weibull(
    n_samples=3_000,
    base_scale=1_000,
    n_features=10,
    features_rate=0.5,
    degree_interaction=2,
    independent_censoring=False,
    features_censoring_rate=0.2,
    return_uncensored_data=True,
    feature_rounding=3,
    target_rounding=4,
    censoring_relative_scale=4.0,
    complex_features=True,
    return_X_y=True,
    random_state=seed,
)
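
# y_censored holds the observed (right-censored) targets, while y_uncensored
# holds the same targets before censoring was applied; having both lets us
# compare estimators against a censoring-free ground truth.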

print(y_censored["event"].value_counts().sort_index())

sns.histplot(
    y_censored,
    x="duration",
    hue="event",
    multiple="stack",
    palette="magma",
)
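
# %%
# As a quick sanity check (the event value 0 encodes censoring, as in the
# plotting helper below), we can print the empirical censoring fraction:
censoring_fraction = (y_censored["event"] == 0).mean()
print(f"Censoring fraction: {censoring_fraction:.1%}")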

# %%
from lifelines import AalenJohansenFitter


# Computing the variance of the Aalen-Johansen estimator is expensive on
# large datasets, so we only enable it for moderately sized inputs.
calculate_variance = X.shape[0] <= 5_000
aj = AalenJohansenFitter(calculate_variance=calculate_variance, seed=0)
aj

# %%
from hazardous._gradient_boosting_incidence import GradientBoostingIncidence


# Estimate the cause-specific cumulative incidence of each event with a
# gradient-boosted model trained on the IBS (integrated Brier score) loss.
gb_incidence = GradientBoostingIncidence(
    learning_rate=0.1,
    n_iter=20,
    max_leaf_nodes=15,
    hard_zero_fraction=0.1,
    min_samples_leaf=5,
    loss="ibs",
    show_progressbar=False,
    random_state=seed,
)
gb_incidence

# %%
#
# CIFs estimated on uncensored data
# ---------------------------------
#
# Let's now estimate the CIFs on uncensored data and compare them to the
# Aalen-Johansen estimates computed on the same data:

import numpy as np
from time import perf_counter


def plot_cumulative_incidence_functions(
    X,
    y,
    gb_incidence=None,
    aj=None,
    X_test=None,
    y_test=None,
    verbose=False,
):
    """Fit the given estimators on (X, y) and plot one CIF per event type."""
    n_events = y["event"].max()
    t_max = y["duration"].max()
    _, axes = plt.subplots(figsize=(12, 4), ncols=n_events, sharey=True)

    # Compute the estimate of the CIFs on a coarse grid.
    coarse_timegrid = np.linspace(0, t_max, num=100)
    censoring_fraction = (y["event"] == 0).mean()
    plt.suptitle(
        "Cause-specific cumulative incidence functions"
        f" ({censoring_fraction:.1%} censoring)"
    )

    for event_id, ax in enumerate(axes, 1):
        if gb_incidence is not None:
            tic = perf_counter()
            gb_incidence.set_params(event_of_interest=event_id)
            gb_incidence.fit(X, y)
            duration = perf_counter() - tic

            if verbose:
                print(f"GB Incidence for event {event_id} fit in {duration:.3f} s")

            tic = perf_counter()
            cifs_pred = gb_incidence.predict_cumulative_incidence(X, coarse_timegrid)
            cif_mean = cifs_pred.mean(axis=0)
            duration = perf_counter() - tic

            if verbose:
                print(
                    f"GB Incidence for event {event_id} prediction in {duration:.3f} s"
                )

            if verbose:
                brier_score_train = -gb_incidence.score(X, y)
                print(f"Brier score on training data: {brier_score_train:.3f}")
                if X_test is not None:
                    brier_score_test = -gb_incidence.score(X_test, y_test)
                    print(
                        f"Brier score on testing data: {brier_score_test:.3f}",
                    )
            ax.plot(
                coarse_timegrid,
                cif_mean,
                label="GradientBoostingIncidence",
            )
            ax.set(title=f"Event {event_id}")

        if aj is not None:
            tic = perf_counter()
            aj.fit(y["duration"], y["event"], event_of_interest=event_id)
            duration = perf_counter() - tic
            if verbose:
                print(f"Aalen-Johansen for event {event_id} fit in {duration:.3f} s")

            aj.plot(label="Aalen-Johansen", ax=ax)
            ax.set_xlim(0, 8_000)
            ax.set_ylim(0, 0.5)

        if event_id == 1:
            ax.legend(loc="lower right")
        else:
            ax.legend().remove()

    if verbose:
        print("=" * 16, "\n")

# %%
from sklearn.model_selection import train_test_split


X_train, X_test, y_train_c, y_test_c = train_test_split(
    X, y_censored, test_size=0.3, random_state=seed
)
y_train_u = y_uncensored.loc[y_train_c.index]
y_test_u = y_uncensored.loc[y_test_c.index]

plot_cumulative_incidence_functions(
    X_train,
    y_train_u,
    gb_incidence=gb_incidence,
    aj=aj,
    X_test=X_test,
    y_test=y_test_u,
)
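
# %%
#
# CIFs estimated on censored data
# -------------------------------
#
# We now fit the same estimators on the right-censored targets: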

plot_cumulative_incidence_functions(
    X_train,
    y_train_c,
    gb_incidence=gb_incidence,
    aj=aj,
    X_test=X_test,
    y_test=y_test_c,
)
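
# %%
# Note that the censoring above was generated with a covariate-dependent
# mechanism (independent_censoring=False), a setting in which the marginal
# Aalen-Johansen estimator can be biased; contrasting the censored fits with
# the uncensored ones gives a sense of that effect.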

# %%
