
Commit b038090

Synthetic dataset with expit
juAlberge committed Dec 11, 2023
1 parent ab5d372 commit b038090
Showing 2 changed files with 77 additions and 63 deletions.
85 changes: 43 additions & 42 deletions examples/complex_synthetic_data.py
@@ -14,9 +14,11 @@
 from sklearn.preprocessing import PolynomialFeatures
 import pandas as pd
 from scipy.stats import weibull_min
+from scipy.special import expit

-rng = np.random.default_rng(0)
+seed = 0
+rng = np.random.RandomState(seed)

 DEFAULT_SHAPE_RANGES = (
     (0.7, 0.9),
     (1.0, 1.0),
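
Note: this hunk swaps np.random.default_rng for the legacy np.random.RandomState. The switch matters because a Generator returned by default_rng has no randn/rand methods, while RandomState does, and the updated code below calls rng.randn(...) and rng.rand(...). A minimal standalone sketch of the API difference (plain NumPy, nothing specific to this repo):

    import numpy as np

    rng = np.random.RandomState(0)
    normal = rng.randn(2, 3)  # legacy API: standard normal draws
    uniform = rng.rand(2, 3)  # legacy API: uniform draws on [0, 1)

    gen = np.random.default_rng(0)
    # gen.randn(2, 3) would raise AttributeError; the Generator equivalents are:
    normal = gen.standard_normal((2, 3))
    uniform = gen.random((2, 3))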
@@ -53,81 +55,80 @@ def compute_shape_and_scale(
     df_trans = df_poly_features.fit_transform(df_features)

     # Create masked matrix with the interactions
-    w_star = np.random.randn(df_trans.shape[1], n_weibull_parameters)
+    w_star = rng.randn(df_trans.shape[1], n_weibull_parameters)
     # Set 1-feature_rate% of the w_star to 0
     w_star = np.where(
-        np.random.rand(w_star.shape[0], w_star.shape[1]) < features_rate, w_star, 0
+        rng.rand(w_star.shape[0], w_star.shape[1]) < features_rate, w_star, 0
     )
     # Computation of the true values of shape and scale
     shape_scale_star = df_trans.values @ w_star
     shape_scale_columns = [f"shape_{i}" for i in range(n_events)] + [
         f"scale_{i}" for i in range(n_events)
     ]
     df_shape_scale_star = pd.DataFrame(shape_scale_star, columns=shape_scale_columns)
     # Rescaling of these values to stay in the chosen range
-    df_shape_scale_star = rescaling_params_to_respect_default_ranges(
-        df_shape_scale_star
-    )
-    return df_shape_scale_star
-
-
-def rescaling_params_to_respect_default_ranges(df_shape_scale_star):
-    # Rescaling of these values to stay in the chosen range
     for event, (scale_default, shape_default) in enumerate(
         zip(DEFAULT_SCALE_RANGES, DEFAULT_SHAPE_RANGES)
     ):
-        shape_min, shape_max = shape_default
-        scale_min, scale_max = scale_default
-        shape = df_shape_scale_star[f"shape_{event}"].copy()
-        scale = df_shape_scale_star[f"scale_{event}"].copy()
-        shape = shape_min + (shape_max - shape_min) * (shape - shape.min()) / (
-            shape.max() - shape.min()
+        df_shape_scale_star = rescaling_params_to_respect_default_ranges(
+            df_shape_scale_star, shape_default, scale_default, event
         )
-        scale = scale_min + (scale_max - scale_min) * (scale - scale.min()) / (
-            scale.max() - scale.min()
-        )
-        df_shape_scale_star[f"shape_{event}"] = shape
-        df_shape_scale_star[f"scale_{event}"] = scale
     return df_shape_scale_star


+def rescaling_params_to_respect_default_ranges(
+    df_shape_scale_star, shape_default, scale_default, event
+):
+    # Rescaling of these values to stay in the chosen range
+    shape_min, shape_max = shape_default
+    scale_min, scale_max = scale_default
+    shape = df_shape_scale_star[f"shape_{event}"].copy()
+    scale = df_shape_scale_star[f"scale_{event}"].copy()
+    df_shape_scale_star[f"shape_{event}"] = shape_min + (shape_max - shape_min) * expit(
+        shape
+    )
+    df_shape_scale_star[f"scale_{event}"] = scale_min + (scale_max - scale_min) * expit(
+        scale
+    )
+    return df_shape_scale_star


 def censor_data(
-    y, relative_scale, independant=True, X=None, features_censoring_rate=0.2
+    y, independant=True, X=None, features_censoring_rate=0.2, relative_scale=1.5
 ):
     if relative_scale == 0 or relative_scale is None:
         return y

     if independant:
         scale_censoring = relative_scale * y["duration"].mean()
         shape_censoring = 1
     else:
-        features_impact_censoring = np.abs(np.random.randn(X.shape[1], 1))
+        features_impact_censoring = rng.randn(X.shape[1], 2)
         features_impact_censoring = np.where(
-            np.random.rand(
+            rng.rand(
                 features_impact_censoring.shape[0], features_impact_censoring.shape[1]
             )
             < features_censoring_rate,
             features_impact_censoring,
             0,
         )
-        scale_censoring = (X @ features_impact_censoring).values.flatten()
-        scale_censoring = (
-            (
-                1
-                + (scale_censoring - scale_censoring.min())
-                / (scale_censoring.max() - scale_censoring.min())
-            )
-            * relative_scale
-            * y["duration"].mean()
+        df_censoring_params = relative_scale * X @ features_impact_censoring
+        df_censoring_params.columns = ["shape_0", "scale_0"]
+        df_censoring_params = rescaling_params_to_respect_default_ranges(
+            df_censoring_params,
+            shape_default=(0.5, 0.9),
+            scale_default=(10, 15),
+            event=0,
        )

+        scale_censoring = df_censoring_params["scale_0"].values * y["duration"].mean()
+        shape_censoring = df_censoring_params["shape_0"].values
     censoring = weibull_min.rvs(
-        1, scale=scale_censoring, size=y.shape[0], random_state=rng
+        shape_censoring, scale=scale_censoring, size=y.shape[0], random_state=rng
     )
-    y = y.copy()
-    y["event"] = np.where(y["duration"] < censoring, y["event"], 0)
-    y["duration"] = np.minimum(y["duration"], censoring)
-    return y
+    y_censored = y.copy()
+    y_censored["event"] = np.where(y["duration"] < censoring, y_censored["event"], 0)
+    y_censored["duration"] = np.minimum(y_censored["duration"], censoring)
+    return y_censored


 def complex_data(
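
Note: the heart of this commit is the rescaling change above. Raw linear predictors are now pushed through the logistic sigmoid, expit(x) = 1 / (1 + exp(-x)), before being mapped affinely into each parameter's target range, instead of min-max normalization. expit squashes every value into (0, 1) independently, so the rescaled shapes and scales no longer depend on the batch minimum and maximum, and the range endpoints are approached only asymptotically rather than always being hit exactly. A standalone sketch with toy values; the (0.7, 0.9) range is DEFAULT_SHAPE_RANGES[0] from this file:

    import numpy as np
    from scipy.special import expit

    raw = np.array([-3.0, -0.5, 0.0, 1.2, 4.0])  # toy linear predictors
    shape_min, shape_max = 0.7, 0.9

    # New behavior: each value is mapped independently into (shape_min, shape_max).
    shape_expit = shape_min + (shape_max - shape_min) * expit(raw)

    # Old behavior: min-max rescaling, sensitive to the batch extremes;
    # adding one outlier to `raw` would shift every rescaled value.
    shape_minmax = shape_min + (shape_max - shape_min) * (raw - raw.min()) / (
        raw.max() - raw.min()
    )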
@@ -144,7 +145,7 @@ def complex_data(
     return_uncensored_data=False,
 ):
     # Create features given to the model as X and then creating the interactions
-    df_features = pd.DataFrame(np.random.randn(n_samples, n_features))
+    df_features = pd.DataFrame(rng.randn(n_samples, n_features))
     df_features.columns = [f"feature_{i}" for i in range(n_features)]

     df_shape_scale_star = compute_shape_and_scale(
@@ -169,7 +170,7 @@
     )
     y_censored = censor_data(
         y,
-        relative_scale,
+        relative_scale=relative_scale,
         independant=independant,
         X=df_features,
         features_censoring_rate=features_censoring_rate,
55 changes: 34 additions & 21 deletions examples/test_complex_data.py
@@ -8,8 +8,8 @@
 from hazardous import GradientBoostingIncidence
 from lifelines import AalenJohansenFitter

-rng = np.random.default_rng(0)
+seed = 0
+rng = np.random.RandomState(seed)
 DEFAULT_SHAPE_RANGES = (
     (0.7, 0.9),
     (1.0, 1.0),
@@ -25,37 +25,25 @@

 # %%

-X, y = complex_data(
+X, y_censored, y_uncensored = complex_data(
     n_events=n_events,
     n_weibull_parameters=2 * n_events,
-    n_samples=10_000,
+    n_samples=30_000,
     base_scale=1_000,
     n_features=10,
-    features_rate=0.3,
+    features_rate=0.5,
     degree_interaction=2,
     relative_scale=0.95,
     independant=False,
-    features_censoring_rate=0.1,
+    features_censoring_rate=0.2,
+    return_uncensored_data=True,
 )

-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
-
-n_samples = len(X_train)
+n_samples = len(X)
 calculate_variance = n_samples <= 5_000
 aj = AalenJohansenFitter(calculate_variance=calculate_variance, seed=0)

-gb_incidence = GradientBoostingIncidence(
-    learning_rate=0.03,
-    n_iter=200,
-    max_leaf_nodes=5,
-    hard_zero_fraction=0.1,
-    min_samples_leaf=50,
-    loss="ibs",
-    show_progressbar=False,
-    random_state=0,
-)
-
-y["event"].value_counts()
 # %%
 #
 # CIFs estimated on uncensored data
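
Note: with return_uncensored_data=True, complex_data now returns both the censored and the uncensored event tables, and the rest of the example fits and evaluates on each in turn. A quick sanity check on the returned frames (using the event/duration columns this example already relies on; the exact censoring rate depends on the draw):

    # censor_data marks censored rows with event == 0.
    censoring_rate = (y_censored["event"] == 0).mean()
    print(f"observed censoring rate: {censoring_rate:.1%}")

    # Durations can only shrink under censoring.
    assert (y_censored["duration"] <= y_uncensored["duration"]).all()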
@@ -110,6 +98,7 @@ def plot_cumulative_incidence_functions(
         duration = perf_counter() - tic
         print(f"Aalen-Johansen for event {event_id} fit in {duration:.3f} s")
         aj.plot(label="Aalen-Johansen", ax=ax)
+        print(aj.cumulative_density_.values[-1])

         if event_id == 1:
             ax.legend(loc="lower right")
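
Note: the added print reports the terminal value of the fitted Aalen-Johansen curve. In lifelines, AalenJohansenFitter.cumulative_density_ is a one-column DataFrame indexed by event time, so its last row is the cumulative incidence estimate at the largest observed time, a convenient scalar to compare across the censored and uncensored fits. Equivalently, assuming aj has already been fit as above:

    terminal_cif = aj.cumulative_density_.iloc[-1, 0]
    print(f"terminal CIF estimate: {terminal_cif:.3f}")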
@@ -119,13 +108,37 @@

 # %%

+X_train, X_test, y_train_c, y_test_c = train_test_split(
+    X, y_censored, test_size=0.3, random_state=seed
+)
+y_train_u = y_uncensored.loc[y_train_c.index]
+y_test_u = y_uncensored.loc[y_test_c.index]
+gb_incidence = GradientBoostingIncidence(
+    learning_rate=0.05,
+    n_iter=1_000,
+    max_leaf_nodes=50,
+    hard_zero_fraction=0.1,
+    min_samples_leaf=5,
+    loss="inll",
+    show_progressbar=False,
+    random_state=seed,
+)
+
 plot_cumulative_incidence_functions(
     X_train,
-    y_train,
+    y_train_u,
     gb_incidence=gb_incidence,
     aj=aj,
     X_test=X_test,
-    y_test=y_test,
+    y_test=y_test_u,
 )

+plot_cumulative_incidence_functions(
+    X_train,
+    y_train_c,
+    gb_incidence=gb_incidence,
+    aj=aj,
+    X_test=X_test,
+    y_test=y_test_c,
+)
 # %%
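
Note: the GradientBoostingIncidence configuration that moved into this hunk trains with loss="inll" (presumably an integrated negative log-likelihood) where the configuration removed earlier used loss="ibs" (integrated Brier score), and it is a larger model (n_iter=1_000, max_leaf_nodes=50, min_samples_leaf=5). The two plot_cumulative_incidence_functions calls then contrast the same estimator trained on the uncensored targets (y_train_u) against the censored ones (y_train_c). A hypothetical side-by-side of the two losses, assuming the sklearn-style fit(X, y) interface that the plotting helper presumably uses internally:

    # Both loss strings appear in this diff; other parameters left at defaults.
    for loss in ("ibs", "inll"):
        model = GradientBoostingIncidence(
            loss=loss,
            show_progressbar=False,
            random_state=seed,
        )
        model.fit(X_train, y_train_c)  # y holds "event" and "duration" columns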
