Minor refactor (#66)
* feat: splitting xgboost train into a feature extractor class

* in progress:

* consolidate xgboost in feature extractor

* consolidate feature extractor into base class

* rerun examples

* add categorical support

* update docstrings

* fix formatting
GabrielGimenez authored Jul 4, 2024
1 parent 6247a15 commit 56fa631
Showing 21 changed files with 1,956 additions and 1,777 deletions.
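The core of the refactor is the new FeatureExtractor class, which now owns XGBoost training for the estimators. A minimal sketch of the interface, inferred from the tests added in this commit (tests/test_feature_extractors.py below); exact constructor defaults beyond the objective are assumptions:

import pytest

from tests.data import get_data
from xgbse._feature_extractors import FeatureExtractor

# fixtures used throughout the test suite (GBSG2 breast cancer data)
X_train, X_test, X_valid, *_, y_train, y_test, y_valid, feat = get_data()

# the objective defaults to survival:aft; reg:squarederror raises ValueError
extractor = FeatureExtractor()
extractor.fit(
    X_train,
    y_train,
    num_boost_round=100,
    validation_data=(X_valid, y_valid),
    early_stopping_rounds=10,
    verbose_eval=0,
)

leaves = extractor.predict_leaves(X_test)  # (n_samples, n_trees) leaf indices
hazard = extractor.predict_hazard(X_test)  # (n_samples,) point predictions
importances = extractor.feature_importances_  # same dict as extractor.bst.get_score()

Consolidating training here lets every estimator (XGBSEDebiasedBCE, XGBSEKaplanNeighbors, and the rest) share one fit/predict path instead of each carrying its own XGBoost boilerplate.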
924 changes: 228 additions & 696 deletions examples/basic_usage.ipynb

Large diffs are not rendered by default.

9 changes: 3 additions & 6 deletions examples/benchmarks/benchmark.py
@@ -3,12 +3,8 @@
 import numpy as np
 import pandas as pd
 
-
-from xgbse.metrics import concordance_index, approx_brier_score, dist_calibration_score
-from xgbse.converters import (
-    convert_data_to_xgb_format,
-    convert_to_structured,
-)
+from xgbse.converters import convert_data_to_xgb_format, convert_to_structured
+from xgbse.metrics import approx_brier_score, concordance_index, dist_calibration_score
 
 # setting seed
 np.random.seed(42)
@@ -244,6 +240,7 @@ def train(self):
             validation_data=(self.X_valid, self.y_valid),
             early_stopping_rounds=10,
             time_bins=self.time_bins,
+            num_boost_round=1000,
         )
 
     else:
89 changes: 53 additions & 36 deletions examples/confidence_interval.ipynb

Large diffs are not rendered by default.

627 changes: 532 additions & 95 deletions examples/extrapolation_example.ipynb

Large diffs are not rendered by default.

1,124 changes: 587 additions & 537 deletions examples/how_xgbse_works.ipynb

Large diffs are not rendered by default.

39 changes: 39 additions & 0 deletions tests/data.py
@@ -1,3 +1,4 @@
+import pandas as pd
 from lifelines.datasets import load_gbsg2
 from sklearn.model_selection import train_test_split
 
@@ -39,3 +40,41 @@ def get_data():
         y_valid,
         feat,
     )
+
+
+def get_data_with_categorical():
+    data = load_gbsg2()
+
+    feat = ["age", "tsize", "pnodes", "progrec", "estrec", "menostat"]
+    X = data[feat]
+    X["menostat"] = pd.Categorical(X["menostat"])
+    T = data["time"]
+    E = data["cens"]
+
+    split_params = {"test_size": 0.2, "random_state": 0}
+    X_train, X_test, T_train, T_test, E_train, E_test = train_test_split(
+        X, T, E, **split_params
+    )
+    X_train, X_valid, T_train, T_valid, E_train, E_valid = train_test_split(
+        X_train, T_train, E_train, **split_params
+    )
+
+    y_train = convert_to_structured(T_train, E_train)
+    y_test = convert_to_structured(T_test, E_test)
+    y_valid = convert_to_structured(T_valid, E_valid)
+
+    return (
+        X_train,
+        X_test,
+        X_valid,
+        T_train,
+        T_test,
+        T_valid,
+        E_train,
+        E_test,
+        E_valid,
+        y_train,
+        y_test,
+        y_valid,
+        feat,
+    )
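The new fixture casts menostat to a pandas Categorical, exercising the categorical support added in this commit. A hedged sketch of how it combines with the new enable_categorical flag, inferred from tests/test_survival_curves.py below (that the flag is forwarded to XGBoost's native categorical handling is an assumption):

from tests.data import get_data_with_categorical
from xgbse import XGBSEKaplanTree

# fixtures with menostat as a pandas Categorical column
X_train, X_test, X_valid, *_, y_train, y_test, y_valid, feat = get_data_with_categorical()

# assumed: enable_categorical lets the booster consume the Categorical column
# directly, with no manual one-hot or label encoding
model = XGBSEKaplanTree(enable_categorical=True)
model.fit(X_train, y_train)
survival_curves = model.predict(X_test)  # one survival curve per row of X_test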
10 changes: 5 additions & 5 deletions tests/test_extrapolation.py
@@ -1,8 +1,8 @@
+from tests.data import get_data
+from tests.test_survival_curves import between_01, monotonicity
 from xgbse import XGBSEDebiasedBCE
 from xgbse.extrapolation import extrapolate_constant_risk
-from xgbse.non_parametric import get_time_bins, calculate_kaplan_vectorized
-from tests.data import get_data
-from tests.test_survival_curves import monotonicity, between_01
+from xgbse.non_parametric import calculate_kaplan_vectorized, get_time_bins
 
 (
     X_train,
@@ -23,7 +23,7 @@
 
 # generating Kaplan Meier for all tests
 
-time_bins = get_time_bins(T_train, E_train, 100)
+time_bins = get_time_bins(T_train, E_train, size=30)
 
 mean, high, low = calculate_kaplan_vectorized(
     T_train.values.reshape(1, -1), E_train.values.reshape(1, -1), time_bins
@@ -34,7 +34,7 @@
 xgbse_model.fit(
     X_train,
     y_train,
-    num_boost_round=1000,
+    num_boost_round=100,
     validation_data=(X_valid, y_valid),
     early_stopping_rounds=10,
     verbose_eval=0,
106 changes: 106 additions & 0 deletions tests/test_feature_extractors.py
@@ -0,0 +1,106 @@
import pytest

from tests.data import get_data
from xgbse._feature_extractors import FeatureExtractor

(
    X_train,
    X_test,
    X_valid,
    T_train,
    T_test,
    T_valid,
    E_train,
    E_test,
    E_valid,
    y_train,
    y_test,
    y_valid,
    features,
) = get_data()


def test_wrong_objective():
    with pytest.raises(ValueError):
        FeatureExtractor(xgb_params={"objective": "reg:squarederror"})


def test_no_objective():
    assert FeatureExtractor(xgb_params={}).xgb_params["objective"] == "survival:aft"


def test_predict_leaves_early_stop():
    xgbse = FeatureExtractor()
    xgbse.fit(
        X_train,
        y_train,
        num_boost_round=10000,
        validation_data=(X_valid, y_valid),
        early_stopping_rounds=10,
        verbose_eval=0,
    )
    prediction = xgbse.predict_leaves(X_test)
    assert prediction.shape == (
        X_test.shape[0],
        xgbse.bst.best_iteration + 1,
    )


def test_predict_leaves_no_early_stop():
    xgbse = FeatureExtractor()
    xgbse.fit(
        X_train,
        y_train,
        num_boost_round=100,
        validation_data=(X_valid, y_valid),
        early_stopping_rounds=None,
        verbose_eval=0,
    )
    assert xgbse.predict_leaves(X_test).shape == (X_test.shape[0], 100)


def test_predict_hazard_early_stop():
    xgbse = FeatureExtractor()
    xgbse.fit(
        X_train,
        y_train,
        num_boost_round=10000,
        validation_data=(X_valid, y_valid),
        early_stopping_rounds=10,
        verbose_eval=0,
    )
    assert xgbse.predict_hazard(X_test).shape == (X_test.shape[0],)


def test_predict_hazard_no_early_stop():
    xgbse = FeatureExtractor()
    xgbse.fit(
        X_train,
        y_train,
        num_boost_round=100,
        validation_data=(X_valid, y_valid),
        early_stopping_rounds=None,
        verbose_eval=0,
    )
    assert xgbse.predict_hazard(X_test).shape == (X_test.shape[0],)


def test_feature_importances():
    xgbse = FeatureExtractor()
    xgbse.fit(
        X_train,
        y_train,
        num_boost_round=100,
        validation_data=(X_valid, y_valid),
        early_stopping_rounds=None,
        verbose_eval=0,
    )
    assert xgbse.feature_importances_ == xgbse.bst.get_score()


def test_predict_not_fitted():
    xgbse = FeatureExtractor()
    with pytest.raises(ValueError):
        xgbse.predict_leaves(X_test)
    with pytest.raises(ValueError):
        xgbse.predict_hazard(X_test)
3 changes: 0 additions & 3 deletions tests/test_meta.py
@@ -28,7 +28,6 @@
 
 
 def is_ci_width_consistent(bootstrap, X):
-
     mean1, high1, low1 = bootstrap.predict(X, return_ci=True, ci_width=0.683)
     mean2, high2, low2 = bootstrap.predict(X, return_ci=True, ci_width=0.95)
 
@@ -43,7 +42,6 @@ def is_ci_width_consistent(bootstrap, X):
     "model", [XGBSEDebiasedBCE, XGBSEKaplanNeighbors, XGBSEKaplanTree]
 )
 def test_ci_width_consistency(model):
-
     model = model()
     bootstrap = XGBSEBootstrapEstimator(model)
     bootstrap.fit(X_train, y_train)
@@ -52,7 +50,6 @@ def test_ci_width_consistency(model):
 
 
 def test_accuracy_improvement():
-
     base_model = XGBSEKaplanTree()
     base_model.fit(X_train, y_train)
 
15 changes: 5 additions & 10 deletions tests/test_metrics.py
@@ -1,12 +1,10 @@
-import pandas as pd
 import numpy as np
+import pandas as pd
 
-from xgbse.metrics import concordance_index, approx_brier_score, dist_calibration_score
-
-from xgbse import XGBSEDebiasedBCE
-
-from xgbse.non_parametric import get_time_bins, calculate_kaplan_vectorized
 from tests.data import get_data
+from xgbse import XGBSEDebiasedBCE
+from xgbse.metrics import approx_brier_score, concordance_index, dist_calibration_score
+from xgbse.non_parametric import calculate_kaplan_vectorized, get_time_bins
 
 (
     X_train,
@@ -27,7 +25,7 @@
 
 # generating Kaplan Meier for all tests
 
-time_bins = get_time_bins(T_train, E_train, 100)
+time_bins = get_time_bins(T_train, E_train, size=100)
 
 mean, high, low = calculate_kaplan_vectorized(
     T_train.values.reshape(1, -1), E_train.values.reshape(1, -1), time_bins
@@ -77,7 +75,6 @@ def is_dist_cal_return_correct_type():
 
 
 def test_concordance_index():
-
     assert concordance_index(y_train, km_survival) == 0.5
     assert concordance_index(y_test, preds) > 0.5
     assert np.isclose(
@@ -93,15 +90,13 @@ def test_concordance_index():
 
 
 def test_approx_brier_score():
-
     assert approx_brier_score(y_test, preds) < 0.25
     assert approx_brier_score(y_train, km_survival) < 0.2
     assert approx_brier_score(y_test, dummy_preds) == 0.25
     assert is_brier_score_return_correct_len()
 
 
 def test_dist_calibration_score():
-
     assert dist_calibration_score(y_train, km_survival) > 0.90
     assert dist_calibration_score(y_train, km_survival, returns="statistic") < 1.0
     assert dist_calibration_score(y_train, km_survival, returns="max_deviation") < 0.01
28 changes: 21 additions & 7 deletions tests/test_survival_curves.py
@@ -1,14 +1,14 @@
 import numpy as np
 import pytest
-from xgbse.metrics import concordance_index
 
-from tests.data import get_data
+from tests.data import get_data_with_categorical
 from xgbse import (
     XGBSEDebiasedBCE,
     XGBSEKaplanNeighbors,
     XGBSEKaplanTree,
     XGBSEStackedWeibull,
 )
+from xgbse.metrics import concordance_index
 
 (
     X_train,
@@ -24,7 +24,7 @@
     y_test,
     y_valid,
     features,
-) = get_data()
+) = get_data_with_categorical()
 
 
 def monotonicity(survival_curves):
@@ -50,7 +50,7 @@ def assert_survival_curve(xgbse, test, preds, cindex):
     "model", [XGBSEDebiasedBCE, XGBSEKaplanNeighbors, XGBSEStackedWeibull]
 )
 def test_survival_curve(model):
-    xgbse = model()
+    xgbse = model(enable_categorical=True)
 
     xgbse.fit(
         X_train,
@@ -71,7 +71,7 @@ def test_survival_curve(model):
     "model", [XGBSEDebiasedBCE, XGBSEKaplanNeighbors, XGBSEStackedWeibull]
 )
 def test_survival_curve_without_early_stopping(model):
-    xgbse = model()
+    xgbse = model(enable_categorical=True)
 
     xgbse.fit(
         X_train,
Expand All @@ -85,7 +85,7 @@ def test_survival_curve_without_early_stopping(model):


def test_survival_curve_tree():
xgbse = XGBSEKaplanTree()
xgbse = XGBSEKaplanTree(enable_categorical=True)

xgbse.fit(X_train, y_train)

@@ -97,7 +97,21 @@ def test_survival_curve_tree():
 
 @pytest.mark.parametrize("model", [XGBSEKaplanTree, XGBSEKaplanNeighbors])
 def test_time_bins(model):
-    xgbse = model()
+    xgbse = model(enable_categorical=True)
+
+    bins = np.linspace(100, 1000, 6)
+    xgbse.fit(X_train, y_train, time_bins=bins)
+
+    preds = xgbse.predict(X_test)
+    assert (preds.columns == bins).all()
+
+
+@pytest.mark.parametrize(
+    "model",
+    [XGBSEDebiasedBCE, XGBSEKaplanNeighbors, XGBSEKaplanTree, XGBSEStackedWeibull],
+)
+def test_categorical_features(model):
+    xgbse = model(enable_categorical=True)
 
     bins = np.linspace(100, 1000, 6)
     xgbse.fit(X_train, y_train, time_bins=bins)
9 changes: 8 additions & 1 deletion tests/test_utils.py
@@ -21,4 +21,11 @@
 
 
 def test_convert_value_error():
-    assert_raises(ValueError, convert_data_to_xgb_format, X_train, y_train, "blablabla")
+    assert_raises(
+        ValueError,
+        convert_data_to_xgb_format,
+        X_train,
+        y_train,
+        "blablabla",
+        enable_categorical=False,
+    )
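The converter change above shows convert_data_to_xgb_format gaining an enable_categorical keyword alongside its positional objective argument. A sketch of a direct call, assuming the function still returns an xgb.DMatrix as in earlier releases:

from lifelines.datasets import load_gbsg2

from xgbse.converters import convert_data_to_xgb_format, convert_to_structured

data = load_gbsg2()
X = data[["age", "tsize", "pnodes"]]
# structured array of (event, time) pairs, the label format xgbse expects
y = convert_to_structured(data["time"], data["cens"])

# an unrecognized objective string raises ValueError, which is what the test asserts
dtrain = convert_data_to_xgb_format(X, y, "survival:aft", enable_categorical=False)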