Minor refactor (#66)
* feat: splitting xgboost train into a feature extractor class

* in progress:

* consolidate xgboost in feature extractor

* consolidate feature extractor into base class

* rerun examples

* add categorical support

* update docstrings

* fix formatting
GabrielGimenez authored Jul 4, 2024
1 parent 6247a15 commit 56fa631
Showing 21 changed files with 1,956 additions and 1,777 deletions.
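The core of the refactor is the new FeatureExtractor class, which now owns XGBoost training for the estimators. A minimal sketch of the interface, inferred from the tests added in this commit (tests/test_feature_extractors.py below); exact constructor defaults beyond the objective are assumptions:

import pytest

from tests.data import get_data
from xgbse._feature_extractors import FeatureExtractor

# fixtures used throughout the test suite (GBSG2 breast cancer data)
X_train, X_test, X_valid, *_, y_train, y_test, y_valid, feat = get_data()

# the objective defaults to survival:aft; reg:squarederror raises ValueError
extractor = FeatureExtractor()
extractor.fit(
    X_train,
    y_train,
    num_boost_round=100,
    validation_data=(X_valid, y_valid),
    early_stopping_rounds=10,
    verbose_eval=0,
)

leaves = extractor.predict_leaves(X_test)  # (n_samples, n_trees) leaf indices
hazard = extractor.predict_hazard(X_test)  # (n_samples,) point predictions
importances = extractor.feature_importances_  # same dict as extractor.bst.get_score()

Consolidating training here lets every estimator (XGBSEDebiasedBCE, XGBSEKaplanNeighbors, and the rest) share one fit/predict path instead of each carrying its own XGBoost boilerplate.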
924 changes: 228 additions & 696 deletions examples/basic_usage.ipynb

Large diffs are not rendered by default.

9 changes: 3 additions & 6 deletions examples/benchmarks/benchmark.py
@@ -3,12 +3,8 @@
 import numpy as np
 import pandas as pd
 
-
-from xgbse.metrics import concordance_index, approx_brier_score, dist_calibration_score
-from xgbse.converters import (
-    convert_data_to_xgb_format,
-    convert_to_structured,
-)
+from xgbse.converters import convert_data_to_xgb_format, convert_to_structured
+from xgbse.metrics import approx_brier_score, concordance_index, dist_calibration_score
 
 # setting seed
 np.random.seed(42)
@@ -244,6 +240,7 @@ def train(self):
             validation_data=(self.X_valid, self.y_valid),
             early_stopping_rounds=10,
             time_bins=self.time_bins,
+            num_boost_round=1000,
         )
 
     else:
89 changes: 53 additions & 36 deletions examples/confidence_interval.ipynb

Large diffs are not rendered by default.

627 changes: 532 additions & 95 deletions examples/extrapolation_example.ipynb

Large diffs are not rendered by default.

1,124 changes: 587 additions & 537 deletions examples/how_xgbse_works.ipynb

Large diffs are not rendered by default.

39 changes: 39 additions & 0 deletions tests/data.py
@@ -1,3 +1,4 @@
+import pandas as pd
 from lifelines.datasets import load_gbsg2
 from sklearn.model_selection import train_test_split
 
@@ -39,3 +40,41 @@ def get_data():
         y_valid,
         feat,
     )
+
+
+def get_data_with_categorical():
+    data = load_gbsg2()
+
+    feat = ["age", "tsize", "pnodes", "progrec", "estrec", "menostat"]
+    X = data[feat]
+    X["menostat"] = pd.Categorical(X["menostat"])
+    T = data["time"]
+    E = data["cens"]
+
+    split_params = {"test_size": 0.2, "random_state": 0}
+    X_train, X_test, T_train, T_test, E_train, E_test = train_test_split(
+        X, T, E, **split_params
+    )
+    X_train, X_valid, T_train, T_valid, E_train, E_valid = train_test_split(
+        X_train, T_train, E_train, **split_params
+    )
+
+    y_train = convert_to_structured(T_train, E_train)
+    y_test = convert_to_structured(T_test, E_test)
+    y_valid = convert_to_structured(T_valid, E_valid)
+
+    return (
+        X_train,
+        X_test,
+        X_valid,
+        T_train,
+        T_test,
+        T_valid,
+        E_train,
+        E_test,
+        E_valid,
+        y_train,
+        y_test,
+        y_valid,
+        feat,
+    )
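The new fixture casts menostat to a pandas Categorical, exercising the categorical support added in this commit. A hedged sketch of how it combines with the new enable_categorical flag, inferred from tests/test_survival_curves.py below (that the flag is forwarded to XGBoost's native categorical handling is an assumption):

from tests.data import get_data_with_categorical
from xgbse import XGBSEKaplanTree

# fixtures with menostat as a pandas Categorical column
X_train, X_test, X_valid, *_, y_train, y_test, y_valid, feat = get_data_with_categorical()

# assumed: enable_categorical lets the booster consume the Categorical column
# directly, with no manual one-hot or label encoding
model = XGBSEKaplanTree(enable_categorical=True)
model.fit(X_train, y_train)
survival_curves = model.predict(X_test)  # one survival curve per row of X_test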
10 changes: 5 additions & 5 deletions tests/test_extrapolation.py
@@ -1,8 +1,8 @@
+from tests.data import get_data
+from tests.test_survival_curves import between_01, monotonicity
 from xgbse import XGBSEDebiasedBCE
 from xgbse.extrapolation import extrapolate_constant_risk
-from xgbse.non_parametric import get_time_bins, calculate_kaplan_vectorized
-from tests.data import get_data
-from tests.test_survival_curves import monotonicity, between_01
+from xgbse.non_parametric import calculate_kaplan_vectorized, get_time_bins
 
 (
     X_train,
@@ -23,7 +23,7 @@
 
 # generating Kaplan Meier for all tests
 
-time_bins = get_time_bins(T_train, E_train, 100)
+time_bins = get_time_bins(T_train, E_train, size=30)
 
 mean, high, low = calculate_kaplan_vectorized(
     T_train.values.reshape(1, -1), E_train.values.reshape(1, -1), time_bins
@@ -34,7 +34,7 @@
 xgbse_model.fit(
     X_train,
     y_train,
-    num_boost_round=1000,
+    num_boost_round=100,
     validation_data=(X_valid, y_valid),
     early_stopping_rounds=10,
     verbose_eval=0,
106 changes: 106 additions & 0 deletions tests/test_feature_extractors.py
@@ -0,0 +1,106 @@
import pytest

from tests.data import get_data
from xgbse._feature_extractors import FeatureExtractor

(
    X_train,
    X_test,
    X_valid,
    T_train,
    T_test,
    T_valid,
    E_train,
    E_test,
    E_valid,
    y_train,
    y_test,
    y_valid,
    features,
) = get_data()


def test_wrong_objective():
    with pytest.raises(ValueError):
        FeatureExtractor(xgb_params={"objective": "reg:squarederror"})


def test_no_objective():
    assert FeatureExtractor(xgb_params={}).xgb_params["objective"] == "survival:aft"


def test_predict_leaves_early_stop():
    xgbse = FeatureExtractor()
    xgbse.fit(
        X_train,
        y_train,
        num_boost_round=10000,
        validation_data=(X_valid, y_valid),
        early_stopping_rounds=10,
        verbose_eval=0,
    )
    prediction = xgbse.predict_leaves(X_test)
    assert prediction.shape == (
        X_test.shape[0],
        xgbse.bst.best_iteration + 1,
    )


def test_predict_leaves_no_early_stop():
    xgbse = FeatureExtractor()
    xgbse.fit(
        X_train,
        y_train,
        num_boost_round=100,
        validation_data=(X_valid, y_valid),
        early_stopping_rounds=None,
        verbose_eval=0,
    )
    assert xgbse.predict_leaves(X_test).shape == (X_test.shape[0], 100)


def test_predict_hazard_early_stop():
    xgbse = FeatureExtractor()
    xgbse.fit(
        X_train,
        y_train,
        num_boost_round=10000,
        validation_data=(X_valid, y_valid),
        early_stopping_rounds=10,
        verbose_eval=0,
    )
    assert xgbse.predict_hazard(X_test).shape == (X_test.shape[0],)


def test_predict_hazard_no_early_stop():
    xgbse = FeatureExtractor()
    xgbse.fit(
        X_train,
        y_train,
        num_boost_round=100,
        validation_data=(X_valid, y_valid),
        early_stopping_rounds=None,
        verbose_eval=0,
    )
    assert xgbse.predict_hazard(X_test).shape == (X_test.shape[0],)


def test_feature_importances():
    xgbse = FeatureExtractor()
    xgbse.fit(
        X_train,
        y_train,
        num_boost_round=100,
        validation_data=(X_valid, y_valid),
        early_stopping_rounds=None,
        verbose_eval=0,
    )
    assert xgbse.feature_importances_ == xgbse.bst.get_score()


def test_predict_not_fitted():
    xgbse = FeatureExtractor()
    with pytest.raises(ValueError):
        xgbse.predict_leaves(X_test)
    with pytest.raises(ValueError):
        xgbse.predict_hazard(X_test)
3 changes: 0 additions & 3 deletions tests/test_meta.py
@@ -28,7 +28,6 @@
 
 
 def is_ci_width_consistent(bootstrap, X):
-
     mean1, high1, low1 = bootstrap.predict(X, return_ci=True, ci_width=0.683)
     mean2, high2, low2 = bootstrap.predict(X, return_ci=True, ci_width=0.95)
 
@@ -43,7 +42,6 @@ def is_ci_width_consistent(bootstrap, X):
     "model", [XGBSEDebiasedBCE, XGBSEKaplanNeighbors, XGBSEKaplanTree]
 )
 def test_ci_width_consistency(model):
-
     model = model()
     bootstrap = XGBSEBootstrapEstimator(model)
     bootstrap.fit(X_train, y_train)
@@ -52,7 +50,6 @@ def test_ci_width_consistency(model):
 
 
 def test_accuracy_improvement():
-
     base_model = XGBSEKaplanTree()
     base_model.fit(X_train, y_train)
 
15 changes: 5 additions & 10 deletions tests/test_metrics.py
@@ -1,12 +1,10 @@
-import pandas as pd
 import numpy as np
+import pandas as pd
 
-from xgbse.metrics import concordance_index, approx_brier_score, dist_calibration_score
-
-from xgbse import XGBSEDebiasedBCE
-
-from xgbse.non_parametric import get_time_bins, calculate_kaplan_vectorized
 from tests.data import get_data
+from xgbse import XGBSEDebiasedBCE
+from xgbse.metrics import approx_brier_score, concordance_index, dist_calibration_score
+from xgbse.non_parametric import calculate_kaplan_vectorized, get_time_bins
 
 (
     X_train,
@@ -27,7 +25,7 @@
 
 # generating Kaplan Meier for all tests
 
-time_bins = get_time_bins(T_train, E_train, 100)
+time_bins = get_time_bins(T_train, E_train, size=100)
 
 mean, high, low = calculate_kaplan_vectorized(
     T_train.values.reshape(1, -1), E_train.values.reshape(1, -1), time_bins
@@ -77,7 +75,6 @@ def is_dist_cal_return_correct_type():
 
 
 def test_concordance_index():
-
     assert concordance_index(y_train, km_survival) == 0.5
     assert concordance_index(y_test, preds) > 0.5
     assert np.isclose(
@@ -93,15 +90,13 @@ def test_concordance_index():
 
 
 def test_approx_brier_score():
-
     assert approx_brier_score(y_test, preds) < 0.25
     assert approx_brier_score(y_train, km_survival) < 0.2
     assert approx_brier_score(y_test, dummy_preds) == 0.25
     assert is_brier_score_return_correct_len()
 
 
 def test_dist_calibration_score():
-
     assert dist_calibration_score(y_train, km_survival) > 0.90
     assert dist_calibration_score(y_train, km_survival, returns="statistic") < 1.0
     assert dist_calibration_score(y_train, km_survival, returns="max_deviation") < 0.01
28 changes: 21 additions & 7 deletions tests/test_survival_curves.py
@@ -1,14 +1,14 @@
 import numpy as np
 import pytest
-from xgbse.metrics import concordance_index
 
-from tests.data import get_data
+from tests.data import get_data_with_categorical
 from xgbse import (
     XGBSEDebiasedBCE,
     XGBSEKaplanNeighbors,
     XGBSEKaplanTree,
     XGBSEStackedWeibull,
 )
+from xgbse.metrics import concordance_index
 
 (
     X_train,
@@ -24,7 +24,7 @@
     y_test,
     y_valid,
     features,
-) = get_data()
+) = get_data_with_categorical()
 
 
 def monotonicity(survival_curves):
@@ -50,7 +50,7 @@ def assert_survival_curve(xgbse, test, preds, cindex):
     "model", [XGBSEDebiasedBCE, XGBSEKaplanNeighbors, XGBSEStackedWeibull]
 )
 def test_survival_curve(model):
-    xgbse = model()
+    xgbse = model(enable_categorical=True)
 
     xgbse.fit(
         X_train,
@@ -71,7 +71,7 @@ def test_survival_curve(model):
     "model", [XGBSEDebiasedBCE, XGBSEKaplanNeighbors, XGBSEStackedWeibull]
 )
 def test_survival_curve_without_early_stopping(model):
-    xgbse = model()
+    xgbse = model(enable_categorical=True)
 
     xgbse.fit(
         X_train,
Expand All @@ -85,7 +85,7 @@ def test_survival_curve_without_early_stopping(model):


def test_survival_curve_tree():
xgbse = XGBSEKaplanTree()
xgbse = XGBSEKaplanTree(enable_categorical=True)

xgbse.fit(X_train, y_train)

@@ -97,7 +97,21 @@ def test_survival_curve_tree():
 
 @pytest.mark.parametrize("model", [XGBSEKaplanTree, XGBSEKaplanNeighbors])
 def test_time_bins(model):
-    xgbse = model()
+    xgbse = model(enable_categorical=True)
+
+    bins = np.linspace(100, 1000, 6)
+    xgbse.fit(X_train, y_train, time_bins=bins)
+
+    preds = xgbse.predict(X_test)
+    assert (preds.columns == bins).all()
+
+
+@pytest.mark.parametrize(
+    "model",
+    [XGBSEDebiasedBCE, XGBSEKaplanNeighbors, XGBSEKaplanTree, XGBSEStackedWeibull],
+)
+def test_categorical_features(model):
+    xgbse = model(enable_categorical=True)
 
     bins = np.linspace(100, 1000, 6)
     xgbse.fit(X_train, y_train, time_bins=bins)
9 changes: 8 additions & 1 deletion tests/test_utils.py
@@ -21,4 +21,11 @@
 
 
 def test_convert_value_error():
-    assert_raises(ValueError, convert_data_to_xgb_format, X_train, y_train, "blablabla")
+    assert_raises(
+        ValueError,
+        convert_data_to_xgb_format,
+        X_train,
+        y_train,
+        "blablabla",
+        enable_categorical=False,
+    )
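The converter change above shows convert_data_to_xgb_format gaining an enable_categorical keyword alongside its positional objective argument. A sketch of a direct call, assuming the function still returns an xgb.DMatrix as in earlier releases:

from lifelines.datasets import load_gbsg2

from xgbse.converters import convert_data_to_xgb_format, convert_to_structured

data = load_gbsg2()
X = data[["age", "tsize", "pnodes"]]
# structured array of (event, time) pairs, the label format xgbse expects
y = convert_to_structured(data["time"], data["cens"])

# an unrecognized objective string raises ValueError, which is what the test asserts
dtrain = convert_data_to_xgb_format(X, y, "survival:aft", enable_categorical=False)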