Skip to content

Commit

Permalink
patches to deal with categorical data in xgboost
Browse files Browse the repository at this point in the history
  • Loading branch information
Raul committed Jul 1, 2022
1 parent 6247a15 commit 60fce5b
Show file tree
Hide file tree
Showing 6 changed files with 146 additions and 63 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,5 @@ cython_debug/

.DS_Store
.vscode/

.idea
19 changes: 16 additions & 3 deletions xgbse/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,13 @@ def __init__(self):
self.bst = None

def get_neighbors(
self, query_data, index_data=None, query_id=None, index_id=None, n_neighbors=30
self,
query_data,
index_data=None,
query_id=None,
index_id=None,
n_neighbors=30,
enable_categorical: bool = False
):
"""
Search for prototypes (size: n_neighbors) for each unit in a
Expand All @@ -36,6 +42,13 @@ def get_neighbors(
n_neighbors (int): Number of neighbors/comparables to be considered.
enable_categorical (bool, optional):
.. versionadded:: 1.3.0
.. note:: This parameter is experimental
Experimental support of specializing for categorical features. Do not set
to True unless you are interested in development. Also, JSON/UBJSON
serialization format is required.
Returns:
comps_df (pd.DataFrame): A dataframe of comparables/neighbors for each
evaluated sample. If units identifier is specified, the output dataframe
Expand All @@ -57,7 +70,7 @@ def get_neighbors(
index_id = self.index_id
index = self.tree
else:
index_matrix = xgb.DMatrix(index_data)
index_matrix = xgb.DMatrix(index_data, enable_categorical=enable_categorical)
index_leaves = self.bst.predict(
index_matrix,
pred_leaf=True,
Expand All @@ -68,7 +81,7 @@ def get_neighbors(
index_leaves = index_leaves.reshape(-1, 1)
index = BallTree(index_leaves, metric="hamming")

query_matrix = xgb.DMatrix(query_data)
query_matrix = xgb.DMatrix(query_data, enable_categorical=enable_categorical)
query_leaves = self.bst.predict(
query_matrix,
pred_leaf=True,
Expand Down
11 changes: 9 additions & 2 deletions xgbse/_debiased_bce.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ def _predict_from_lr_list(self, lr_estimators, leaves_encoded, time_bins):
# to cumulative survival curve
return hazard_to_survival(preds)

def predict(self, X, return_interval_probs=False):
def predict(self, X, return_interval_probs=False, enable_categorical: bool = False):
"""
Predicts survival probabilities using the XGBoost + Logistic Regression pipeline.
Expand All @@ -360,6 +360,13 @@ def predict(self, X, return_interval_probs=False):
supposed to be returned. If False the cumulative survival is returned.
Default is False.
enable_categorical (bool, optional):
.. versionadded:: 1.3.0
.. note:: This parameter is experimental
Experimental support of specializing for categorical features. Do not set
to True unless you are interested in development. Also, JSON/UBJSON
serialization format is required.
Returns:
pd.DataFrame: A dataframe of survival probabilities
for all times (columns), from a time_bins array, for all samples of X
Expand All @@ -368,7 +375,7 @@ def predict(self, X, return_interval_probs=False):
"""

# converting to xgb format
d_matrix = xgb.DMatrix(X)
d_matrix = xgb.DMatrix(X, enable_categorical=enable_categorical)

# getting leaves and extracting neighbors
leaves = self.bst.predict(
Expand Down
101 changes: 67 additions & 34 deletions xgbse/_kaplan_neighbors.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,16 +101,17 @@ def __init__(self, xgb_params=None, n_neighbors=30, radius=None):
self.feature_importances_ = None

def fit(
self,
X,
y,
num_boost_round=1000,
validation_data=None,
early_stopping_rounds=None,
verbose_eval=0,
persist_train=True,
index_id=None,
time_bins=None,
self,
X,
y,
num_boost_round=1000,
validation_data=None,
early_stopping_rounds=None,
verbose_eval=0,
persist_train=True,
index_id=None,
time_bins=None,
enable_categorical: bool = False
):
"""
Transform feature space by fitting a XGBoost model and outputting its leaf indices.
Expand Down Expand Up @@ -142,6 +143,13 @@ def fit(
time_bins (np.array): Specified time windows to use when making survival predictions
enable_categorical (bool, optional):
.. versionadded:: 1.3.0
.. note:: This parameter is experimental
Experimental support of specializing for categorical features. Do not set
to True unless you are interested in development. Also, JSON/UBJSON
serialization format is required.
Returns:
XGBSEKaplanNeighbors: Fitted instance of XGBSEKaplanNeighbors
"""
Expand All @@ -152,14 +160,14 @@ def fit(
self.time_bins = time_bins

# converting data to xgb format
dtrain = convert_data_to_xgb_format(X, y, self.xgb_params["objective"])
dtrain = convert_data_to_xgb_format(X, y, self.xgb_params["objective"], enable_categorical=enable_categorical)

# converting validation data to xgb format
evals = ()
if validation_data:
X_val, y_val = validation_data
dvalid = convert_data_to_xgb_format(
X_val, y_val, self.xgb_params["objective"]
X_val, y_val, self.xgb_params["objective"], enable_categorical=enable_categorical
)
evals = [(dvalid, "validation")]

Expand Down Expand Up @@ -190,12 +198,13 @@ def fit(
return self

def predict(
self,
X,
time_bins=None,
return_ci=False,
ci_width=0.683,
return_interval_probs=False,
self,
X,
time_bins=None,
return_ci=False,
ci_width=0.683,
return_interval_probs=False,
enable_categorical: bool = False
):
"""
Make queries to nearest neighbor search index build on the transformed XGBoost space.
Expand All @@ -213,6 +222,13 @@ def predict(
return_interval_probs (Bool): Boolean indicating if interval probabilities are
supposed to be returned. If False the cumulative survival is returned.
enable_categorical (bool, optional):
.. versionadded:: 1.3.0
.. note:: This parameter is experimental
Experimental support of specializing for categorical features. Do not set
to True unless you are interested in development. Also, JSON/UBJSON
serialization format is required.
Returns:
(pd.DataFrame): A dataframe of survival probabilities
Expand All @@ -228,7 +244,7 @@ def predict(
"""

# converting to xgb format
d_matrix = xgb.DMatrix(X)
d_matrix = xgb.DMatrix(X, enable_categorical=enable_categorical)

# getting leaves and extracting neighbors
leaves = self.bst.predict(
Expand Down Expand Up @@ -271,7 +287,8 @@ def predict(

if return_ci and return_interval_probs:
raise ValueError(
"Confidence intervals for interval probabilities is not supported. Choose between return_ci and return_interval_probs."
"Confidence intervals for interval probabilities is not supported. "
"Choose between return_ci and return_interval_probs."
)

if return_interval_probs:
Expand Down Expand Up @@ -315,8 +332,8 @@ class XGBSEKaplanTree(XGBSEBaseEstimator):
"""

def __init__(
self,
xgb_params=None,
self,
xgb_params=None,
):
"""
Args:
Expand Down Expand Up @@ -347,14 +364,15 @@ def __init__(
self.feature_importances_ = None

def fit(
self,
X,
y,
persist_train=True,
index_id=None,
time_bins=None,
ci_width=0.683,
**xgb_kwargs,
self,
X,
y,
persist_train=True,
index_id=None,
time_bins=None,
ci_width=0.683,
enable_categorical: bool = False,
**xgb_kwargs,
):
"""
Fit a single decision tree using xgboost. For each leaf in the tree,
Expand All @@ -381,6 +399,13 @@ def fit(
ci_width (Float): Width of confidence interval
enable_categorical (bool, optional):
.. versionadded:: 1.3.0
.. note:: This parameter is experimental
Experimental support of specializing for categorical features. Do not set
to True unless you are interested in development. Also, JSON/UBJSON
serialization format is required.
Returns:
XGBSEKaplanTree: Trained instance of XGBSEKaplanTree
"""
Expand All @@ -391,7 +416,7 @@ def fit(
self.time_bins = time_bins

# converting data to xgb format
dtrain = convert_data_to_xgb_format(X, y, self.xgb_params["objective"])
dtrain = convert_data_to_xgb_format(X, y, self.xgb_params["objective"], enable_categorical=enable_categorical)

# training XGB
self.bst = xgb.train(self.xgb_params, dtrain, num_boost_round=1, **xgb_kwargs)
Expand Down Expand Up @@ -437,7 +462,7 @@ def fit(

return self

def predict(self, X, return_ci=False, return_interval_probs=False):
def predict(self, X, return_ci=False, return_interval_probs=False, enable_categorical: bool = False):
"""
Run samples through tree until terminal nodes. Predict the Kaplan-Meier
estimator associated to the leaf node each sample ended into.
Expand All @@ -450,6 +475,13 @@ def predict(self, X, return_ci=False, return_interval_probs=False):
return_interval_probs (Bool): Boolean indicating if interval probabilities are
supposed to be returned. If False the cumulative survival is returned.
enable_categorical (bool, optional):
.. versionadded:: 1.3.0
.. note:: This parameter is experimental
Experimental support of specializing for categorical features. Do not set
to True unless you are interested in development. Also, JSON/UBJSON
serialization format is required.
Returns:
preds_df (pd.DataFrame): A dataframe of survival probabilities
Expand All @@ -465,7 +497,7 @@ def predict(self, X, return_ci=False, return_interval_probs=False):
"""

# converting to xgb format
d_matrix = xgb.DMatrix(X)
d_matrix = xgb.DMatrix(X, enable_categorical=enable_categorical)

# getting leaves and extracting neighbors
leaves = self.bst.predict(
Expand All @@ -479,7 +511,8 @@ def predict(self, X, return_ci=False, return_interval_probs=False):

if return_ci and return_interval_probs:
raise ValueError(
"Confidence intervals for interval probabilities is not supported. Choose between return_ci and return_interval_probs."
"Confidence intervals for interval probabilities is not supported. "
"Choose between return_ci and return_interval_probs."
)

if return_interval_probs:
Expand Down
Loading

0 comments on commit 60fce5b

Please sign in to comment.