patches to deal with categorical data in xgboost

loft-br · Jul 1, 2022 · 60fce5b · 60fce5b
1 parent 6247a15
commit 60fce5b
Show file tree

Hide file tree

Showing 6 changed files with 146 additions and 63 deletions.
diff --git a/.gitignore b/.gitignore
@@ -139,3 +139,5 @@ cython_debug/
 
 .DS_Store
 .vscode/
+
+.idea
diff --git a/xgbse/_base.py b/xgbse/_base.py
@@ -17,7 +17,13 @@ def __init__(self):
         self.bst = None
 
     def get_neighbors(
-        self, query_data, index_data=None, query_id=None, index_id=None, n_neighbors=30
+            self,
+            query_data,
+            index_data=None,
+            query_id=None,
+            index_id=None,
+            n_neighbors=30,
+            enable_categorical: bool = False
     ):
         """
         Search for prototypes (size: n_neighbors) for each unit in a
@@ -36,6 +42,13 @@ def get_neighbors(
 
             n_neighbors (int): Number of neighbors/comparables to be considered.
 
+            enable_categorical (bool): Experimental; defaults to False. Forwarded to
+                `xgb.DMatrix` when building the index and query matrices, enabling
+                XGBoost's specialized handling of categorical features (added in
+                XGBoost 1.3.0). Do not set to True unless you are interested in
+                development. The JSON/UBJSON serialization format is required
+                when this is enabled.
+
         Returns:
             comps_df (pd.DataFrame): A dataframe of comparables/neighbors for each
             evaluated sample. If units identifier is specified, the output dataframe
@@ -57,7 +70,7 @@ def get_neighbors(
             index_id = self.index_id
             index = self.tree
         else:
-            index_matrix = xgb.DMatrix(index_data)
+            index_matrix = xgb.DMatrix(index_data, enable_categorical=enable_categorical)
             index_leaves = self.bst.predict(
                 index_matrix,
                 pred_leaf=True,
@@ -68,7 +81,7 @@ def get_neighbors(
                 index_leaves = index_leaves.reshape(-1, 1)
             index = BallTree(index_leaves, metric="hamming")
 
-        query_matrix = xgb.DMatrix(query_data)
+        query_matrix = xgb.DMatrix(query_data, enable_categorical=enable_categorical)
         query_leaves = self.bst.predict(
             query_matrix,
             pred_leaf=True,

diff --git a/xgbse/_debiased_bce.py b/xgbse/_debiased_bce.py
@@ -348,7 +348,7 @@ def _predict_from_lr_list(self, lr_estimators, leaves_encoded, time_bins):
         # to cumulative survival curve
         return hazard_to_survival(preds)
 
-    def predict(self, X, return_interval_probs=False):
+    def predict(self, X, return_interval_probs=False, enable_categorical: bool = False):
         """
         Predicts survival probabilities using the XGBoost + Logistic Regression pipeline.
 
@@ -360,6 +360,13 @@ def predict(self, X, return_interval_probs=False):
                 supposed to be returned. If False the cumulative survival is returned.
                 Default is False.
 
+            enable_categorical (bool): Experimental; defaults to False. Forwarded to
+                `xgb.DMatrix` when converting X, enabling XGBoost's specialized
+                handling of categorical features (added in XGBoost 1.3.0).
+                Do not set to True unless you are interested in development.
+                The JSON/UBJSON serialization format is required when this is
+                enabled.
+
         Returns:
             pd.DataFrame: A dataframe of survival probabilities
             for all times (columns), from a time_bins array, for all samples of X
@@ -368,7 +375,7 @@ def predict(self, X, return_interval_probs=False):
         """
 
         # converting to xgb format
-        d_matrix = xgb.DMatrix(X)
+        d_matrix = xgb.DMatrix(X, enable_categorical=enable_categorical)
 
         # getting leaves and extracting neighbors
         leaves = self.bst.predict(

diff --git a/xgbse/_kaplan_neighbors.py b/xgbse/_kaplan_neighbors.py
@@ -101,16 +101,17 @@ def __init__(self, xgb_params=None, n_neighbors=30, radius=None):
         self.feature_importances_ = None
 
     def fit(
-        self,
-        X,
-        y,
-        num_boost_round=1000,
-        validation_data=None,
-        early_stopping_rounds=None,
-        verbose_eval=0,
-        persist_train=True,
-        index_id=None,
-        time_bins=None,
+            self,
+            X,
+            y,
+            num_boost_round=1000,
+            validation_data=None,
+            early_stopping_rounds=None,
+            verbose_eval=0,
+            persist_train=True,
+            index_id=None,
+            time_bins=None,
+            enable_categorical: bool = False
     ):
         """
         Transform feature space by fitting a XGBoost model and outputting its leaf indices.
@@ -142,6 +143,13 @@ def fit(
 
             time_bins (np.array): Specified time windows to use when making survival predictions
 
+            enable_categorical (bool): Experimental; defaults to False. Forwarded to
+                `convert_data_to_xgb_format` for the training and validation data,
+                enabling XGBoost's specialized handling of categorical features
+                (added in XGBoost 1.3.0). Do not set to True unless you are
+                interested in development. The JSON/UBJSON serialization format
+                is required when this is enabled.
+
         Returns:
             XGBSEKaplanNeighbors: Fitted instance of XGBSEKaplanNeighbors
         """
@@ -152,14 +160,14 @@ def fit(
         self.time_bins = time_bins
 
         # converting data to xgb format
-        dtrain = convert_data_to_xgb_format(X, y, self.xgb_params["objective"])
+        dtrain = convert_data_to_xgb_format(X, y, self.xgb_params["objective"], enable_categorical=enable_categorical)
 
         # converting validation data to xgb format
         evals = ()
         if validation_data:
             X_val, y_val = validation_data
             dvalid = convert_data_to_xgb_format(
-                X_val, y_val, self.xgb_params["objective"]
+                X_val, y_val, self.xgb_params["objective"], enable_categorical=enable_categorical
             )
             evals = [(dvalid, "validation")]
 
@@ -190,12 +198,13 @@ def fit(
         return self
 
     def predict(
-        self,
-        X,
-        time_bins=None,
-        return_ci=False,
-        ci_width=0.683,
-        return_interval_probs=False,
+            self,
+            X,
+            time_bins=None,
+            return_ci=False,
+            ci_width=0.683,
+            return_interval_probs=False,
+            enable_categorical: bool = False
     ):
         """
         Make queries to nearest neighbor search index built on the transformed XGBoost space.
@@ -213,6 +222,13 @@ def predict(
             return_interval_probs (Bool): Boolean indicating if interval probabilities are
                 supposed to be returned. If False the cumulative survival is returned.
 
+            enable_categorical (bool): Experimental; defaults to False. Forwarded to
+                `xgb.DMatrix` when converting X, enabling XGBoost's specialized
+                handling of categorical features (added in XGBoost 1.3.0).
+                Do not set to True unless you are interested in development.
+                The JSON/UBJSON serialization format is required when this is
+                enabled.
+
 
         Returns:
             (pd.DataFrame): A dataframe of survival probabilities
@@ -228,7 +244,7 @@ def predict(
         """
 
         # converting to xgb format
-        d_matrix = xgb.DMatrix(X)
+        d_matrix = xgb.DMatrix(X, enable_categorical=enable_categorical)
 
         # getting leaves and extracting neighbors
         leaves = self.bst.predict(
@@ -271,7 +287,8 @@ def predict(
 
         if return_ci and return_interval_probs:
             raise ValueError(
-                "Confidence intervals for interval probabilities is not supported. Choose between return_ci and return_interval_probs."
+                "Confidence intervals for interval probabilities is not supported. "
+                "Choose between return_ci and return_interval_probs."
             )
 
         if return_interval_probs:
@@ -315,8 +332,8 @@ class XGBSEKaplanTree(XGBSEBaseEstimator):
     """
 
     def __init__(
-        self,
-        xgb_params=None,
+            self,
+            xgb_params=None,
     ):
         """
         Args:
@@ -347,14 +364,15 @@ def __init__(
         self.feature_importances_ = None
 
     def fit(
-        self,
-        X,
-        y,
-        persist_train=True,
-        index_id=None,
-        time_bins=None,
-        ci_width=0.683,
-        **xgb_kwargs,
+            self,
+            X,
+            y,
+            persist_train=True,
+            index_id=None,
+            time_bins=None,
+            ci_width=0.683,
+            enable_categorical: bool = False,
+            **xgb_kwargs,
     ):
         """
         Fit a single decision tree using xgboost. For each leaf in the tree,
@@ -381,6 +399,13 @@ def fit(
 
             ci_width (Float): Width of confidence interval
 
+            enable_categorical (bool): Experimental; defaults to False. Forwarded to
+                `convert_data_to_xgb_format` for the training data, enabling
+                XGBoost's specialized handling of categorical features (added in
+                XGBoost 1.3.0). Do not set to True unless you are interested in
+                development. The JSON/UBJSON serialization format is required
+                when this is enabled.
+
         Returns:
             XGBSEKaplanTree: Trained instance of XGBSEKaplanTree
         """
@@ -391,7 +416,7 @@ def fit(
         self.time_bins = time_bins
 
         # converting data to xgb format
-        dtrain = convert_data_to_xgb_format(X, y, self.xgb_params["objective"])
+        dtrain = convert_data_to_xgb_format(X, y, self.xgb_params["objective"], enable_categorical=enable_categorical)
 
         # training XGB
         self.bst = xgb.train(self.xgb_params, dtrain, num_boost_round=1, **xgb_kwargs)
@@ -437,7 +462,7 @@ def fit(
 
         return self
 
-    def predict(self, X, return_ci=False, return_interval_probs=False):
+    def predict(self, X, return_ci=False, return_interval_probs=False, enable_categorical: bool = False):
         """
         Run samples through tree until terminal nodes. Predict the Kaplan-Meier
         estimator associated to the leaf node each sample ended into.
@@ -450,6 +475,13 @@ def predict(self, X, return_ci=False, return_interval_probs=False):
             return_interval_probs (Bool): Boolean indicating if interval probabilities are
                 supposed to be returned. If False the cumulative survival is returned.
 
+            enable_categorical (bool): Experimental; defaults to False. Forwarded to
+                `xgb.DMatrix` when converting X, enabling XGBoost's specialized
+                handling of categorical features (added in XGBoost 1.3.0).
+                Do not set to True unless you are interested in development.
+                The JSON/UBJSON serialization format is required when this is
+                enabled.
+
 
         Returns:
             preds_df (pd.DataFrame): A dataframe of survival probabilities
@@ -465,7 +497,7 @@ def predict(self, X, return_ci=False, return_interval_probs=False):
         """
 
         # converting to xgb format
-        d_matrix = xgb.DMatrix(X)
+        d_matrix = xgb.DMatrix(X, enable_categorical=enable_categorical)
 
         # getting leaves and extracting neighbors
         leaves = self.bst.predict(
@@ -479,7 +511,8 @@ def predict(self, X, return_ci=False, return_interval_probs=False):
 
         if return_ci and return_interval_probs:
             raise ValueError(
-                "Confidence intervals for interval probabilities is not supported. Choose between return_ci and return_interval_probs."
+                "Confidence intervals for interval probabilities is not supported. "
+                "Choose between return_ci and return_interval_probs."
             )
 
         if return_interval_probs:
Original file line number	Diff line number	Diff line change
Expand Up		@@ -139,3 +139,5 @@ cython_debug/

		.DS_Store
		.vscode/

		.idea