
XGBSE error with sklearn pipeline and GridSearchCV #61

Open
jules-germany opened this issue Sep 23, 2022 · 2 comments

Labels
bug Something isn't working

@jules-germany
Code sample


import time

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sksurv.metrics import concordance_index_censored  # used by score_survival_model below
from xgbse import XGBSEKaplanNeighbors

Data file: [Radiomics_Clinical_GMMC_2022_09_22-CT_der_PET-mix.csv](https://github.com/loft-br/xgboost-survival-embeddings/files/9636594/Radiomics_Clinical_GMMC_2022_09_22-CT_der_PET-mix.csv)

csv_path = "C:/Radiomics_Clinical_GMMC_2022_09_22-CT_der_PET-mix.csv"

df = pd.read_csv(csv_path)
df = df.set_index('cohort', drop=True)
df.index.rename('index', inplace=True)
df.index = df.index.astype(int)

X = df.drop(columns=['PFS_binaer_Progress', 'Ereignis_korrigiert_Update_2021_03', 'DFS_M_ED_Update_2021_03',
                     'Pseudonym', 'train_test_mix', 'SUVmax', 'SEX_SPSS', 
                     'DIAGECOG_komplett_ueber_1', 'DIAALTER', 'PTNM_T_SPSS_korr_grob_7th',
                     'PTNM_N_SPSS_korr', 'STADIUM_GROB_SPSS_7thEdition',
                     'R_Status', 'PTNM_T_SPSS_korr_7th', 'STADIUM_SPSS_7thEdition',
                     'Histo_Subtyp', 'NEOADJ_CHEMO', 'NEOADJ_BESTR', 'ADJ_CHEMO', 'ADJ_BESTR',
                     'ANY_CHEMO', 'ANY_BESTR', 'ASP_high_19_5', 'ASP', 'ASP_high_33_3'])

y_df = pd.DataFrame()
y_df['event'] = df['Ereignis_korrigiert_Update_2021_03'].astype(bool)
y_df['time'] = df['DFS_M_ED_Update_2021_03']

# split dataframe into training+test and validation cohort

X_train_test = X.iloc[X.index.isin([1])]
X_valid = X.iloc[X.index.isin([2])]

y_train_test_df = y_df.iloc[y_df.index.isin([1])]
y_valid_df = y_df.iloc[y_df.index.isin([2])]

# build structured arrays with fields ('event', bool) and ('time', float),
# the format expected by scikit-survival style estimators
s = y_df.dtypes
y_train_test = np.array([tuple(x) for x in y_train_test_df.values], dtype=list(zip(s.index, s)))
y_valid = np.array([tuple(x) for x in y_valid_df.values], dtype=list(zip(s.index, s)))


def score_survival_model(model, X, y):
    prediction = model.predict(X)
    result = concordance_index_censored(y['event'], y['time'], prediction)
    return result[0]


feature_select_dict = {
                        "MIC" : SelectKBest(mutual_info_classif, k=30),                        
                        }


p_grid_dict = {"xgbse" : {"estimator__objective": ["survival:aft"],
                             "estimator__eval_metric": ["aft-nloglik"],
                             "estimator__aft_loss_distribution": ["normal", "logistic"],
                             "estimator__aft_loss_distribution_scale": [0.5, 1.0, 1.5],
                             "estimator__tree_method": ["hist"],
                             "estimator__learning_rate": np.logspace(-2, 0, num=6),
                             "estimator__max_depth": [0, 1, 5, 10, 15, 25, 50],
                             "estimator__booster": ["dart"],
                             "estimator__subsample": [0.1, 0.2, 0.4, 0.6, 0.8, 1.0],
                             "estimator__min_child_weight": [1, 2, 5, 10],
                             "estimator__colsample_bynode": [0.5, 1.0, 2.0]}
                        }


models_dict = {
                "xgbse" : XGBSEKaplanNeighbors(xgb_params=p_grid_dict['xgbse'])
                }


inner_cv = KFold(n_splits=10, shuffle=True, random_state=1)
outer_cv = KFold(n_splits=10, shuffle=True, random_state=1)



def model_scores(feature_select_dict, models_dict, p_grid_dict, X_train, y_train, X_valid, y_valid):
    
    # define the scaler and prepare empty dict or dataframe to assemble results and best parameters
    
    scaler = RobustScaler(with_centering = False)
    models_df_dict = dict()
    params_df = pd.DataFrame()    
    
    for outerKey in feature_select_dict:
         
        models = pd.DataFrame()  
        feature_select = feature_select_dict[outerKey]

        for innerKey in models_dict:
  
            # instantiate model
        
            model = models_dict[innerKey]
            p_grid = p_grid_dict[innerKey]
        
        
            # inner loop of nested cross-validation: perform hyperparameter tuning in the training set of the outer loop
        
            t1 = time.time()
            
            pipeline = Pipeline([('scaling', scaler), ('feature_selection', feature_select), ('estimator', model)])
            
            clf_model = GridSearchCV(estimator=pipeline,
                                     scoring=score_survival_model,
                                     param_grid=p_grid,
                                     cv=inner_cv, refit=True)


            # outer loop: train the model with training data and score the performance on test sets
            nested_test_score = cross_val_score(clf_model, scoring=score_survival_model,
                                                X=X_train, y=y_train, cv=outer_cv)
        
        
            # compute concordance index on the outer test folds and the validation set
        
            test_mean = nested_test_score.mean()
            test_std = nested_test_score.std()
            
            clf_model_fit = clf_model.fit(X_train, y_train)
            
            clf_model_best_parameters = str(clf_model.best_params_)
        
            valid_score = clf_model.score(X_valid, y_valid)
            
            test_plus_valid = test_mean + valid_score
            
            model_time = (time.time()-t1)
                
                
            # at the end of this nested CV: add results for this model to the models dataframe 
        
            models[innerKey] = [test_mean, test_std, model_time, valid_score, test_plus_valid]
            
            df_list = [outerKey, innerKey, clf_model_best_parameters]
            params = pd.DataFrame(df_list)
            params_df = pd.concat([params_df, params], axis = 1)

        # add model results for this feature_select_dict key to the dict of model results
        # and append the best parameters to the dataframe
       

        models_df_dict[outerKey] = models 
        
        
    # subsequent to all model calculations: add multiindex, flip (transpose) dataframe and sort by highest "test+valid"
    # finalize the "best parameters" dataframe
    
    multiindex = {(outerKey, innerKey): values for outerKey, innerDict in models_df_dict.items() for innerKey, values in innerDict.items()}

    models_df_dict_multiindex = pd.DataFrame(multiindex)
    models_df_dict_multiindex.index = ['nested_test_mean', 'nested_test_SD', 'time', 'valid', 'test_plus_valid']
    
    models_transpose = models_df_dict_multiindex.transpose()
    models_transpose.index.set_names(['pre', 'model'], inplace=True)
    models_transpose = models_transpose.sort_values(by = ['nested_test_mean'], ascending = False)

    params_df = params_df.T
    params_df.columns = ['feature_select', 'model', 'parameters']
    params_df = params_df.sort_values(by = ['model', 'feature_select'], ascending = [True, True])
    
    return models_transpose, params_df


results, params = model_scores(feature_select_dict, models_dict, p_grid_dict, X_train_test, y_train_test, X_valid, y_valid)
results_ready = results.reset_index(level=['pre', 'model'])
print(results_ready)

Problem description

GridSearchCV cannot be used with XGBSE because XGBSE expects its hyperparameters as a single xgb_params dict passed at model initialization, with one scalar value per parameter. GridSearchCV, by contrast, expects each hyperparameter to be an individually settable estimator parameter whose candidate values are given as lists. XGBSE therefore seems to be incompatible with GridSearchCV.
Furthermore, XGBSE seems to be incompatible with sklearn's Pipeline. When the pipeline is fitted, the final estimator step receives X as a np.ndarray rather than a DataFrame, so it has no index. This raises a corresponding error because XGBSE's fit seems to require X.index.
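A minimal sketch of the first mismatch (the parameter names and values here are illustrative, not taken from the code sample above):

```python
from sklearn.model_selection import GridSearchCV
from xgbse import XGBSEKaplanNeighbors

# XGBSE takes all booster hyperparameters as one scalar-valued dict
# at initialization ...
model = XGBSEKaplanNeighbors(xgb_params={"learning_rate": 0.1})

# ... so GridSearchCV has no individual "learning_rate" parameter to set:
# only "xgb_params" itself is exposed via get_params()/set_params(),
# and fitting this search raises "Invalid parameter learning_rate ...".
grid = GridSearchCV(model, param_grid={"learning_rate": [0.01, 0.1]}, cv=3)
```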

Expected behavior

XGBSE should be usable with GridSearchCV and Pipeline.

Possible solutions

XGBSE would need to expose its hyperparameters individually (so GridSearchCV can set them), and fitting would need to accept X without an index (i.e. as a np.ndarray).
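For the Pipeline issue alone, one untested stopgap (the FunctionTransformer step is my assumption, not something XGBSE documents) is to re-wrap the ndarray into a DataFrame just before the estimator step, reusing the names from the code sample above:

```python
import pandas as pd
from sklearn.preprocessing import FunctionTransformer

# re-wrap the ndarray produced by the scaler/selector into a DataFrame,
# restoring a (default) index for XGBSE's fit()
to_frame = FunctionTransformer(lambda X: pd.DataFrame(X))

pipeline = Pipeline([('scaling', scaler),
                     ('feature_selection', feature_select),
                     ('to_frame', to_frame),
                     ('estimator', model)])
```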

@jules-germany jules-germany added the bug Something isn't working label Sep 23, 2022
@thecml

thecml commented Oct 3, 2022

I stumbled into the same issue. Is there any workaround for this, other than tuning hyperparameters with a vanilla XGBoost model and simply transferring those to the XGBSE one?

@brunocarlin

brunocarlin commented Oct 24, 2022

We have developed a workaround:

from xgbse import XGBSEStackedWeibull

class sklearn_wei(XGBSEStackedWeibull):
    """Wrapper that routes set_params into xgb_params so sklearn-style
    search classes (GridSearchCV, BayesSearchCV, ...) can tune XGBSE."""

    def get_params2(self):
        return self.get_params()['xgb_params']

    def set_params(self, **params):
        old_params = self.get_params2()
        old_params.update(params)
        self.xgb_params = old_params
        return self

ok = sklearn_wei()
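With set_params routed into xgb_params, the individual keys become tunable, so in principle a plain GridSearchCV should work as well. A minimal sketch (grid values are illustrative, not recommendations):

```python
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from xgbse.metrics import concordance_index

# the wrapper exposes booster hyperparameters to the usual sklearn
# set_params mechanism, so a standard param_grid can address them
grid = GridSearchCV(
    estimator=sklearn_wei(),
    param_grid={
        'learning_rate': [0.01, 0.1],
        'max_depth': [2, 4, 8],
    },
    scoring=make_scorer(concordance_index),
    cv=4,
)
# grid.fit(X_train, y_train), then grid.best_params_ as usual
```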

After you define the sklearn-compatible model, things work as normal:

from skopt import BayesSearchCV, space

import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import make_scorer

from xgbse.metrics import (
    concordance_index,
    approx_brier_score,
    dist_calibration_score
)

opt = BayesSearchCV(
    estimator=ok,
    scoring=make_scorer(concordance_index),
    n_iter=2, random_state=42,
    cv=StratifiedKFold(n_splits=4, shuffle=True),
    n_jobs=-1,
    n_points=1,
    verbose=1,
    search_spaces={
        'max_depth': space.Integer(low=1, high=32, prior='uniform'),  # I don't know why but more than 32 breaks
        'learning_rate': space.Real(10**-3, 10**-1, "log-uniform"),
        # 'tree_method': space.Categorical(categories=['hist', 'exact']),
        'aft_loss_distribution': space.Categorical(categories=['normal', 'extreme', 'logistic']),
        # 'aft_loss_distribution_scale': space.Real(low=10**-2, high=3*10**1, prior="log-uniform"),
        # 'n_estimators': space.Real(low=10**1, high=10**3, prior="log-uniform"),
        'subsample': space.Real(low=.8, high=1, prior="uniform"),
        'colsample_bytree': space.Real(low=.8, high=1, prior="uniform"),
        'reg_alpha': space.Real(low=0, high=1, prior="uniform"),
        'reg_lambda': space.Real(low=0, high=1, prior="uniform"),
        'max_bin': space.Integer(low=200, high=1000, prior='log-uniform'),
        'min_split_loss': space.Real(low=0, high=10, prior='uniform')
    },
    fit_params={}
)

opt.fit(X=X_train,
        y=y_train,
        time_bins=range(1, 59),
        validation_data=(X_valid, y_valid),
        early_stopping_rounds=5)

df_results = \
    pd.DataFrame(opt.cv_results_).sort_values(by='rank_test_score', ascending=True)

df_results['params'] = df_results['params'].astype('string')
df_results.reset_index(drop=True, inplace=True)
df_results
