
XGBSE error with sklearn pipeline and GridSearchCV #61

Open
jules-germany opened this issue Sep 23, 2022 · 2 comments

Labels
bug Something isn't working

@jules-germany
Code sample


import time

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sksurv.metrics import concordance_index_censored  # used by score_survival_model below
from xgbse import XGBSEKaplanNeighbors

Data file: [Radiomics_Clinical_GMMC_2022_09_22-CT_der_PET-mix.csv](https://github.com/loft-br/xgboost-survival-embeddings/files/9636594/Radiomics_Clinical_GMMC_2022_09_22-CT_der_PET-mix.csv)

csv_path = "C:/Radiomics_Clinical_GMMC_2022_09_22-CT_der_PET-mix.csv"

df = pd.read_csv(csv_path)
df = df.set_index('cohort', drop=True)
df.index.rename('index', inplace=True)
df.index = df.index.astype(int)

X = df.drop(columns=['PFS_binaer_Progress', 'Ereignis_korrigiert_Update_2021_03', 'DFS_M_ED_Update_2021_03',
                     'Pseudonym', 'train_test_mix', 'SUVmax', 'SEX_SPSS', 
                     'DIAGECOG_komplett_ueber_1', 'DIAALTER', 'PTNM_T_SPSS_korr_grob_7th',
                     'PTNM_N_SPSS_korr', 'STADIUM_GROB_SPSS_7thEdition',
                     'R_Status', 'PTNM_T_SPSS_korr_7th', 'STADIUM_SPSS_7thEdition',
                     'Histo_Subtyp', 'NEOADJ_CHEMO', 'NEOADJ_BESTR', 'ADJ_CHEMO', 'ADJ_BESTR',
                     'ANY_CHEMO', 'ANY_BESTR', 'ASP_high_19_5', 'ASP', 'ASP_high_33_3'])

y_df = pd.DataFrame()
y_df['event'] = df['Ereignis_korrigiert_Update_2021_03'].astype(bool)
y_df['time'] = df['DFS_M_ED_Update_2021_03']

# split dataframe into training+test and validation cohort

X_train_test = X.iloc[X.index.isin([1])]
X_valid = X.iloc[X.index.isin([2])]

y_train_test_df = y_df.iloc[y_df.index.isin([1])]
y_valid_df = y_df.iloc[y_df.index.isin([2])]

# build structured arrays with fields ('event', bool) and ('time', float),
# the format expected by scikit-survival style estimators
s = y_df.dtypes
y_train_test = np.array([tuple(x) for x in y_train_test_df.values], dtype=list(zip(s.index, s)))
y_valid = np.array([tuple(x) for x in y_valid_df.values], dtype=list(zip(s.index, s)))


def score_survival_model(model, X, y):
    prediction = model.predict(X)
    result = concordance_index_censored(y['event'], y['time'], prediction)
    return result[0]


feature_select_dict = {
                        "MIC" : SelectKBest(mutual_info_classif, k=30),                        
                        }


p_grid_dict = {"xgbse" : {"estimator__objective": ["survival:aft"],
                             "estimator__eval_metric": ["aft-nloglik"],
                             "estimator__aft_loss_distribution": ["normal", "logistic"],
                             "estimator__aft_loss_distribution_scale": [0.5, 1.0, 1.5],
                             "estimator__tree_method": ["hist"],
                             "estimator__learning_rate": np.logspace(-2, 0, num=6),
                             "estimator__max_depth": [0, 1, 5, 10, 15, 25, 50],
                             "estimator__booster": ["dart"],
                             "estimator__subsample": [0.1, 0.2, 0.4, 0.6, 0.8, 1.0],
                             "estimator__min_child_weight": [1, 2, 5, 10],
                             "estimator__colsample_bynode": [0.5, 1.0, 2.0]}
                        }


models_dict = {
                "xgbse" : XGBSEKaplanNeighbors(xgb_params=p_grid_dict['xgbse'])
                }


inner_cv = KFold(n_splits=10, shuffle=True, random_state=1)
outer_cv = KFold(n_splits=10, shuffle=True, random_state=1)



def model_scores(feature_select_dict, models_dict, p_grid_dict, X_train, y_train, X_valid, y_valid):
    
    # define the scaler and prepare empty dict or dataframe to assemble results and best parameters
    
    scaler = RobustScaler(with_centering = False)
    models_df_dict = dict()
    params_df = pd.DataFrame()    
    
    for outerKey in feature_select_dict:
         
        models = pd.DataFrame()  
        feature_select = feature_select_dict[outerKey]

        for innerKey in models_dict:
  
            # instantiate model
        
            model = models_dict[innerKey]
            p_grid = p_grid_dict[innerKey]
        
        
            # inner loop of nested cross-validation: perform hyperparameter tuning in the training set of the outer loop
        
            t1 = time.time()
            
            pipeline = Pipeline([('scaling', scaler), ('feature_selection', feature_select), ('estimator', model)])
            
            clf_model = GridSearchCV(estimator=pipeline,
                                     scoring=score_survival_model,
                                     param_grid=p_grid,
                                     cv=inner_cv, refit=True)


            # outer loop: train the model with training data and score the performance on test sets
            nested_test_score = cross_val_score(clf_model, scoring=score_survival_model,
                                                X=X_train, y=y_train, cv=outer_cv)
        
        
            # compute concordance index on the outer test folds and the validation set
        
            test_mean = nested_test_score.mean()
            test_std = nested_test_score.std()
            
            clf_model_fit = clf_model.fit(X_train, y_train)
            
            clf_model_best_parameters = str(clf_model.best_params_)
        
            valid_score = clf_model.score(X_valid, y_valid)
            
            test_plus_valid = test_mean + valid_score
            
            model_time = (time.time()-t1)
                
                
            # at the end of this nested CV: add results for this model to the models dataframe 
        
            models[innerKey] = [test_mean, test_std, model_time, valid_score, test_plus_valid]
            
            df_list = [outerKey, innerKey, clf_model_best_parameters]
            params = pd.DataFrame(df_list)
            params_df = pd.concat([params_df, params], axis = 1)

        # add model results for this feature_select_dict key to the dict of model results
        # and append the best parameters to the dataframe
       

        models_df_dict[outerKey] = models 
        
        
    # subsequent to all model calculations: add multiindex, flip (transpose) dataframe and sort by highest "test+valid"
    # finalize the "best parameters" dataframe
    
    multiindex = {(outerKey, innerKey): values for outerKey, innerDict in models_df_dict.items() for innerKey, values in innerDict.items()}

    models_df_dict_multiindex = pd.DataFrame(multiindex)
    models_df_dict_multiindex.index = ['nested_test_mean', 'nested_test_SD', 'time', 'valid', 'test_plus_valid']
    
    models_transpose = models_df_dict_multiindex.transpose()
    models_transpose.index.set_names(['pre', 'model'], inplace=True)
    models_transpose = models_transpose.sort_values(by = ['nested_test_mean'], ascending = False)

    params_df = params_df.T
    params_df.columns = ['feature_select', 'model', 'parameters']
    params_df = params_df.sort_values(by = ['model', 'feature_select'], ascending = [True, True])
    
    return models_transpose, params_df


results, params = model_scores(feature_select_dict, models_dict, p_grid_dict, X_train_test, y_train_test, X_valid, y_valid)
results_ready = results.reset_index(level=['pre', 'model'])
print(results_ready)

Problem description

GridSearchCV cannot be used with XGBSE because XGBSE expects its hyperparameters as a single xgb_params dict passed at model initialization, with one scalar value per parameter. GridSearchCV, by contrast, expects each hyperparameter to be an individually settable estimator parameter whose candidate values are given as lists. XGBSE therefore seems to be incompatible with GridSearchCV.
Furthermore, XGBSE seems to be incompatible with sklearn's Pipeline. When the pipeline is fitted, the final estimator step receives X as a np.ndarray rather than a DataFrame, so it has no index. This raises a corresponding error because XGBSE's fit seems to require X.index.
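A minimal sketch of the first mismatch (the parameter names and values here are illustrative, not taken from the code sample above):

```python
from sklearn.model_selection import GridSearchCV
from xgbse import XGBSEKaplanNeighbors

# XGBSE takes all booster hyperparameters as one scalar-valued dict
# at initialization ...
model = XGBSEKaplanNeighbors(xgb_params={"learning_rate": 0.1})

# ... so GridSearchCV has no individual "learning_rate" parameter to set:
# only "xgb_params" itself is exposed via get_params()/set_params(),
# and fitting this search raises "Invalid parameter learning_rate ...".
grid = GridSearchCV(model, param_grid={"learning_rate": [0.01, 0.1]}, cv=3)
```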

Expected behavior

XGBSE should be usable with GridSearchCV and Pipeline.

Possible solutions

XGBSE would need to expose its hyperparameters individually (so GridSearchCV can set them), and fitting would need to accept X without an index (i.e. as a np.ndarray).
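For the Pipeline issue alone, one untested stopgap (the FunctionTransformer step is my assumption, not something XGBSE documents) is to re-wrap the ndarray into a DataFrame just before the estimator step, reusing the names from the code sample above:

```python
import pandas as pd
from sklearn.preprocessing import FunctionTransformer

# re-wrap the ndarray produced by the scaler/selector into a DataFrame,
# restoring a (default) index for XGBSE's fit()
to_frame = FunctionTransformer(lambda X: pd.DataFrame(X))

pipeline = Pipeline([('scaling', scaler),
                     ('feature_selection', feature_select),
                     ('to_frame', to_frame),
                     ('estimator', model)])
```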

@jules-germany jules-germany added the bug Something isn't working label Sep 23, 2022
@thecml

thecml commented Oct 3, 2022

I stumbled into the same issue. Is there any workaround for this, other than tuning hyperparameters with a vanilla XGBoost model and simply transferring those to the XGBSE one?

@brunocarlin

brunocarlin commented Oct 24, 2022

We have developed a workaround:

from xgbse import XGBSEStackedWeibull

class sklearn_wei(XGBSEStackedWeibull):
    """Wrapper that routes set_params into xgb_params so sklearn-style
    search classes (GridSearchCV, BayesSearchCV, ...) can tune XGBSE."""

    def get_params2(self):
        return self.get_params()['xgb_params']

    def set_params(self, **params):
        old_params = self.get_params2()
        old_params.update(params)
        self.xgb_params = old_params
        return self

ok = sklearn_wei()
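With set_params routed into xgb_params, the individual keys become tunable, so in principle a plain GridSearchCV should work as well. A minimal sketch (grid values are illustrative, not recommendations):

```python
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from xgbse.metrics import concordance_index

# the wrapper exposes booster hyperparameters to the usual sklearn
# set_params mechanism, so a standard param_grid can address them
grid = GridSearchCV(
    estimator=sklearn_wei(),
    param_grid={
        'learning_rate': [0.01, 0.1],
        'max_depth': [2, 4, 8],
    },
    scoring=make_scorer(concordance_index),
    cv=4,
)
# grid.fit(X_train, y_train), then grid.best_params_ as usual
```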

After you define the sklearn-compatible model, things work as normal:

from skopt import BayesSearchCV, space

import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import make_scorer

from xgbse.metrics import (
    concordance_index,
    approx_brier_score,
    dist_calibration_score
)

opt = BayesSearchCV(
    estimator=ok,
    scoring=make_scorer(concordance_index),
    n_iter=2, random_state=42,
    cv=StratifiedKFold(n_splits=4, shuffle=True),
    n_jobs=-1,
    n_points=1,
    verbose=1,
    search_spaces={
        'max_depth': space.Integer(low=1, high=32, prior='uniform'),  # I don't know why but more than 32 breaks
        'learning_rate': space.Real(10**-3, 10**-1, "log-uniform"),
        # 'tree_method': space.Categorical(categories=['hist', 'exact']),
        'aft_loss_distribution': space.Categorical(categories=['normal', 'extreme', 'logistic']),
        # 'aft_loss_distribution_scale': space.Real(low=10**-2, high=3*10**1, prior="log-uniform"),
        # 'n_estimators': space.Real(low=10**1, high=10**3, prior="log-uniform"),
        'subsample': space.Real(low=.8, high=1, prior="uniform"),
        'colsample_bytree': space.Real(low=.8, high=1, prior="uniform"),
        'reg_alpha': space.Real(low=0, high=1, prior="uniform"),
        'reg_lambda': space.Real(low=0, high=1, prior="uniform"),
        'max_bin': space.Integer(low=200, high=1000, prior='log-uniform'),
        'min_split_loss': space.Real(low=0, high=10, prior='uniform')
    },
    fit_params={}
)

opt.fit(X=X_train,
        y=y_train,
        time_bins=range(1, 59),
        validation_data=(X_valid, y_valid),
        early_stopping_rounds=5)

df_results = \
    pd.DataFrame(opt.cv_results_).sort_values(by='rank_test_score', ascending=True)

df_results['params'] = df_results['params'].astype('string')
df_results.reset_index(drop=True, inplace=True)
df_results
