From 5d9cccfb156cb4a5aa78be0d4e5c0ca9d56e066b Mon Sep 17 00:00:00 2001 From: sanyhe Date: Thu, 21 Dec 2023 07:49:27 +0800 Subject: [PATCH] feat: distinguish which model can deal with missing values. --- geochemistrypi/data_mining/cli_pipeline.py | 64 ++++++++++++++----- geochemistrypi/data_mining/constants.py | 29 ++++++++- .../data_mining/model/classification.py | 10 +-- .../data_mining/model/regression.py | 8 +-- .../data_mining/plot/statistic_plot.py | 4 +- .../data_mining/process/classify.py | 4 +- geochemistrypi/data_mining/process/regress.py | 4 +- pyproject.toml | 6 +- 8 files changed, 93 insertions(+), 36 deletions(-) diff --git a/geochemistrypi/data_mining/cli_pipeline.py b/geochemistrypi/data_mining/cli_pipeline.py index 19d36dcb..1af3a3b0 100644 --- a/geochemistrypi/data_mining/cli_pipeline.py +++ b/geochemistrypi/data_mining/cli_pipeline.py @@ -10,7 +10,9 @@ from .constants import ( CLASSIFICATION_MODELS, + CLASSIFICATION_MODELS_WITH_MISSING_VALUES, CLUSTERING_MODELS, + CLUSTERING_MODELS_WITH_MISSING_VALUES, DECOMPOSITION_MODELS, FEATURE_SCALING_STRATEGY, FEATURE_SELECTION_STRATEGY, @@ -21,6 +23,7 @@ OPTION, OUTPUT_PATH, REGRESSION_MODELS, + REGRESSION_MODELS_WITH_MISSING_VALUES, SECTION, TEST_DATA_OPTION, WORKING_PATH, @@ -32,7 +35,7 @@ from .data.preprocessing import feature_scaler, feature_selector from .data.statistic import monte_carlo_simulator from .plot.map_plot import process_world_map -from .plot.statistic_plot import basic_statistic, correlation_plot, distribution_plot, is_imputed, is_null_value, log_distribution_plot, probability_plot, ratio_null_vs_filled +from .plot.statistic_plot import basic_statistic, check_missing_value, correlation_plot, distribution_plot, is_null_value, log_distribution_plot, probability_plot, ratio_null_vs_filled from .process.classify import ClassificationModelSelection from .process.cluster import ClusteringModelSelection from .process.decompose import DecompositionModelSelection @@ -157,7 +160,7 @@ def 
cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N raise e experiment = mlflow.get_experiment(experiment_id=new_experiment_id) # print("Artifact Location: {}".format(experiment.artifact_location)) - run_name = Prompt.ask("✨ Run Name", default="Xgboost Algorithm - Test 1") + run_name = Prompt.ask("✨ Run Name", default="XGBoost Algorithm - Test 1") # run_tag = Prompt.ask("✨ Run Tag Version", default="R - v1.0.0") # run_description = Prompt.ask("✨ Run Description", default="Use xgboost for GeoPi classification.") # mlflow.start_run(run_name=run_name, experiment_id=experiment.experiment_id, tags={"version": run_tag, "description": run_description}) @@ -219,7 +222,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N print("The Selected Data Set:") print(data_selected) clear_output() - print("Basic Statistical Information: ") + print("-*-*- Basic Statistical Information -*-*-") basic_info(data_selected) basic_statistic(data_selected) correlation_plot(data_selected.columns, data_selected) @@ -232,11 +235,26 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N # <--- Imputation ---> logger.debug("Imputation") - print("-*-*- Imputation -*-*-") + print("-*-*- Missing Value Check -*-*-") is_null_value(data_selected) ratio_null_vs_filled(data_selected) - imputed_flag = is_imputed(data_selected) + missing_value_flag = check_missing_value(data_selected) clear_output() + if missing_value_flag: + # Ask the user whether to use imputation techniques to deal with the missing values. + print("-*-*- Imputation Option -*-*-") + num2option(OPTION) + imputation_num = limit_num_input(OPTION, SECTION[1], num_input) + if imputation_num == 1: + imputed_flag = True + else: + imputed_flag = False + clear_output() + else: + # Allow the user not to use imputation techniques to deal with the missing values. 
+ # Subsequently, in the mode selection, only regression, classification and clustering models are available. + # In the corresponding model selection, only the models that support missing values are available. + imputed_flag = False if imputed_flag: print("-*-*- Strategy for Missing Values -*-*-") num2option(IMPUTING_STRATEGY) @@ -281,8 +299,16 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N # <--- Mode Selection ---> logger.debug("Mode Selection") print("-*-*- Mode Selection -*-*-") - num2option(MODE_OPTION) - mode_num = limit_num_input(MODE_OPTION, SECTION[2], num_input) + # If the selected data set has missing values and has not been imputed, then only allow the user to choose the regression, classification and clustering modes. + # Otherwise, all modes, including decomposition, are available. + if missing_value_flag and not imputed_flag: + # Delete the decomposition mode because it doesn't support missing values. + MODE_OPTION.remove("Dimensional Reduction") + num2option(MODE_OPTION) + mode_num = limit_num_input(MODE_OPTION, SECTION[2], num_input) + else: + num2option(MODE_OPTION) + mode_num = limit_num_input(MODE_OPTION, SECTION[2], num_input) clear_output() # <--- Data Segmentation ---> @@ -359,7 +385,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N # create training data and testing data print("-*-*- Data Split - Train Set and Test Set -*-*-") - print("Notice: Normally, set 20% of the dataset aside as test set, such as 0.2") + print("Notice: Normally, set 20% of the dataset aside as test set, such as 0.2.") test_ratio = float_input(default=0.2, prefix=SECTION[1], slogan="@Test Ratio: ") train_test_data = data_split(X, y, test_ratio) for key, value in train_test_data.items(): @@ -404,14 +430,20 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N # <--- Model Selection ---> logger.debug("Model Selection") - print("-*-*- Model Selection -*-*-:") - Modes2Models = 
{1: REGRESSION_MODELS, 2: CLASSIFICATION_MODELS, 3: CLUSTERING_MODELS, 4: DECOMPOSITION_MODELS} - Modes2Initiators = { - 1: RegressionModelSelection, - 2: ClassificationModelSelection, - 3: ClusteringModelSelection, - 4: DecompositionModelSelection, - } + print("-*-*- Model Selection -*-*-") + # If the selected data set has missing values and has not been imputed, then only offer the regression, classification and clustering models that support missing values. + # Otherwise, offer all models, including the decomposition models. + if missing_value_flag and not imputed_flag: + Modes2Models = {1: REGRESSION_MODELS_WITH_MISSING_VALUES, 2: CLASSIFICATION_MODELS_WITH_MISSING_VALUES, 3: CLUSTERING_MODELS_WITH_MISSING_VALUES} + Modes2Initiators = {1: RegressionModelSelection, 2: ClassificationModelSelection, 3: ClusteringModelSelection} + else: + Modes2Models = {1: REGRESSION_MODELS, 2: CLASSIFICATION_MODELS, 3: CLUSTERING_MODELS, 4: DECOMPOSITION_MODELS} + Modes2Initiators = { + 1: RegressionModelSelection, + 2: ClassificationModelSelection, + 3: ClusteringModelSelection, + 4: DecompositionModelSelection, + } MODELS = Modes2Models[mode_num] num2option(MODELS) # Add the option of all models diff --git a/geochemistrypi/data_mining/constants.py b/geochemistrypi/data_mining/constants.py index 433eb2e6..34a5bb09 100644 --- a/geochemistrypi/data_mining/constants.py +++ b/geochemistrypi/data_mining/constants.py @@ -39,11 +39,14 @@ "Random Forest", "Extra-Trees", "Gradient Boosting", - "Xgboost", + "XGBoost", "Multi-layer Perceptron", "Lasso Regression", "Elastic Net", "SGD Regression", + # "Bagging Regression", + # "Decision Tree", + # Histogram-based Gradient Boosting, ] CLASSIFICATION_MODELS = [ "Logistic Regression", @@ -51,15 +54,37 @@ "Decision Tree", "Random Forest", "Extra-Trees", - "Xgboost", + "XGBoost", "Multi-layer Perceptron", "Gradient Boosting", "K-Nearest Neighbors", "Stochastic Gradient Descent", + # "Bagging Classification", + # "Decision Tree", + # Histogram-based Gradient 
Boosting, ] CLUSTERING_MODELS = ["KMeans", "DBSCAN"] DECOMPOSITION_MODELS = ["PCA", "T-SNE", "MDS"] +# The model can deal with missing values +# Reference: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values +REGRESSION_MODELS_WITH_MISSING_VALUES = [ + "XGBoost", + # "Bagging Regression", + # "Decision Tree", + # Histogram-based Gradient Boosting, +] +CLASSIFICATION_MODELS_WITH_MISSING_VALUES = [ + "XGBoost", + # "Bagging Classification", + # "Decision Tree", + # Histogram-based Gradient Boosting, +] +CLUSTERING_MODELS_WITH_MISSING_VALUES = [ + # "HDBSCAN" +] + + # Special AutoML models NON_AUTOML_MODELS = ["Linear Regression", "Polynomial Regression"] RAY_FLAML = ["Multi-layer Perceptron"] diff --git a/geochemistrypi/data_mining/model/classification.py b/geochemistrypi/data_mining/model/classification.py index 7e203fa0..910f34d3 100644 --- a/geochemistrypi/data_mining/model/classification.py +++ b/geochemistrypi/data_mining/model/classification.py @@ -1211,9 +1211,9 @@ def special_components(self, is_automl: bool = False, **kwargs) -> None: class XGBoostClassification(TreeWorkflowMixin, ClassificationWorkflowBase): - """The automation workflow of using Xgboost algorithm to make insightful products.""" + """The automation workflow of using XGBoost algorithm to make insightful products.""" - name = "Xgboost" + name = "XGBoost" special_function = ["Feature Importance Diagram"] # https: // xgboost.readthedocs.io / en / stable / python / python_api.html # module-xgboost.sklearn @@ -1419,10 +1419,10 @@ def __init__( References ---------- - [1] Xgboost Python API Reference - Scikit-Learn API + [1] XGBoost Python API Reference - Scikit-Learn API https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn - [2] Xgboost API for the scikit-learn wrapper: + [2] XGBoost API for the scikit-learn wrapper: https://github.com/dmlc/xgboost/blob/master/python-package/xgboost/sklearn.py """ super().__init__() @@ -1526,7 
+1526,7 @@ def manual_hyper_parameters(cls) -> Dict: # def _plot_tree(trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str) -> None: # # TODO: (solve the problem of failed to execute WindowsPath('dot'), make sure the Graphviz executables are on your systems' PATH # # Drawing diagrams of the first decision tree of xgboost - # print("-----* Xgboost's Single Tree Diagram *-----") + # print("-----* XGBoost's Single Tree Diagram *-----") # xgboost.plot_tree(trained_model) # # node_params = { # # 'shape': 'box', diff --git a/geochemistrypi/data_mining/model/regression.py b/geochemistrypi/data_mining/model/regression.py index 63848ce2..834347b6 100644 --- a/geochemistrypi/data_mining/model/regression.py +++ b/geochemistrypi/data_mining/model/regression.py @@ -310,9 +310,9 @@ def special_components(self, **kwargs) -> None: class XGBoostRegression(TreeWorkflowMixin, RegressionWorkflowBase): - """The automation workflow of using Xgboost algorithm to make insightful products.""" + """The automation workflow of using XGBoost algorithm to make insightful products.""" - name = "Xgboost" + name = "XGBoost" special_function = ["Feature Importance Diagram"] # In fact, it's used for type hint in the original xgboost package. 
@@ -516,10 +516,10 @@ def __init__( References ---------- - [1] Xgboost Python API Reference - Scikit-Learn API + [1] XGBoost Python API Reference - Scikit-Learn API https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn - [2] Xgboost API for the scikit-learn wrapper: + [2] XGBoost API for the scikit-learn wrapper: https://github.com/dmlc/xgboost/blob/master/python-package/xgboost/sklearn.py """ diff --git a/geochemistrypi/data_mining/plot/statistic_plot.py b/geochemistrypi/data_mining/plot/statistic_plot.py index f839eba8..6612ab60 100644 --- a/geochemistrypi/data_mining/plot/statistic_plot.py +++ b/geochemistrypi/data_mining/plot/statistic_plot.py @@ -37,7 +37,7 @@ def is_null_value(data: pd.DataFrame) -> None: print("--" * 10) -def is_imputed(data: pd.DataFrame) -> bool: +def check_missing_value(data: pd.DataFrame) -> bool: """Check whether the data set has null value or not. Parameters @@ -54,7 +54,7 @@ def is_imputed(data: pd.DataFrame) -> bool: if flag: print("Note: you'd better use imputation techniques to deal with the missing values.") else: - print("Note: you don't need to deal with the missing values, we'll just pass this step!") + print("Note: The provided data set is complete without missing values, we'll just pass this step!") return flag diff --git a/geochemistrypi/data_mining/process/classify.py b/geochemistrypi/data_mining/process/classify.py index fed17f85..8e6b6dfe 100644 --- a/geochemistrypi/data_mining/process/classify.py +++ b/geochemistrypi/data_mining/process/classify.py @@ -80,7 +80,7 @@ def activate( oob_score=hyper_parameters["oob_score"], max_samples=hyper_parameters["max_samples"], ) - elif self.model_name == "Xgboost": + elif self.model_name == "XGBoost": hyper_parameters = XGBoostClassification.manual_hyper_parameters() self.clf_workflow = XGBoostClassification( n_estimators=hyper_parameters["n_estimators"], @@ -214,7 +214,7 @@ def activate( self.clf_workflow = DecisionTreeClassification() elif 
self.model_name == "Random Forest": self.clf_workflow = RandomForestClassification() - elif self.model_name == "Xgboost": + elif self.model_name == "XGBoost": self.clf_workflow = XGBoostClassification() elif self.model_name == "Logistic Regression": self.clf_workflow = LogisticRegressionClassification() diff --git a/geochemistrypi/data_mining/process/regress.py b/geochemistrypi/data_mining/process/regress.py index 10083972..7c63ea72 100644 --- a/geochemistrypi/data_mining/process/regress.py +++ b/geochemistrypi/data_mining/process/regress.py @@ -59,7 +59,7 @@ def activate( poly_config, X_train, X_test = self.reg_workflow.poly(X_train, X_test) self.transformer_config.update(poly_config) self.reg_workflow.data_upload(X_train=X_train, X_test=X_test) - elif self.model_name == "Xgboost": + elif self.model_name == "XGBoost": hyper_parameters = XGBoostRegression.manual_hyper_parameters() self.reg_workflow = XGBoostRegression( n_estimators=hyper_parameters["n_estimators"], @@ -228,7 +228,7 @@ def activate( poly_config, X_train, X_test = self.reg_workflow.poly(X_train, X_test) self.transformer_config.update(poly_config) self.reg_workflow.data_upload(X_train=X_train, X_test=X_test) - elif self.model_name == "Xgboost": + elif self.model_name == "XGBoost": self.reg_workflow = XGBoostRegression() elif self.model_name == "Decision Tree": self.reg_workflow = DecisionTreeRegression() diff --git a/pyproject.toml b/pyproject.toml index c573f98a..591be5fa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,9 +33,9 @@ dependencies = [ "openpyxl==3.0.10", "pandas==1.5.2", "joblib==1.2.0", - "flaml==1.0.14", # required to run Xgboost + FLMAL - "numpy==1.23.5", # required to run Xgboost + FLMAL - "xgboost==1.6.2", # required to run Xgboost + FLAML and be compatible with M2 chip on Mac + "flaml==1.0.14", # required to run XGBoost + FLAML + "numpy==1.23.5", # required to run XGBoost + FLAML + "xgboost==1.6.2", # required to run XGBoost + FLAML and be compatible with M2 chip on Mac 
"threadpoolctl==3.1.0", # required to draw 3d plot for KMeans "matplotlib==3.5.2", # required to draw 3d plot for KMeans "fastapi", # backend framework