Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: distinguish which model can deal with missing values. #289

Merged
merged 1 commit into from
Dec 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 48 additions & 16 deletions geochemistrypi/data_mining/cli_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@

from .constants import (
CLASSIFICATION_MODELS,
CLASSIFICATION_MODELS_WITH_MISSING_VALUES,
CLUSTERING_MODELS,
CLUSTERING_MODELS_WITH_MISSING_VALUES,
DECOMPOSITION_MODELS,
FEATURE_SCALING_STRATEGY,
FEATURE_SELECTION_STRATEGY,
Expand All @@ -21,6 +23,7 @@
OPTION,
OUTPUT_PATH,
REGRESSION_MODELS,
REGRESSION_MODELS_WITH_MISSING_VALUES,
SECTION,
TEST_DATA_OPTION,
WORKING_PATH,
Expand All @@ -32,7 +35,7 @@
from .data.preprocessing import feature_scaler, feature_selector
from .data.statistic import monte_carlo_simulator
from .plot.map_plot import process_world_map
from .plot.statistic_plot import basic_statistic, correlation_plot, distribution_plot, is_imputed, is_null_value, log_distribution_plot, probability_plot, ratio_null_vs_filled
from .plot.statistic_plot import basic_statistic, check_missing_value, correlation_plot, distribution_plot, is_null_value, log_distribution_plot, probability_plot, ratio_null_vs_filled
from .process.classify import ClassificationModelSelection
from .process.cluster import ClusteringModelSelection
from .process.decompose import DecompositionModelSelection
Expand Down Expand Up @@ -157,7 +160,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
raise e
experiment = mlflow.get_experiment(experiment_id=new_experiment_id)
# print("Artifact Location: {}".format(experiment.artifact_location))
run_name = Prompt.ask("✨ Run Name", default="Xgboost Algorithm - Test 1")
run_name = Prompt.ask("✨ Run Name", default="XGBoost Algorithm - Test 1")
# run_tag = Prompt.ask("✨ Run Tag Version", default="R - v1.0.0")
# run_description = Prompt.ask("✨ Run Description", default="Use xgboost for GeoPi classification.")
# mlflow.start_run(run_name=run_name, experiment_id=experiment.experiment_id, tags={"version": run_tag, "description": run_description})
Expand Down Expand Up @@ -219,7 +222,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
print("The Selected Data Set:")
print(data_selected)
clear_output()
print("Basic Statistical Information: ")
print("-*-*- Basic Statistical Information -*-*-")
basic_info(data_selected)
basic_statistic(data_selected)
correlation_plot(data_selected.columns, data_selected)
Expand All @@ -232,11 +235,26 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N

# <--- Imputation --->
logger.debug("Imputation")
print("-*-*- Imputation -*-*-")
print("-*-*- Missing Value Check -*-*-")
is_null_value(data_selected)
ratio_null_vs_filled(data_selected)
imputed_flag = is_imputed(data_selected)
missing_value_flag = check_missing_value(data_selected)
clear_output()
if missing_value_flag:
# Ask the user whether to use imputation techniques to deal with the missing values.
print("-*-*- Imputation Option -*-*-")
num2option(OPTION)
imputation_num = limit_num_input(OPTION, SECTION[1], num_input)
if imputation_num == 1:
imputed_flag = True
else:
imputed_flag = False
clear_output()
else:
# No missing values were found in the data set, so no imputation is needed and all modes and models remain available.
# Note: when the data set does have missing values and the user chooses not to impute them, only the regression, classification and clustering modes are available in the mode selection.
# In that case, only the models that support missing values are available in the corresponding model selection.
imputed_flag = False
if imputed_flag:
print("-*-*- Strategy for Missing Values -*-*-")
num2option(IMPUTING_STRATEGY)
Expand Down Expand Up @@ -281,8 +299,16 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
# <--- Mode Selection --->
logger.debug("Mode Selection")
print("-*-*- Mode Selection -*-*-")
num2option(MODE_OPTION)
mode_num = limit_num_input(MODE_OPTION, SECTION[2], num_input)
# If the selected data set has missing values and has not been imputed, only allow the user to choose the regression, classification and clustering modes.
# Otherwise, also allow the user to choose the decomposition mode.
if missing_value_flag and not imputed_flag:
# Delete the decomposition mode because it doesn't support missing values.
MODE_OPTION.remove("Dimensional Reduction")
num2option(MODE_OPTION)
mode_num = limit_num_input(MODE_OPTION, SECTION[2], num_input)
else:
num2option(MODE_OPTION)
mode_num = limit_num_input(MODE_OPTION, SECTION[2], num_input)
clear_output()

# <--- Data Segmentation --->
Expand Down Expand Up @@ -359,7 +385,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N

# create training data and testing data
print("-*-*- Data Split - Train Set and Test Set -*-*-")
print("Notice: Normally, set 20% of the dataset aside as test set, such as 0.2")
print("Notice: Normally, set 20% of the dataset aside as test set, such as 0.2.")
test_ratio = float_input(default=0.2, prefix=SECTION[1], slogan="@Test Ratio: ")
train_test_data = data_split(X, y, test_ratio)
for key, value in train_test_data.items():
Expand Down Expand Up @@ -404,14 +430,20 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N

# <--- Model Selection --->
logger.debug("Model Selection")
print("-*-*- Model Selection -*-*-:")
Modes2Models = {1: REGRESSION_MODELS, 2: CLASSIFICATION_MODELS, 3: CLUSTERING_MODELS, 4: DECOMPOSITION_MODELS}
Modes2Initiators = {
1: RegressionModelSelection,
2: ClassificationModelSelection,
3: ClusteringModelSelection,
4: DecompositionModelSelection,
}
print("-*-*- Model Selection -*-*-")
# If the selected data set has missing values and has not been imputed, only allow the user to choose the regression, classification and clustering models that support missing values.
# Otherwise, allow the user to choose from all models, including the decomposition models.
if missing_value_flag and not imputed_flag:
Modes2Models = {1: REGRESSION_MODELS_WITH_MISSING_VALUES, 2: CLASSIFICATION_MODELS_WITH_MISSING_VALUES, 3: CLUSTERING_MODELS_WITH_MISSING_VALUES}
Modes2Initiators = {1: RegressionModelSelection, 2: ClassificationModelSelection, 3: ClusteringModelSelection}
else:
Modes2Models = {1: REGRESSION_MODELS, 2: CLASSIFICATION_MODELS, 3: CLUSTERING_MODELS, 4: DECOMPOSITION_MODELS}
Modes2Initiators = {
1: RegressionModelSelection,
2: ClassificationModelSelection,
3: ClusteringModelSelection,
4: DecompositionModelSelection,
}
MODELS = Modes2Models[mode_num]
num2option(MODELS)
# Add the option of all models
Expand Down
29 changes: 27 additions & 2 deletions geochemistrypi/data_mining/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,27 +39,52 @@
"Random Forest",
"Extra-Trees",
"Gradient Boosting",
"Xgboost",
"XGBoost",
"Multi-layer Perceptron",
"Lasso Regression",
"Elastic Net",
"SGD Regression",
# "Bagging Regression",
# "Decision Tree",
# Histogram-based Gradient Boosting,
]
CLASSIFICATION_MODELS = [
"Logistic Regression",
"Support Vector Machine",
"Decision Tree",
"Random Forest",
"Extra-Trees",
"Xgboost",
"XGBoost",
"Multi-layer Perceptron",
"Gradient Boosting",
"K-Nearest Neighbors",
"Stochastic Gradient Descent",
# "Bagging Classification",
# "Decision Tree",
# Histogram-based Gradient Boosting,
]
CLUSTERING_MODELS = ["KMeans", "DBSCAN"]
DECOMPOSITION_MODELS = ["PCA", "T-SNE", "MDS"]

# The models that can deal with missing values
# Reference: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
REGRESSION_MODELS_WITH_MISSING_VALUES = [
"XGBoost",
# "Bagging Regression",
# "Decision Tree",
# Histogram-based Gradient Boosting,
]
CLASSIFICATION_MODELS_WITH_MISSING_VALUES = [
"XGBoost",
# "Bagging Classification",
# "Decision Tree",
# Histogram-based Gradient Boosting,
]
CLUSTERING_MODELS_WITH_MISSING_VALUES = [
# "HDBSCAN"
]


# Special AutoML models
NON_AUTOML_MODELS = ["Linear Regression", "Polynomial Regression"]
RAY_FLAML = ["Multi-layer Perceptron"]
Expand Down
10 changes: 5 additions & 5 deletions geochemistrypi/data_mining/model/classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -1211,9 +1211,9 @@ def special_components(self, is_automl: bool = False, **kwargs) -> None:


class XGBoostClassification(TreeWorkflowMixin, ClassificationWorkflowBase):
"""The automation workflow of using Xgboost algorithm to make insightful products."""
"""The automation workflow of using XGBoost algorithm to make insightful products."""

name = "Xgboost"
name = "XGBoost"
special_function = ["Feature Importance Diagram"]

# https://xgboost.readthedocs.io/en/stable/python/python_api.html#module-xgboost.sklearn
Expand Down Expand Up @@ -1419,10 +1419,10 @@ def __init__(

References
----------
[1] Xgboost Python API Reference - Scikit-Learn API
[1] XGBoost Python API Reference - Scikit-Learn API
https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn

[2] Xgboost API for the scikit-learn wrapper:
[2] XGBoost API for the scikit-learn wrapper:
https://github.com/dmlc/xgboost/blob/master/python-package/xgboost/sklearn.py
"""
super().__init__()
Expand Down Expand Up @@ -1526,7 +1526,7 @@ def manual_hyper_parameters(cls) -> Dict:
# def _plot_tree(trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
# # TODO: solve the problem of "failed to execute WindowsPath('dot'), make sure the Graphviz executables are on your systems' PATH"
# # Drawing diagrams of the first decision tree of xgboost
# print("-----* Xgboost's Single Tree Diagram *-----")
# print("-----* XGBoost's Single Tree Diagram *-----")
# xgboost.plot_tree(trained_model)
# # node_params = {
# # 'shape': 'box',
Expand Down
8 changes: 4 additions & 4 deletions geochemistrypi/data_mining/model/regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,9 +310,9 @@ def special_components(self, **kwargs) -> None:


class XGBoostRegression(TreeWorkflowMixin, RegressionWorkflowBase):
"""The automation workflow of using Xgboost algorithm to make insightful products."""
"""The automation workflow of using XGBoost algorithm to make insightful products."""

name = "Xgboost"
name = "XGBoost"
special_function = ["Feature Importance Diagram"]

# In fact, it's used for type hint in the original xgboost package.
Expand Down Expand Up @@ -516,10 +516,10 @@ def __init__(

References
----------
[1] Xgboost Python API Reference - Scikit-Learn API
[1] XGBoost Python API Reference - Scikit-Learn API
https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn

[2] Xgboost API for the scikit-learn wrapper:
[2] XGBoost API for the scikit-learn wrapper:
https://github.com/dmlc/xgboost/blob/master/python-package/xgboost/sklearn.py
"""

Expand Down
4 changes: 2 additions & 2 deletions geochemistrypi/data_mining/plot/statistic_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def is_null_value(data: pd.DataFrame) -> None:
print("--" * 10)


def is_imputed(data: pd.DataFrame) -> bool:
def check_missing_value(data: pd.DataFrame) -> bool:
"""Check whether the data set has null value or not.

Parameters
Expand All @@ -54,7 +54,7 @@ def is_imputed(data: pd.DataFrame) -> bool:
if flag:
print("Note: you'd better use imputation techniques to deal with the missing values.")
else:
print("Note: you don't need to deal with the missing values, we'll just pass this step!")
print("Note: The provided data set is complete without missing values, we'll just pass this step!")
return flag


Expand Down
4 changes: 2 additions & 2 deletions geochemistrypi/data_mining/process/classify.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def activate(
oob_score=hyper_parameters["oob_score"],
max_samples=hyper_parameters["max_samples"],
)
elif self.model_name == "Xgboost":
elif self.model_name == "XGBoost":
hyper_parameters = XGBoostClassification.manual_hyper_parameters()
self.clf_workflow = XGBoostClassification(
n_estimators=hyper_parameters["n_estimators"],
Expand Down Expand Up @@ -214,7 +214,7 @@ def activate(
self.clf_workflow = DecisionTreeClassification()
elif self.model_name == "Random Forest":
self.clf_workflow = RandomForestClassification()
elif self.model_name == "Xgboost":
elif self.model_name == "XGBoost":
self.clf_workflow = XGBoostClassification()
elif self.model_name == "Logistic Regression":
self.clf_workflow = LogisticRegressionClassification()
Expand Down
4 changes: 2 additions & 2 deletions geochemistrypi/data_mining/process/regress.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def activate(
poly_config, X_train, X_test = self.reg_workflow.poly(X_train, X_test)
self.transformer_config.update(poly_config)
self.reg_workflow.data_upload(X_train=X_train, X_test=X_test)
elif self.model_name == "Xgboost":
elif self.model_name == "XGBoost":
hyper_parameters = XGBoostRegression.manual_hyper_parameters()
self.reg_workflow = XGBoostRegression(
n_estimators=hyper_parameters["n_estimators"],
Expand Down Expand Up @@ -228,7 +228,7 @@ def activate(
poly_config, X_train, X_test = self.reg_workflow.poly(X_train, X_test)
self.transformer_config.update(poly_config)
self.reg_workflow.data_upload(X_train=X_train, X_test=X_test)
elif self.model_name == "Xgboost":
elif self.model_name == "XGBoost":
self.reg_workflow = XGBoostRegression()
elif self.model_name == "Decision Tree":
self.reg_workflow = DecisionTreeRegression()
Expand Down
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@ dependencies = [
"openpyxl==3.0.10",
"pandas==1.5.2",
"joblib==1.2.0",
"flaml==1.0.14", # required to run Xgboost + FLAML
"numpy==1.23.5", # required to run Xgboost + FLAML
"xgboost==1.6.2", # required to run Xgboost + FLAML and be compatible with M2 chip on Mac
"flaml==1.0.14", # required to run XGBoost + FLAML
"numpy==1.23.5", # required to run XGBoost + FLAML
"xgboost==1.6.2", # required to run XGBoost + FLAML and be compatible with M2 chip on Mac
"threadpoolctl==3.1.0", # required to draw 3d plot for KMeans
"matplotlib==3.5.2", # required to draw 3d plot for KMeans
"fastapi", # backend framework
Expand Down