
Commit
Merge pull request #283 from ZJUEarthData/dev/Mengqi
perf: resolve compatibility issues between binary classification and multi-classification.
SanyHe authored Nov 22, 2023
2 parents cf754bd + f20cc3f commit b776957
Showing 3 changed files with 48 additions and 22 deletions.
2 changes: 2 additions & 0 deletions geochemistrypi/data_mining/constants.py
@@ -72,3 +72,5 @@
CUSTOMIZE_LABEL_STRATEGY = ["Automatic Coding", "Custom Numeric Labels", "Custom Non-numeric Labels"]

 FEATURE_SELECTION_STRATEGY = ["GenericUnivariateSelect", "SelectKBest"]
+
+CALCULATION_METHOD_OPTION = ["Micro", "Macro", "Weighted"]
4 changes: 3 additions & 1 deletion geochemistrypi/data_mining/model/classification.py
@@ -154,7 +154,9 @@ def _plot_confusion_matrix(y_test: pd.DataFrame, y_test_predict: pd.DataFrame, t
print("-----* Confusion Matrix *-----")
data = plot_confusion_matrix(y_test, y_test_predict, trained_model)
save_fig(f"Confusion Matrix - {algorithm_name}", local_path, mlflow_path)
-data = pd.DataFrame(data, columns=["Predicted Negative", "Predicted Positive"], index=["Actual Negative", "Actual Positive"])
+index = [f"true_{i}" for i in range(int(y_test.nunique().values))]
+columns = [f"pred_{i}" for i in range(int(y_test.nunique().values))]
+data = pd.DataFrame(data, columns=columns, index=index)
save_data(data, f"Confusion Matrix - {algorithm_name}", local_path, mlflow_path, True)

@staticmethod
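
Note: the hunk above replaces hard-coded binary labels with labels generated from the observed class count, so the saved confusion matrix works for any number of classes. A minimal standalone sketch of the same labeling scheme, using scikit-learn's confusion_matrix on hypothetical 3-class data (all names here are illustrative, not from the repository):

import pandas as pd
from sklearn.metrics import confusion_matrix

# Hypothetical 3-class targets; single-column DataFrames mirror the shape of y_test in the repo.
y_test = pd.DataFrame({"label": [0, 1, 2, 2, 1, 0, 2, 1]})
y_test_predict = pd.DataFrame({"label": [0, 2, 2, 2, 1, 0, 1, 1]})

n_classes = y_test["label"].nunique()  # the diff spells this int(y_test.nunique().values)
data = confusion_matrix(y_test["label"], y_test_predict["label"])
index = [f"true_{i}" for i in range(n_classes)]
columns = [f"pred_{i}" for i in range(n_classes)]
print(pd.DataFrame(data, columns=columns, index=index))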
@@ -6,6 +6,8 @@
import mlflow
import numpy as np
import pandas as pd
+from data_mining.constants import CALCULATION_METHOD_OPTION, SECTION
+from data_mining.data.data_readiness import limit_num_input, num2option, num_input
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
@@ -31,10 +33,24 @@ def score(y_true: pd.DataFrame, y_predict: pd.DataFrame) -> Dict:
scores : dict
The scores of the classification model.
"""
+average = "binary"
+if int(y_true.nunique().values) > 2:
+    print("Please select calculation method:")
+    print("[bold green]Micro[/bold green]: Calculate metrics globally by counting the total true positives, false negatives and false positives.")
+    print("[bold green]Macro[/bold green]: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account.")
+    print("[bold green]Weighted[/bold green]: Calculate metrics for each label, and find their average weighted by support (the number of true instances for each label).")
+    num2option(CALCULATION_METHOD_OPTION)
+    average_num = limit_num_input(CALCULATION_METHOD_OPTION, SECTION[0], num_input)
+    if average_num == 1:
+        average = "micro"
+    elif average_num == 2:
+        average = "macro"
+    elif average_num == 3:
+        average = "weighted"
 accuracy = accuracy_score(y_true, y_predict)
-precision = precision_score(y_true, y_predict)
-recall = recall_score(y_true, y_predict)
-f1 = f1_score(y_true, y_predict)
+precision = precision_score(y_true, y_predict, average=average)
+recall = recall_score(y_true, y_predict, average=average)
+f1 = f1_score(y_true, y_predict, average=average)
print("Accuracy: ", accuracy)
print("Precision:", precision)
print("Recall:", recall)
@@ -177,16 +193,19 @@ def plot_precision_recall(X_test, y_test, trained_model: object, algorithm_name:
thresholds : np.ndarray
The thresholds of the model.
"""
-# Predict probabilities for the positive class
-y_probs = trained_model.predict_proba(X_test)[:, 1]
-precisions, recalls, thresholds = precision_recall_curve(y_test, y_probs)
+if int(y_test.nunique().values) == 2:
+    # Predict probabilities for the positive class
+    y_probs = trained_model.predict_proba(X_test)[:, 1]
+    precisions, recalls, thresholds = precision_recall_curve(y_test, y_probs)
 
-plt.figure()
-plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
-plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
-plt.legend(labels=["Precision", "Recall"], loc="best")
-plt.title(f"Precision Recall Curve - {algorithm_name}")
-return y_probs, precisions, recalls, thresholds
+    plt.figure()
+    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
+    plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
+    plt.legend(labels=["Precision", "Recall"], loc="best")
+    plt.title(f"Precision Recall Curve - {algorithm_name}")
+    return y_probs, precisions, recalls, thresholds
+else:
+    return None, None, None, None
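
Note: the commit sidesteps multi-class targets by returning None, since the threshold-based curve above is defined only for a single positive class. If per-class curves were wanted instead, a common alternative is one-vs-rest: binarize the labels and draw one curve per class. A sketch under that assumption (plot_precision_recall_ovr is a hypothetical helper, not part of this codebase):

import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import precision_recall_curve
from sklearn.preprocessing import label_binarize

def plot_precision_recall_ovr(X_test, y_test, trained_model, algorithm_name):
    # One-vs-rest: one precision-recall curve per class.
    y = np.asarray(y_test).ravel()
    classes = np.unique(y)
    y_bin = label_binarize(y, classes=classes)     # shape (n_samples, n_classes)
    y_probs = trained_model.predict_proba(X_test)  # shape (n_samples, n_classes)
    plt.figure()
    for i, cls in enumerate(classes):
        precisions, recalls, _ = precision_recall_curve(y_bin[:, i], y_probs[:, i])
        plt.plot(recalls, precisions, label=f"class {cls}")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.legend(loc="best")
    plt.title(f"Precision Recall Curve (OvR) - {algorithm_name}")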


def plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_model: object, algorithm_name: str) -> tuple:
@@ -220,15 +239,18 @@ def plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_model: object,
thresholds : np.ndarray
The thresholds of the model.
"""
-y_probs = trained_model.predict_proba(X_test)[:, 1]
-fpr, tpr, thresholds = roc_curve(y_test, y_probs)
-plt.figure()
-plt.plot(fpr, tpr, linewidth=2)
-plt.plot([0, 1], [0, 1], "r--")
-plt.xlabel("False Positive Rate")
-plt.ylabel("True Positive Rate (Recall)")
-plt.title(f"ROC Curve - {algorithm_name}")
-return y_probs, fpr, tpr, thresholds
+if int(y_test.nunique().values) == 2:
+    y_probs = trained_model.predict_proba(X_test)[:, 1]
+    fpr, tpr, thresholds = roc_curve(y_test, y_probs)
+    plt.figure()
+    plt.plot(fpr, tpr, linewidth=2)
+    plt.plot([0, 1], [0, 1], "r--")
+    plt.xlabel("False Positive Rate")
+    plt.ylabel("True Positive Rate (Recall)")
+    plt.title(f"ROC Curve - {algorithm_name}")
+    return y_probs, fpr, tpr, thresholds
+else:
+    return None, None, None, None
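
Note: ROC plotting is likewise skipped for multi-class targets. If a single summary number suffices, scikit-learn's roc_auc_score supports multi-class AUC directly by averaging one-vs-rest per-class ROC curves; a minimal sketch (roc_auc_multiclass is a hypothetical helper, not from this codebase):

import numpy as np
from sklearn.metrics import roc_auc_score

def roc_auc_multiclass(X_test, y_test, trained_model):
    # Macro-averaged one-vs-rest AUC over per-class ROC curves.
    y_probs = trained_model.predict_proba(X_test)
    return roc_auc_score(np.asarray(y_test).ravel(), y_probs, multi_class="ovr", average="macro")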


def plot_2d_decision_boundary(X: pd.DataFrame, X_test: pd.DataFrame, trained_model: object, image_config: Dict) -> None:
