
Merge pull request #43 from aodn/feat/6168-filter-model
Feat/6168 filter model
utas-raymondng authored Feb 17, 2025
2 parents ccd564b + 5034fda commit 69cb0ea
Showing 14 changed files with 488 additions and 87 deletions.
@@ -1,4 +1,4 @@
[preprocessor]
[keywordPreprocessor]
vocabs = AODN Discovery Parameter Vocabulary, AODN Platform Vocabulary
test_size = 0.2
n_splits = 5
@@ -15,4 +15,13 @@ early_stopping_patience = 5
reduce_lr_patience = 5
validation_split = 0.2
confidence = 0.5
top_N = 3
top_N = 2

[filterPreprocessor]
test_size = 0.3

[filterModel]
n_estimators = 10
random_state = 42
threshold = 0.9
n_components = 0.9
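
# Editorial note (not part of the original config): n_estimators and random_state
# configure the RandomForestClassifier base model, threshold is the confidence
# cut-off SelfTrainingClassifier uses before pseudo-labelling a sample, and a
# fractional n_components tells PCA to keep enough components to explain 90% of
# the training variance.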
2 changes: 1 addition & 1 deletion data_discovery_ai/common/constants.py
@@ -1,7 +1,7 @@
API_PREFIX = "/api/v1/ml"
API_KEY_NAME = "X-API-Key"
AVAILABLE_MODELS = ["development", "staging", "production", "experimental", "benchmark"]
KEYWORD_CONFIG = "keyword_classification_parameters.ini"
MODEL_CONFIG = "classification_parameters.ini"
ELASTICSEARCH_CONFIG = "esManager.ini"
KEYWORD_FOLDER = "KeywordClassifier"
KEYWORD_SAMPLE_FILE = "keyword_sample.pkl"
146 changes: 146 additions & 0 deletions data_discovery_ai/model/filteringModel.py
@@ -0,0 +1,146 @@
# The data delivery mode filter model to classify the metadata records based on their titles, abstracts, and lineages.
# Possible classes are 'Real-Time', 'Delayed', and 'Other'.
import logging
import os

# TF_USE_LEGACY_KERAS must be set before tensorflow (or anything that imports it) is loaded, otherwise it has no effect.
os.environ["TF_USE_LEGACY_KERAS"] = "1"

import tensorflow as tf
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
import numpy as np
from pathlib import Path

from typing import Any, Tuple
from configparser import ConfigParser

from data_discovery_ai.common.constants import FILTER_FOLDER
from data_discovery_ai.utils.preprocessor import (
    save_to_file,
    load_from_file,
    get_description_embedding,
)

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


def ddm_filter_model(
    model_name: str,
    X_train: np.ndarray,
    y_train: np.ndarray,
    params: ConfigParser,
) -> Tuple[Any, Any]:
    """
    The classification model for predicting the data delivery mode of metadata records, based on their titles, abstracts, and lineages.
    Currently, we apply a self-training model with a random forest classifier as the base model. It extends the idea of semi-supervised learning, in which both labelled and unlabelled data are used for training: unlabelled samples are iteratively pseudo-labelled once the base classifier predicts them with confidence above `threshold`.
    Input:
        model_name: str. The name of the model, which should be restricted to the options of `AVAILABLE_MODELS` in `data_discovery_ai/common/constants.py`.
        X_train: np.ndarray. The training data of the metadata records, which is split from the labelled data.
        y_train: np.ndarray. The labels of the training data, which are split from the labelled data.
        params: ConfigParser. The configuration parameters for the model, loaded from the `MODEL_CONFIG` defined in `data_discovery_ai/common/constants.py`.
    Output:
        Tuple[Any, Any]. The trained model and the fitted PCA model.
    """
    n_estimators = params.getint("filterModel", "n_estimators")
    random_state = params.getint("filterModel", "random_state")
    threshold = params.getfloat("filterModel", "threshold")
    n_components = params.getfloat("filterModel", "n_components")

    # a fractional n_components keeps enough components to explain that share of the variance
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train)

    base_model = RandomForestClassifier(
        n_estimators=n_estimators, random_state=random_state
    )
    # self-training classifier: pseudo-labels unlabelled samples (encoded as -1 in
    # y_train) whose predicted probability exceeds `threshold`
    self_training_model = SelfTrainingClassifier(base_model, threshold=threshold)
    self_training_model.fit(X_train_pca, y_train)

    # model file path
    model_file_path = (
        Path(__file__).resolve().parent.parent
        / "resources"
        / FILTER_FOLDER
        / model_name
    )

    # make sure path exists
    model_file_path.parent.mkdir(parents=True, exist_ok=True)
    save_to_file(self_training_model, model_file_path.with_suffix(".pkl"))
    save_to_file(pca, model_file_path.with_suffix(".pca.pkl"))

    return self_training_model, pca
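
# A minimal usage sketch (illustrative only, not part of the pipeline). It assumes
# `X_train`/`y_train` are embedding features and labels prepared elsewhere, with
# unlabelled rows in `y_train` encoded as -1 as sklearn's SelfTrainingClassifier
# expects, and `params` is a ConfigParser loaded from the MODEL_CONFIG file:
#   model, pca = ddm_filter_model("development", X_train, y_train, params)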


def load_saved_model(model_name: str) -> Tuple[Any, Any]:
    """
    Load the saved model and the trained PCA model from local pickle files.
    The file name is given by the 'selected_model' field of the API request. The model file ends with the ".pkl" suffix, while the PCA file ends with the ".pca.pkl" suffix.
    Input:
        model_name: str. The name of the model, which should be restricted to the options of `AVAILABLE_MODELS` in `data_discovery_ai/common/constants.py`.
    Output:
        Tuple[Any, Any]. The trained model and the PCA model.
    """
    # load model pickle file
    model_file_path = (
        Path(__file__).resolve().parent.parent
        / "resources"
        / FILTER_FOLDER
        / model_name
    )
    trained_model = load_from_file(model_file_path.with_suffix(".pkl"))

    # load pca pickle file
    pca = load_from_file(model_file_path.with_suffix(".pca.pkl"))

    return trained_model, pca


def evaluate_model(model: Any, X_test: np.ndarray, y_test: np.ndarray, pca) -> None:
    """
    Evaluate the model with the testing data. The evaluation metrics come from the classification report, which is written to the log.
    Input:
        model: Any. The trained model.
        X_test: np.ndarray. The testing data of the metadata records, which is split from the labelled data.
        y_test: np.ndarray. The real class ("Real-Time" or "Delayed") of the testing data, which is split from the labelled data. This is used as the ground truth to evaluate the model.
        pca: Any. The trained PCA model.
    """
    X_test_pca = pca.transform(X_test)
    y_pred = model.predict(X_test_pca)
    report = classification_report(y_test, y_pred)
    logger.info(f"Classification report: \n{report}")
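
# Sketch (illustrative): with a held-out split (e.g. test_size = 0.3 from the
# [filterPreprocessor] section), evaluation is a single call:
#   evaluate_model(model, X_test, y_test, pca)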


def make_prediction(model: Any, description: str, pca) -> np.ndarray:
    """
    Make a prediction for a given metadata record; the description is the combination of its title, abstract, and lineage.
    Input:
        model: Any. The trained model.
        description: str. The textual description of the metadata record, which is the combination of its title, abstract, and lineage.
        pca: Any. The trained PCA model.
    Output:
        np.ndarray. The predicted class of the metadata record (the single element of the prediction array): 0 for "Real-Time", 1 for "Delayed", or 2 for "Other".
    """
    description_embedding = get_description_embedding(description)
    dimension = description_embedding.shape[0]
    target_X = description_embedding.reshape(1, dimension)
    target_X_pca = pca.transform(target_X)

    y_pred = model.predict(target_X_pca)
    return y_pred[0]


def get_predicted_class_name(predicted_class: int) -> str:
    """
    Convert the numeric class to its textual class name.
    Input:
        predicted_class: int. The predicted class of the metadata record. It can be 0, 1, or 2.
    Output:
        str. The textual class name of the predicted class: "Real-Time", "Delayed", or "Other".
    """
    class_map = {0: "Real-Time", 1: "Delayed", 2: "Other"}
    pred_class = class_map.get(predicted_class)
    return pred_class
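
# End-to-end sketch (illustrative; "development" stands in for any name in
# AVAILABLE_MODELS, and `title`, `abstract`, `lineage` are hypothetical strings):
#   model, pca = load_saved_model("development")
#   pred = make_prediction(model, " ".join([title, abstract, lineage]), pca)
#   print(get_predicted_class_name(int(pred)))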
8 changes: 4 additions & 4 deletions data_discovery_ai/model/keywordModel.py
@@ -10,6 +10,7 @@
hamming_loss,
jaccard_score,
)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputClassifier
@@ -26,6 +27,7 @@

import logging
from typing import Dict, Callable, Any, Tuple, Optional, List
from configparser import ConfigParser
import os
from pathlib import Path

@@ -34,8 +36,7 @@
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# TODO: Delete this line after fix 'module not exist issue' in notebooks
KEYWORD_FOLDER = "KeywordClassifier"
from data_discovery_ai.common.constants import KEYWORD_FOLDER


def get_class_weights(Y_train: np.ndarray) -> Dict[int, float]:
@@ -87,7 +88,7 @@ def keyword_model(
class_weight: Dict[int, float],
dim: int,
n_labels: int,
params: Dict[str, Any],
params: ConfigParser,
) -> Tuple[Sequential, Any, str]:
"""
Builds, trains, and evaluates a multi-label classification model for keyword prediction: constructs a neural network with configurable hyperparameters (through the `MODEL_CONFIG` file, `common/classification_parameters.ini`), compiles it with a focal loss function, and trains it on the provided training data.
@@ -273,7 +274,6 @@ def baseline(
baseModel = DecisionTreeClassifier(random_state=42)
baseline_model = MultiOutputClassifier(baseModel)
baseline_model.fit(X_train, Y_train)
# TODO: add more baseline models
else:
raise ValueError(
f"Unsupported model type: {model}. Please choose 'KNN' or 'DT'."
