
Merge pull request #43 from aodn/feat/6168-filter-model
Feat/6168 filter model
utas-raymondng authored Feb 17, 2025
2 parents ccd564b + 5034fda commit 69cb0ea
Showing 14 changed files with 488 additions and 87 deletions.
@@ -1,4 +1,4 @@
[preprocessor]
[keywordPreprocessor]
vocabs = AODN Discovery Parameter Vocabulary, AODN Platform Vocabulary
test_size = 0.2
n_splits = 5
@@ -15,4 +15,13 @@ early_stopping_patience = 5
reduce_lr_patience = 5
validation_split = 0.2
confidence = 0.5
top_N = 3
top_N = 2

[filterPreprocessor]
test_size = 0.3

[filterModel]
n_estimators = 10
random_state = 42
threshold = 0.9
n_components = 0.9
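
# Editorial note (not part of the original config): n_estimators and random_state
# configure the RandomForestClassifier base model, threshold is the confidence
# cut-off SelfTrainingClassifier uses before pseudo-labelling a sample, and a
# fractional n_components tells PCA to keep enough components to explain 90% of
# the training variance.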
2 changes: 1 addition & 1 deletion data_discovery_ai/common/constants.py
@@ -1,7 +1,7 @@
API_PREFIX = "/api/v1/ml"
API_KEY_NAME = "X-API-Key"
AVAILABLE_MODELS = ["development", "staging", "production", "experimental", "benchmark"]
KEYWORD_CONFIG = "keyword_classification_parameters.ini"
MODEL_CONFIG = "classification_parameters.ini"
ELASTICSEARCH_CONFIG = "esManager.ini"
KEYWORD_FOLDER = "KeywordClassifier"
KEYWORD_SAMPLE_FILE = "keyword_sample.pkl"
146 changes: 146 additions & 0 deletions data_discovery_ai/model/filteringModel.py
@@ -0,0 +1,146 @@
# The data delivery mode filter model to classify the metadata records based on their titles, abstracts, and lineages.
# Possible classes are 'Real-Time', 'Delayed', and 'Other'.
import logging
import os

# TF_USE_LEGACY_KERAS must be set before tensorflow (or anything that imports it) is loaded, otherwise it has no effect.
os.environ["TF_USE_LEGACY_KERAS"] = "1"

import tensorflow as tf
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
import numpy as np
from pathlib import Path

from typing import Any, Tuple
from configparser import ConfigParser

from data_discovery_ai.common.constants import FILTER_FOLDER
from data_discovery_ai.utils.preprocessor import (
    save_to_file,
    load_from_file,
    get_description_embedding,
)

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


def ddm_filter_model(
    model_name: str,
    X_train: np.ndarray,
    y_train: np.ndarray,
    params: ConfigParser,
) -> Tuple[Any, Any]:
    """
    The classification model for predicting the data delivery mode of metadata records, based on their titles, abstracts, and lineages.
    Currently, we apply a self-training model with a random forest classifier as the base model. It extends the idea of semi-supervised learning, in which both labelled and unlabelled data are used for training: unlabelled samples are iteratively pseudo-labelled once the base classifier predicts them with confidence above `threshold`.
    Input:
        model_name: str. The name of the model, which should be restricted to the options of `AVAILABLE_MODELS` in `data_discovery_ai/common/constants.py`.
        X_train: np.ndarray. The training data of the metadata records, which is split from the labelled data.
        y_train: np.ndarray. The labels of the training data, which are split from the labelled data.
        params: ConfigParser. The configuration parameters for the model, loaded from the `MODEL_CONFIG` defined in `data_discovery_ai/common/constants.py`.
    Output:
        Tuple[Any, Any]. The trained model and the fitted PCA model.
    """
    n_estimators = params.getint("filterModel", "n_estimators")
    random_state = params.getint("filterModel", "random_state")
    threshold = params.getfloat("filterModel", "threshold")
    n_components = params.getfloat("filterModel", "n_components")

    # a fractional n_components keeps enough components to explain that share of the variance
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train)

    base_model = RandomForestClassifier(
        n_estimators=n_estimators, random_state=random_state
    )
    # self-training classifier: pseudo-labels unlabelled samples (encoded as -1 in
    # y_train) whose predicted probability exceeds `threshold`
    self_training_model = SelfTrainingClassifier(base_model, threshold=threshold)
    self_training_model.fit(X_train_pca, y_train)

    # model file path
    model_file_path = (
        Path(__file__).resolve().parent.parent
        / "resources"
        / FILTER_FOLDER
        / model_name
    )

    # make sure path exists
    model_file_path.parent.mkdir(parents=True, exist_ok=True)
    save_to_file(self_training_model, model_file_path.with_suffix(".pkl"))
    save_to_file(pca, model_file_path.with_suffix(".pca.pkl"))

    return self_training_model, pca
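
# A minimal usage sketch (illustrative only, not part of the pipeline). It assumes
# `X_train`/`y_train` are embedding features and labels prepared elsewhere, with
# unlabelled rows in `y_train` encoded as -1 as sklearn's SelfTrainingClassifier
# expects, and `params` is a ConfigParser loaded from the MODEL_CONFIG file:
#   model, pca = ddm_filter_model("development", X_train, y_train, params)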


def load_saved_model(model_name: str) -> Tuple[Any, Any]:
    """
    Load the saved model and the trained PCA model from local pickle files.
    The file name is given by the 'selected_model' field of the API request. The model file ends with the ".pkl" suffix, while the PCA file ends with the ".pca.pkl" suffix.
    Input:
        model_name: str. The name of the model, which should be restricted to the options of `AVAILABLE_MODELS` in `data_discovery_ai/common/constants.py`.
    Output:
        Tuple[Any, Any]. The trained model and the PCA model.
    """
    # load model pickle file
    model_file_path = (
        Path(__file__).resolve().parent.parent
        / "resources"
        / FILTER_FOLDER
        / model_name
    )
    trained_model = load_from_file(model_file_path.with_suffix(".pkl"))

    # load pca pickle file
    pca = load_from_file(model_file_path.with_suffix(".pca.pkl"))

    return trained_model, pca


def evaluate_model(model: Any, X_test: np.ndarray, y_test: np.ndarray, pca) -> None:
    """
    Evaluate the model with the testing data. The evaluation metrics come from the classification report, which is written to the log.
    Input:
        model: Any. The trained model.
        X_test: np.ndarray. The testing data of the metadata records, which is split from the labelled data.
        y_test: np.ndarray. The real class ("Real-Time" or "Delayed") of the testing data, which is split from the labelled data. This is used as the ground truth to evaluate the model.
        pca: Any. The trained PCA model.
    """
    X_test_pca = pca.transform(X_test)
    y_pred = model.predict(X_test_pca)
    report = classification_report(y_test, y_pred)
    logger.info(f"Classification report: \n{report}")
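
# Sketch (illustrative): with a held-out split (e.g. test_size = 0.3 from the
# [filterPreprocessor] section), evaluation is a single call:
#   evaluate_model(model, X_test, y_test, pca)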


def make_prediction(model: Any, description: str, pca) -> np.ndarray:
    """
    Make a prediction for a given metadata record; the description is the combination of its title, abstract, and lineage.
    Input:
        model: Any. The trained model.
        description: str. The textual description of the metadata record, which is the combination of its title, abstract, and lineage.
        pca: Any. The trained PCA model.
    Output:
        np.ndarray. The predicted class of the metadata record (the single element of the prediction array): 0 for "Real-Time", 1 for "Delayed", or 2 for "Other".
    """
    description_embedding = get_description_embedding(description)
    dimension = description_embedding.shape[0]
    target_X = description_embedding.reshape(1, dimension)
    target_X_pca = pca.transform(target_X)

    y_pred = model.predict(target_X_pca)
    return y_pred[0]


def get_predicted_class_name(predicted_class: int) -> str:
    """
    Convert the numeric class to its textual class name.
    Input:
        predicted_class: int. The predicted class of the metadata record. It can be 0, 1, or 2.
    Output:
        str. The textual class name of the predicted class: "Real-Time", "Delayed", or "Other".
    """
    class_map = {0: "Real-Time", 1: "Delayed", 2: "Other"}
    pred_class = class_map.get(predicted_class)
    return pred_class
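
# End-to-end sketch (illustrative; "development" stands in for any name in
# AVAILABLE_MODELS, and `title`, `abstract`, `lineage` are hypothetical strings):
#   model, pca = load_saved_model("development")
#   pred = make_prediction(model, " ".join([title, abstract, lineage]), pca)
#   print(get_predicted_class_name(int(pred)))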
8 changes: 4 additions & 4 deletions data_discovery_ai/model/keywordModel.py
@@ -10,6 +10,7 @@
hamming_loss,
jaccard_score,
)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputClassifier
@@ -26,6 +27,7 @@

import logging
from typing import Dict, Callable, Any, Tuple, Optional, List
from configparser import ConfigParser
import os
from pathlib import Path

@@ -34,8 +36,7 @@
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# TODO: Delete this line after fix 'module not exist issue' in notebooks
KEYWORD_FOLDER = "KeywordClassifier"
from data_discovery_ai.common.constants import KEYWORD_FOLDER


def get_class_weights(Y_train: np.ndarray) -> Dict[int, float]:
@@ -87,7 +88,7 @@ def keyword_model(
class_weight: Dict[int, float],
dim: int,
n_labels: int,
params: Dict[str, Any],
params: ConfigParser,
) -> Tuple[Sequential, Any, str]:
"""
Builds, trains, and evaluates a multi-label classification model for keyword prediction: constructs a neural network with configurable hyperparameters (through the `MODEL_CONFIG` file, `common/classification_parameters.ini`), compiles it with a focal loss function, and trains it on the provided training data.
@@ -273,7 +274,6 @@ def baseline(
baseModel = DecisionTreeClassifier(random_state=42)
baseline_model = MultiOutputClassifier(baseModel)
baseline_model.fit(X_train, Y_train)
# TODO: add more baseline models
else:
raise ValueError(
f"Unsupported model type: {model}. Please choose 'KNN' or 'DT'."
