From 5c542e681e3ff81101cf5849f7dcc188a8a60f12 Mon Sep 17 00:00:00 2001 From: Remi Tschupp Date: Thu, 27 Jun 2024 15:59:43 +0200 Subject: [PATCH] Erasing non util file and updating notebook for models and MLFlow --- Simple_example.py | 80 ---------------- titanic/titanic_models.ipynb | 181 +++++++++++++++++++++-------------- 2 files changed, 108 insertions(+), 153 deletions(-) delete mode 100644 Simple_example.py diff --git a/Simple_example.py b/Simple_example.py deleted file mode 100644 index bb2b464..0000000 --- a/Simple_example.py +++ /dev/null @@ -1,80 +0,0 @@ -import mlflow -from mlflow.models import infer_signature - -import pandas as pd -from sklearn import datasets -from sklearn.model_selection import train_test_split -from sklearn.linear_model import LogisticRegression -from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score -from pyngrok import ngrok -from getpass import getpass - - -# Load the Iris dataset -X, y = datasets.load_iris(return_X_y=True) - -# Split the data into training and test sets -X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42 -) - -# Define the model hyperparameters -params = { - "solver": "lbfgs", - "max_iter": 1000, - "multi_class": "auto", - "random_state": 8888, -} - -# Train the model -lr = LogisticRegression(**params) -lr.fit(X_train, y_train) - -# Predict on the test set -y_pred = lr.predict(X_test) - -# Calculate metrics -accuracy = accuracy_score(y_test, y_pred) - - -# Set our tracking server uri for logging -mlflow.set_tracking_uri(uri="http://127.0.0.1:5000") - -# Create a new MLflow Experiment -mlflow.set_experiment("MLflow Quickstart") - -# Start an MLflow run -with mlflow.start_run(): - # Log the hyperparameters - mlflow.log_params(params) - - # Log the loss metric - mlflow.log_metric("accuracy", accuracy) - - # Set a tag that we can use to remind ourselves what this run was for - mlflow.set_tag("Training Info", "Basic LR model for iris data") - - # Infer the model signature - signature = infer_signature(X_train, lr.predict(X_train)) - - # Log the model - model_info = mlflow.sklearn.log_model( - sk_model=lr, - artifact_path="iris_model", - signature=signature, - input_example=X_train, - registered_model_name="tracking-quickstart", - ) - -# Load the model back for predictions as a generic Python Function model -loaded_model = mlflow.pyfunc.load_model(model_info.model_uri) - -predictions = loaded_model.predict(X_test) - -iris_feature_names = datasets.load_iris().feature_names - -result = pd.DataFrame(X_test, columns=iris_feature_names) -result["actual_class"] = y_test -result["predicted_class"] = predictions - -result[:4] \ No newline at end of file diff --git a/titanic/titanic_models.ipynb b/titanic/titanic_models.ipynb index 75ee362..11e3538 100644 --- a/titanic/titanic_models.ipynb +++ b/titanic/titanic_models.ipynb @@ -2,13 +2,13 @@ "cells": [ { "cell_type": "code", - "execution_count": 41, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import mlflow\n", "from mlflow.models import infer_signature\n", - "\n", + "from mlflow.data.pandas_dataset import PandasDataset\n", "\n", "from sklearn import datasets\n", "from sklearn.model_selection import train_test_split\n", @@ -26,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -42,13 +42,14 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "type_of_dataset = 
\"gentle\"\n", + "source_dataset = os.path.join(gen_dirname,f\"data\\{type_of_dataset}\\labelled.csv\")\n", "\n", - "labeled_data = pd.read_csv(os.path.join(gen_dirname,f\"data\\{type_of_dataset}\\labelled.csv\"))\n", + "labeled_data = pd.read_csv(source_dataset)\n", "\n", "labels = labeled_data[\"Survived\"]\n", "inputs = labeled_data.drop(\"Survived\",axis=\"columns\")\n", @@ -65,7 +66,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -76,81 +77,57 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### XGBoost" + "### LogisticRegression" ] }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Define the model hyperparameters\n", - "params_xgb = {\n", - " \"n_estimators\":20,\n", - " \"max_depth\":100,\n", - " \"learning_rate\": 0.3,\n", - " \"objective\": \"binary:logistic\",\n", + "params_lr = {\n", + " \"solver\": \"lbfgs\",\n", + " \"max_iter\": 1000,\n", + " \"multi_class\": \"auto\",\n", + " \"random_state\": 8888,\n", "}\n", "\n", "# Create model instance\n", - "bst = XGBClassifier(**params_xgb)\n", - "\n", - "# Fit the model\n", - "bst.fit(X_train, y_train)\n", - "\n", - "# # Infer the model signature\n", - "# signature = infer_signature(X_train, bst.predict(X_train))\n", - "\n", - "# # Log the model\n", - "# model_info = mlflow.xgboost.autolog()\n", + "lr = LogisticRegression(**params_lr)\n", "\n", "# Register in list \n", - "list_models.append([\"XGBoost\",params_xgb,bst,mlflow.xgboost.log_model])#,model_info])\n", - "\n" + "list_models.append([\"LogisticRegression\",params_lr,lr,mlflow.sklearn.autolog])#,model_info])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### LogisticRegression" + "### XGBoost" ] }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 6, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "c:\\Users\\RT277831\\Documents\\Projets\\Dauphine\\ML_OPS\\venv\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. 
Leave it to its default value to avoid this warning.\n", - " warnings.warn(\n" - ] - } - ], + "outputs": [], "source": [ "# Define the model hyperparameters\n", - "params_lr = {\n", - " \"solver\": \"lbfgs\",\n", - " \"max_iter\": 1000,\n", - " \"multi_class\": \"auto\",\n", - " \"random_state\": 8888,\n", + "params_xgb = {\n", + " \"n_estimators\":20,\n", + " \"max_depth\":100,\n", + " \"learning_rate\": 0.3,\n", + " \"objective\": \"binary:logistic\",\n", "}\n", "\n", "# Create model instance\n", - "lr = LogisticRegression(**params_lr)\n", - "\n", - "# Fit the model\n", - "lr.fit(X_train, y_train)\n", - "\n", - "# # Log the model\n", - "# model_info = mlflow.sklearn.autolog()\n", + "bst = XGBClassifier(**params_xgb)\n", "\n", "# Register in list \n", - "list_models.append([\"LogisticRegression\",params_lr,lr,mlflow.sklearn.log_model])#,model_info])" + "list_models.append([\"XGBoost\",params_xgb,bst,mlflow.xgboost.autolog])#,model_info])\n", + "\n" ] }, { @@ -169,7 +146,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -200,9 +177,26 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/06/27 15:44:02 WARNING mlflow.utils.autologging_utils: You are using an unsupported version of sklearn. If you encounter errors during autologging, try upgrading / downgrading sklearn to a supported version, or try upgrading MLflow.\n", + "2024/06/27 15:44:02 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"c:\\Users\\RT277831\\Documents\\Projets\\Dauphine\\ML_OPS\\venv\\Lib\\site-packages\\mlflow\\types\\utils.py:406: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values `_ for more details.\"\n", + "c:\\Users\\RT277831\\Documents\\Projets\\Dauphine\\ML_OPS\\venv\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.\n", + " warnings.warn(\n", + "2024/06/27 15:44:02 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"c:\\Users\\RT277831\\Documents\\Projets\\Dauphine\\ML_OPS\\venv\\Lib\\site-packages\\mlflow\\types\\utils.py:406: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. 
See `Handling Integers With Missing Values `_ for more details.\"\n", + "2024/06/27 15:44:05 WARNING mlflow.utils.autologging_utils: You are using an unsupported version of sklearn. If you encounter errors during autologging, try upgrading / downgrading sklearn to a supported version, or try upgrading MLflow.\n", + "2024/06/27 15:44:05 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"c:\\Users\\RT277831\\Documents\\Projets\\Dauphine\\ML_OPS\\venv\\Lib\\site-packages\\mlflow\\types\\utils.py:406: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values `_ for more details.\"\n", + "2024/06/27 15:44:07 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"c:\\Users\\RT277831\\Documents\\Projets\\Dauphine\\ML_OPS\\venv\\Lib\\site-packages\\mlflow\\types\\utils.py:406: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values `_ for more details.\"\n", + "2024/06/27 15:44:07 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"c:\\Users\\RT277831\\Documents\\Projets\\Dauphine\\ML_OPS\\venv\\Lib\\site-packages\\xgboost\\core.py:160: UserWarning: [15:44:07] WARNING: C:\\buildkite-agent\\builds\\buildkite-windows-cpu-autoscaling-group-i-0750514818a16474a-1\\xgboost\\xgboost-ci-windows\\src\\c_api\\c_api.cc:1240: Saving into deprecated binary model format, please consider using `json` or `ubj`. Model format will default to JSON in XGBoost 2.2 if not specified.\"\n", + "2024/06/27 15:44:10 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"c:\\Users\\RT277831\\Documents\\Projets\\Dauphine\\ML_OPS\\venv\\Lib\\site-packages\\mlflow\\types\\utils.py:406: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. 
See `Handling Integers With Missing Values `_ for more details.\"\n" + ] + } + ], "source": [ "# Set our tracking server uri for logging\n", "mlflow.set_tracking_uri(uri=\"http://127.0.0.1:5000\")\n", @@ -215,42 +209,83 @@ "\n", " # Start an MLflow run\n", " with mlflow.start_run():\n", + " \n", + " log_model()\n", + "\n", " # Log the hyperparameters\n", " mlflow.log_params(params)\n", + "\n", + " # Fit the model on training data\n", + " model.fit(X_train, y_train)\n", + "\n", + " log_model(disable=True)\n", + " # Final evaluation on the training sample\n", + " preds_train = model.predict(X_train)\n", + "\n", + " # Log the train metric\n", + " accuracy_train, recall_train, auc_train, cnf_matr_train = eval_metrics(y_train,preds_train)\n", + " mlflow.log_metric(\"accuracy_train\", accuracy_train)\n", + " mlflow.log_metric(\"recall_train\", recall_train)\n", + " mlflow.log_metric(\"auc_train\", auc_train)\n", + "\n", + " fig, ax = plt.subplots()\n", + "\n", + " sns.heatmap(cnf_matr_train, annot=True)\n", + " ax.set_title(\"Feature confusion Matrix Test Set\", fontsize=14)\n", + " plt.tight_layout()\n", + " plt.close(fig)\n", + "\n", + " mlflow.log_figure(fig, \"confusion_matrix_train.png\")\n", " \n", - " preds = model.predict(X_test)\n", + " log_model(disable=False)\n", + " # Make some prediction on the test set\n", + " preds_test = model.predict(X_test)\n", "\n", - " # Log the metric\n", - " accuracy, recall, auc, cnf_matr = eval_metrics(y_test,preds)\n", - " mlflow.log_metric(\"accuracy\", accuracy)\n", - " mlflow.log_metric(\"recall\", recall)\n", - " mlflow.log_metric(\"auc\", auc)\n", + " # Log the tests metric\n", + " accuracy_test, recall_test, auc_test, cnf_matr_test = eval_metrics(y_test,preds_test)\n", + " mlflow.log_metric(\"accuracy_test\", accuracy_test)\n", + " mlflow.log_metric(\"recall_test\", recall_test)\n", + " mlflow.log_metric(\"auc_test\", auc_test)\n", "\n", " fig, ax = plt.subplots()\n", "\n", - " sns.heatmap(cnf_matr, annot=True)\n", - " ax.set_title(\"Feature confusion Matrix\", fontsize=14)\n", + " sns.heatmap(cnf_matr_test, annot=True)\n", + " ax.set_title(\"Feature confusion Matrix Test Set\", fontsize=14)\n", " plt.tight_layout()\n", " plt.close(fig)\n", "\n", - " mlflow.log_figure(fig, \"confusion_matrix.png\")\n", + " mlflow.log_figure(fig, \"confusion_matrix_test.png\")\n", "\n", " # Set a tag that we can use to remind ourselves what this run was for\n", " mlflow.set_tag(\"Training Info\", f\"{name} model training for {type_of_dataset} titanic dataset\")\n", "\n", - " mlflow.set_tag(\"mlflow.runName\", f\"{name}\")\n", - "\n", + " # mlflow.set_tag(\"mlflow.runName\", f\"{name}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "ename": "SyntaxError", + "evalue": "(unicode error) 'unicodeescape' codec can't decode bytes in position 0-1: truncated \\uXXXX escape (822299344.py, line 7)", + "output_type": "error", + "traceback": [ + "\u001b[1;36m Cell \u001b[1;32mIn[10], line 7\u001b[1;36m\u001b[0m\n\u001b[1;33m inference_dataset = os.path.join(gen_dirname,f\"data\\{type_of_dataset}\\unlabelled.csv\")\u001b[0m\n\u001b[1;37m ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m (unicode error) 'unicodeescape' codec can't decode bytes in position 0-1: truncated \\uXXXX escape\n" + ] + } + ], + "source": [ + "model_name = \"XGBoost\"\n", + "model_version = \"1\"\n", + "# Load saved model and make predictions\n", + "model_uri = f\"models:/{model_name}/{model_version}\"\n", + "loaded_model = 
mlflow.pyfunc.load_model(model_uri)\n", "\n", "inference_dataset = os.path.join(gen_dirname,f\"data\\\\{type_of_dataset}\\\\unlabelled.csv\")\n", "\n", "unlabelled_data = pd.read_csv(inference_dataset)" ] } ],