Removing unused file and updating notebook for models and MLflow

tschuppr · Jun 27, 2024 · 5c542e6 · 5c542e6
1 parent b79d883
commit 5c542e6
Show file tree

Hide file tree

Showing 2 changed files with 108 additions and 153 deletions.
diff --git a/Simple_example.py b/Simple_example.py
diff --git a/titanic/titanic_models.ipynb b/titanic/titanic_models.ipynb
@@ -2,13 +2,13 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
     "import mlflow\n",
     "from mlflow.models import infer_signature\n",
-    "\n",
+    "from mlflow.data.pandas_dataset import PandasDataset\n",
     "\n",
     "from sklearn import datasets\n",
     "from sklearn.model_selection import train_test_split\n",
@@ -26,7 +26,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -42,13 +42,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 43,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
     "type_of_dataset = \"gentle\"\n",
+    "source_dataset = os.path.join(gen_dirname,f\"data\\{type_of_dataset}\\labelled.csv\")\n",
     "\n",
-    "labeled_data = pd.read_csv(os.path.join(gen_dirname,f\"data\\{type_of_dataset}\\labelled.csv\"))\n",
+    "labeled_data = pd.read_csv(source_dataset)\n",
     "\n",
     "labels = labeled_data[\"Survived\"]\n",
     "inputs = labeled_data.drop(\"Survived\",axis=\"columns\")\n",
@@ -65,7 +66,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 44,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -76,81 +77,57 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### XGBoost"
+    "### LogisticRegression"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 45,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
     "# Define the model hyperparameters\n",
-    "params_xgb = {\n",
-    "    \"n_estimators\":20,\n",
-    "    \"max_depth\":100,\n",
-    "    \"learning_rate\": 0.3,\n",
-    "    \"objective\": \"binary:logistic\",\n",
+    "params_lr = {\n",
+    "    \"solver\": \"lbfgs\",\n",
+    "    \"max_iter\": 1000,\n",
+    "    \"multi_class\": \"auto\",\n",
+    "    \"random_state\": 8888,\n",
     "}\n",
     "\n",
     "# Create model instance\n",
-    "bst = XGBClassifier(**params_xgb)\n",
-    "\n",
-    "# Fit the model\n",
-    "bst.fit(X_train, y_train)\n",
-    "\n",
-    "# # Infer the model signature\n",
-    "# signature = infer_signature(X_train, bst.predict(X_train))\n",
-    "\n",
-    "# # Log the model\n",
-    "# model_info = mlflow.xgboost.autolog()\n",
+    "lr = LogisticRegression(**params_lr)\n",
     "\n",
     "# Register in list \n",
-    "list_models.append([\"XGBoost\",params_xgb,bst,mlflow.xgboost.log_model])#,model_info])\n",
-    "\n"
+    "list_models.append([\"LogisticRegression\",params_lr,lr,mlflow.sklearn.autolog])#,model_info])"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### LogisticRegression"
+    "### XGBoost"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 46,
+   "execution_count": 6,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "c:\\Users\\RT277831\\Documents\\Projets\\Dauphine\\ML_OPS\\venv\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.\n",
-      "  warnings.warn(\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Define the model hyperparameters\n",
-    "params_lr = {\n",
-    "    \"solver\": \"lbfgs\",\n",
-    "    \"max_iter\": 1000,\n",
-    "    \"multi_class\": \"auto\",\n",
-    "    \"random_state\": 8888,\n",
+    "params_xgb = {\n",
+    "    \"n_estimators\":20,\n",
+    "    \"max_depth\":100,\n",
+    "    \"learning_rate\": 0.3,\n",
+    "    \"objective\": \"binary:logistic\",\n",
     "}\n",
     "\n",
     "# Create model instance\n",
-    "lr = LogisticRegression(**params_lr)\n",
-    "\n",
-    "# Fit the model\n",
-    "lr.fit(X_train, y_train)\n",
-    "\n",
-    "# # Log the model\n",
-    "# model_info = mlflow.sklearn.autolog()\n",
+    "bst = XGBClassifier(**params_xgb)\n",
     "\n",
     "# Register in list \n",
-    "list_models.append([\"LogisticRegression\",params_lr,lr,mlflow.sklearn.log_model])#,model_info])"
+    "list_models.append([\"XGBoost\",params_xgb,bst,mlflow.xgboost.autolog])#,model_info])\n",
+    "\n"
    ]
   },
   {
@@ -169,7 +146,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 47,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -200,9 +177,26 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 48,
+   "execution_count": 8,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/06/27 15:44:02 WARNING mlflow.utils.autologging_utils: You are using an unsupported version of sklearn. If you encounter errors during autologging, try upgrading / downgrading sklearn to a supported version, or try upgrading MLflow.\n",
+      "2024/06/27 15:44:02 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"c:\\Users\\RT277831\\Documents\\Projets\\Dauphine\\ML_OPS\\venv\\Lib\\site-packages\\mlflow\\types\\utils.py:406: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.\"\n",
+      "c:\\Users\\RT277831\\Documents\\Projets\\Dauphine\\ML_OPS\\venv\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.\n",
+      "  warnings.warn(\n",
+      "2024/06/27 15:44:02 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"c:\\Users\\RT277831\\Documents\\Projets\\Dauphine\\ML_OPS\\venv\\Lib\\site-packages\\mlflow\\types\\utils.py:406: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.\"\n",
+      "2024/06/27 15:44:05 WARNING mlflow.utils.autologging_utils: You are using an unsupported version of sklearn. If you encounter errors during autologging, try upgrading / downgrading sklearn to a supported version, or try upgrading MLflow.\n",
+      "2024/06/27 15:44:05 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"c:\\Users\\RT277831\\Documents\\Projets\\Dauphine\\ML_OPS\\venv\\Lib\\site-packages\\mlflow\\types\\utils.py:406: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.\"\n",
+      "2024/06/27 15:44:07 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"c:\\Users\\RT277831\\Documents\\Projets\\Dauphine\\ML_OPS\\venv\\Lib\\site-packages\\mlflow\\types\\utils.py:406: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.\"\n",
+      "2024/06/27 15:44:07 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"c:\\Users\\RT277831\\Documents\\Projets\\Dauphine\\ML_OPS\\venv\\Lib\\site-packages\\xgboost\\core.py:160: UserWarning: [15:44:07] WARNING: C:\\buildkite-agent\\builds\\buildkite-windows-cpu-autoscaling-group-i-0750514818a16474a-1\\xgboost\\xgboost-ci-windows\\src\\c_api\\c_api.cc:1240: Saving into deprecated binary model format, please consider using `json` or `ubj`. Model format will default to JSON in XGBoost 2.2 if not specified.\"\n",
+      "2024/06/27 15:44:10 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"c:\\Users\\RT277831\\Documents\\Projets\\Dauphine\\ML_OPS\\venv\\Lib\\site-packages\\mlflow\\types\\utils.py:406: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.\"\n"
+     ]
+    }
+   ],
    "source": [
     "# Set our tracking server uri for logging\n",
     "mlflow.set_tracking_uri(uri=\"http://127.0.0.1:5000\")\n",
@@ -215,42 +209,83 @@
     "\n",
     "    # Start an MLflow run\n",
     "    with mlflow.start_run():\n",
+    "        \n",
+    "        log_model()\n",
+    "\n",
     "        # Log the hyperparameters\n",
     "        mlflow.log_params(params)\n",
+    "\n",
+    "        # Fit the model on training data\n",
+    "        model.fit(X_train, y_train)\n",
+    "\n",
+    "        log_model(disable=True)\n",
+    "        # Final evaluation on the training sample\n",
+    "        preds_train = model.predict(X_train)\n",
+    "\n",
+    "        # Log the training metrics\n",
+    "        accuracy_train, recall_train, auc_train, cnf_matr_train = eval_metrics(y_train,preds_train)\n",
+    "        mlflow.log_metric(\"accuracy_train\", accuracy_train)\n",
+    "        mlflow.log_metric(\"recall_train\", recall_train)\n",
+    "        mlflow.log_metric(\"auc_train\", auc_train)\n",
+    "\n",
+    "        fig, ax = plt.subplots()\n",
+    "\n",
+    "        sns.heatmap(cnf_matr_train, annot=True)\n",
+    "        ax.set_title(\"Feature confusion Matrix Test Set\", fontsize=14)\n",
+    "        plt.tight_layout()\n",
+    "        plt.close(fig)\n",
+    "\n",
+    "        mlflow.log_figure(fig, \"confusion_matrix_train.png\")\n",
     "        \n",
-    "        preds = model.predict(X_test)\n",
+    "        log_model(disable=False)\n",
+    "        # Make predictions on the test set\n",
+    "        preds_test = model.predict(X_test)\n",
     "\n",
-    "        # Log the metric\n",
-    "        accuracy, recall, auc, cnf_matr = eval_metrics(y_test,preds)\n",
-    "        mlflow.log_metric(\"accuracy\", accuracy)\n",
-    "        mlflow.log_metric(\"recall\", recall)\n",
-    "        mlflow.log_metric(\"auc\", auc)\n",
+    "        # Log the test metrics\n",
+    "        accuracy_test, recall_test, auc_test, cnf_matr_test = eval_metrics(y_test,preds_test)\n",
+    "        mlflow.log_metric(\"accuracy_test\", accuracy_test)\n",
+    "        mlflow.log_metric(\"recall_test\", recall_test)\n",
+    "        mlflow.log_metric(\"auc_test\", auc_test)\n",
     "\n",
     "        fig, ax = plt.subplots()\n",
     "\n",
-    "        sns.heatmap(cnf_matr, annot=True)\n",
-    "        ax.set_title(\"Feature confusion Matrix\", fontsize=14)\n",
+    "        sns.heatmap(cnf_matr_test, annot=True)\n",
+    "        ax.set_title(\"Feature confusion Matrix Test Set\", fontsize=14)\n",
     "        plt.tight_layout()\n",
     "        plt.close(fig)\n",
     "\n",
-    "        mlflow.log_figure(fig, \"confusion_matrix.png\")\n",
+    "        mlflow.log_figure(fig, \"confusion_matrix_test.png\")\n",
     "\n",
     "        # Set a tag that we can use to remind ourselves what this run was for\n",
     "        mlflow.set_tag(\"Training Info\", f\"{name} model training for {type_of_dataset} titanic dataset\")\n",
     "\n",
-    "        mlflow.set_tag(\"mlflow.runName\", f\"{name}\")\n",
-    "\n",
+    "        # mlflow.set_tag(\"mlflow.runName\", f\"{name}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "SyntaxError",
+     "evalue": "(unicode error) 'unicodeescape' codec can't decode bytes in position 0-1: truncated \\uXXXX escape (822299344.py, line 7)",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;36m  Cell \u001b[1;32mIn[10], line 7\u001b[1;36m\u001b[0m\n\u001b[1;33m    inference_dataset = os.path.join(gen_dirname,f\"data\\{type_of_dataset}\\unlabelled.csv\")\u001b[0m\n\u001b[1;37m                                                                                         ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m (unicode error) 'unicodeescape' codec can't decode bytes in position 0-1: truncated \\uXXXX escape\n"
+     ]
+    }
+   ],
+   "source": [
+    "model_name = \"XGBoost\"\n",
+    "model_version = \"1\"\n",
+    "# Load saved model and make predictions\n",
+    "model_uri = f\"models:/{model_name}/{model_version}\"\n",
+    "loaded_model = mlflow.pyfunc.load_model(model_uri)\n",
     "\n",
-    "        # model_info = log_model()\n",
-    "        # # Infer the model signature\n",
-    "        # signature = infer_signature(X_train, model.predict(X_train))\n",
+    "inference_dataset = os.path.join(gen_dirname,f\"data\\\\{type_of_dataset}\\\\unlabelled.csv\")\n",
     "\n",
-    "        # model_info =log_model(\n",
-    "        #     artifact_path=f\"{type_of_dataset}_{name}\",\n",
-    "        #     signature=signature,\n",
-    "        #     input_example=X_train,\n",
-    "        #     registered_model_name=f\" {name}\",\n",
-    "        # )"
+    "unllabeled_data = pd.read_csv(inference_dataset)"
    ]
   }
  ],