Skip to content

Commit

Permalink
Erasing non util file and updating notebook for models and MLFlow
Browse files Browse the repository at this point in the history
  • Loading branch information
Remi Tschupp committed Jun 27, 2024
1 parent b79d883 commit 5c542e6
Show file tree
Hide file tree
Showing 2 changed files with 108 additions and 153 deletions.
80 changes: 0 additions & 80 deletions Simple_example.py

This file was deleted.

181 changes: 108 additions & 73 deletions titanic/titanic_models.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
"cells": [
{
"cell_type": "code",
"execution_count": 41,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import mlflow\n",
"from mlflow.models import infer_signature\n",
"\n",
"from mlflow.data.pandas_dataset import PandasDataset\n",
"\n",
"from sklearn import datasets\n",
"from sklearn.model_selection import train_test_split\n",
Expand All @@ -26,7 +26,7 @@
},
{
"cell_type": "code",
"execution_count": 42,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -42,13 +42,14 @@
},
{
"cell_type": "code",
"execution_count": 43,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"type_of_dataset = \"gentle\"\n",
"source_dataset = os.path.join(gen_dirname,f\"data\\{type_of_dataset}\\labelled.csv\")\n",
"\n",
"labeled_data = pd.read_csv(os.path.join(gen_dirname,f\"data\\{type_of_dataset}\\labelled.csv\"))\n",
"labeled_data = pd.read_csv(source_dataset)\n",
"\n",
"labels = labeled_data[\"Survived\"]\n",
"inputs = labeled_data.drop(\"Survived\",axis=\"columns\")\n",
Expand All @@ -65,7 +66,7 @@
},
{
"cell_type": "code",
"execution_count": 44,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -76,81 +77,57 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### XGBoost"
"### LogisticRegression"
]
},
{
"cell_type": "code",
"execution_count": 45,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Define the model hyperparameters\n",
"params_xgb = {\n",
" \"n_estimators\":20,\n",
" \"max_depth\":100,\n",
" \"learning_rate\": 0.3,\n",
" \"objective\": \"binary:logistic\",\n",
"params_lr = {\n",
" \"solver\": \"lbfgs\",\n",
" \"max_iter\": 1000,\n",
" \"multi_class\": \"auto\",\n",
" \"random_state\": 8888,\n",
"}\n",
"\n",
"# Create model instance\n",
"bst = XGBClassifier(**params_xgb)\n",
"\n",
"# Fit the model\n",
"bst.fit(X_train, y_train)\n",
"\n",
"# # Infer the model signature\n",
"# signature = infer_signature(X_train, bst.predict(X_train))\n",
"\n",
"# # Log the model\n",
"# model_info = mlflow.xgboost.autolog()\n",
"lr = LogisticRegression(**params_lr)\n",
"\n",
"# Register in list \n",
"list_models.append([\"XGBoost\",params_xgb,bst,mlflow.xgboost.log_model])#,model_info])\n",
"\n"
"list_models.append([\"LogisticRegression\",params_lr,lr,mlflow.sklearn.autolog])#,model_info])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### LogisticRegression"
"### XGBoost"
]
},
{
"cell_type": "code",
"execution_count": 46,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\RT277831\\Documents\\Projets\\Dauphine\\ML_OPS\\venv\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.\n",
" warnings.warn(\n"
]
}
],
"outputs": [],
"source": [
"# Define the model hyperparameters\n",
"params_lr = {\n",
" \"solver\": \"lbfgs\",\n",
" \"max_iter\": 1000,\n",
" \"multi_class\": \"auto\",\n",
" \"random_state\": 8888,\n",
"params_xgb = {\n",
" \"n_estimators\":20,\n",
" \"max_depth\":100,\n",
" \"learning_rate\": 0.3,\n",
" \"objective\": \"binary:logistic\",\n",
"}\n",
"\n",
"# Create model instance\n",
"lr = LogisticRegression(**params_lr)\n",
"\n",
"# Fit the model\n",
"lr.fit(X_train, y_train)\n",
"\n",
"# # Log the model\n",
"# model_info = mlflow.sklearn.autolog()\n",
"bst = XGBClassifier(**params_xgb)\n",
"\n",
"# Register in list \n",
"list_models.append([\"LogisticRegression\",params_lr,lr,mlflow.sklearn.log_model])#,model_info])"
"list_models.append([\"XGBoost\",params_xgb,bst,mlflow.xgboost.autolog])#,model_info])\n",
"\n"
]
},
{
Expand All @@ -169,7 +146,7 @@
},
{
"cell_type": "code",
"execution_count": 47,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -200,9 +177,26 @@
},
{
"cell_type": "code",
"execution_count": 48,
"execution_count": 8,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024/06/27 15:44:02 WARNING mlflow.utils.autologging_utils: You are using an unsupported version of sklearn. If you encounter errors during autologging, try upgrading / downgrading sklearn to a supported version, or try upgrading MLflow.\n",
"2024/06/27 15:44:02 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"c:\\Users\\RT277831\\Documents\\Projets\\Dauphine\\ML_OPS\\venv\\Lib\\site-packages\\mlflow\\types\\utils.py:406: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.\"\n",
"c:\\Users\\RT277831\\Documents\\Projets\\Dauphine\\ML_OPS\\venv\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.\n",
" warnings.warn(\n",
"2024/06/27 15:44:02 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"c:\\Users\\RT277831\\Documents\\Projets\\Dauphine\\ML_OPS\\venv\\Lib\\site-packages\\mlflow\\types\\utils.py:406: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.\"\n",
"2024/06/27 15:44:05 WARNING mlflow.utils.autologging_utils: You are using an unsupported version of sklearn. If you encounter errors during autologging, try upgrading / downgrading sklearn to a supported version, or try upgrading MLflow.\n",
"2024/06/27 15:44:05 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"c:\\Users\\RT277831\\Documents\\Projets\\Dauphine\\ML_OPS\\venv\\Lib\\site-packages\\mlflow\\types\\utils.py:406: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.\"\n",
"2024/06/27 15:44:07 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"c:\\Users\\RT277831\\Documents\\Projets\\Dauphine\\ML_OPS\\venv\\Lib\\site-packages\\mlflow\\types\\utils.py:406: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.\"\n",
"2024/06/27 15:44:07 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"c:\\Users\\RT277831\\Documents\\Projets\\Dauphine\\ML_OPS\\venv\\Lib\\site-packages\\xgboost\\core.py:160: UserWarning: [15:44:07] WARNING: C:\\buildkite-agent\\builds\\buildkite-windows-cpu-autoscaling-group-i-0750514818a16474a-1\\xgboost\\xgboost-ci-windows\\src\\c_api\\c_api.cc:1240: Saving into deprecated binary model format, please consider using `json` or `ubj`. Model format will default to JSON in XGBoost 2.2 if not specified.\"\n",
"2024/06/27 15:44:10 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"c:\\Users\\RT277831\\Documents\\Projets\\Dauphine\\ML_OPS\\venv\\Lib\\site-packages\\mlflow\\types\\utils.py:406: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.\"\n"
]
}
],
"source": [
"# Set our tracking server uri for logging\n",
"mlflow.set_tracking_uri(uri=\"http://127.0.0.1:5000\")\n",
Expand All @@ -215,42 +209,83 @@
"\n",
" # Start an MLflow run\n",
" with mlflow.start_run():\n",
" \n",
" log_model()\n",
"\n",
" # Log the hyperparameters\n",
" mlflow.log_params(params)\n",
"\n",
" # Fit the model on training data\n",
" model.fit(X_train, y_train)\n",
"\n",
" log_model(disable=True)\n",
" # Final evaluation on the training sample\n",
" preds_train = model.predict(X_train)\n",
"\n",
" # Log the train metric\n",
" accuracy_train, recall_train, auc_train, cnf_matr_train = eval_metrics(y_train,preds_train)\n",
" mlflow.log_metric(\"accuracy_train\", accuracy_train)\n",
" mlflow.log_metric(\"recall_train\", recall_train)\n",
" mlflow.log_metric(\"auc_train\", auc_train)\n",
"\n",
" fig, ax = plt.subplots()\n",
"\n",
" sns.heatmap(cnf_matr_train, annot=True)\n",
" ax.set_title(\"Feature confusion Matrix Test Set\", fontsize=14)\n",
" plt.tight_layout()\n",
" plt.close(fig)\n",
"\n",
" mlflow.log_figure(fig, \"confusion_matrix_train.png\")\n",
" \n",
" preds = model.predict(X_test)\n",
" log_model(disable=False)\n",
" # Make some prediction on the test set\n",
" preds_test = model.predict(X_test)\n",
"\n",
" # Log the metric\n",
" accuracy, recall, auc, cnf_matr = eval_metrics(y_test,preds)\n",
" mlflow.log_metric(\"accuracy\", accuracy)\n",
" mlflow.log_metric(\"recall\", recall)\n",
" mlflow.log_metric(\"auc\", auc)\n",
" # Log the tests metric\n",
" accuracy_test, recall_test, auc_test, cnf_matr_test = eval_metrics(y_test,preds_test)\n",
" mlflow.log_metric(\"accuracy_test\", accuracy_test)\n",
" mlflow.log_metric(\"recall_test\", recall_test)\n",
" mlflow.log_metric(\"auc_test\", auc_test)\n",
"\n",
" fig, ax = plt.subplots()\n",
"\n",
" sns.heatmap(cnf_matr, annot=True)\n",
" ax.set_title(\"Feature confusion Matrix\", fontsize=14)\n",
" sns.heatmap(cnf_matr_test, annot=True)\n",
" ax.set_title(\"Feature confusion Matrix Test Set\", fontsize=14)\n",
" plt.tight_layout()\n",
" plt.close(fig)\n",
"\n",
" mlflow.log_figure(fig, \"confusion_matrix.png\")\n",
" mlflow.log_figure(fig, \"confusion_matrix_test.png\")\n",
"\n",
" # Set a tag that we can use to remind ourselves what this run was for\n",
" mlflow.set_tag(\"Training Info\", f\"{name} model training for {type_of_dataset} titanic dataset\")\n",
"\n",
" mlflow.set_tag(\"mlflow.runName\", f\"{name}\")\n",
"\n",
" # mlflow.set_tag(\"mlflow.runName\", f\"{name}\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"ename": "SyntaxError",
"evalue": "(unicode error) 'unicodeescape' codec can't decode bytes in position 0-1: truncated \\uXXXX escape (822299344.py, line 7)",
"output_type": "error",
"traceback": [
"\u001b[1;36m Cell \u001b[1;32mIn[10], line 7\u001b[1;36m\u001b[0m\n\u001b[1;33m inference_dataset = os.path.join(gen_dirname,f\"data\\{type_of_dataset}\\unlabelled.csv\")\u001b[0m\n\u001b[1;37m ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m (unicode error) 'unicodeescape' codec can't decode bytes in position 0-1: truncated \\uXXXX escape\n"
]
}
],
"source": [
"model_name = \"XGBoost\"\n",
"model_version = \"1\"\n",
"# Load saved model and make predictions\n",
"model_uri = f\"models:/{model_name}/{model_version}\"\n",
"loaded_model = mlflow.pyfunc.load_model(model_uri)\n",
"\n",
" # model_info = log_model()\n",
" # # Infer the model signature\n",
" # signature = infer_signature(X_train, model.predict(X_train))\n",
"inference_dataset = os.path.join(gen_dirname,f\"data\\\\{type_of_dataset}\\\\unlabelled.csv\")\n",
"\n",
" # model_info =log_model(\n",
" # artifact_path=f\"{type_of_dataset}_{name}\",\n",
" # signature=signature,\n",
" # input_example=X_train,\n",
" # registered_model_name=f\" {name}\",\n",
" # )"
"unllabeled_data = pd.read_csv(inference_dataset)"
]
}
],
Expand Down

0 comments on commit 5c542e6

Please sign in to comment.