From 0e4b3efbd4908718e753a99fe09724596db2dc63 Mon Sep 17 00:00:00 2001 From: Merari Santana Date: Fri, 24 Jan 2025 21:51:00 -0800 Subject: [PATCH 1/4] qq_and_residuals_plot function documentation added to example.ipynb --- docs/example.ipynb | 411 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 316 insertions(+), 95 deletions(-) diff --git a/docs/example.ipynb b/docs/example.ipynb index 04394cf..ecd5d2e 100644 --- a/docs/example.ipynb +++ b/docs/example.ipynb @@ -1,97 +1,318 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Example usage\n", - "\n", - "To use `linreg_ally` in a project:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import linreg_ally\n", - "\n", - "print(linreg_ally.__version__)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Imports" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Paramveer - EDA" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Alex - VIF " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Cheng - model fitting" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Merari - plot" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" - } - }, - "nbformat": 4, - "nbformat_minor": 4 + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Example usage\n", + "\n", + "To use `linreg_ally` in a project:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.1.0\n" + ] + } + ], + "source": [ + "import linreg_ally\n", + "\n", + "print(linreg_ally.__version__)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Paramveer - EDA" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Alex - VIF " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Cheng - model fitting" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model Summary\n", + "------------------------\n", + "Test r2: 0.785\n", + "Test neg_mean_squared_error: 345.987\n" + ] + } + ], + "source": [ + "# Merari - plot (using Cheng's function outputs)\n", + "from vega_datasets import data\n", + "from linreg_ally.models import run_linear_regression\n", + "\n", + "df = 
data.cars()\n", + "df = df[['Horsepower', 'Displacement']].dropna()\n", + "\n", + "# Define parameters for run_linear_regression\n", + "dataframe = df\n", + "target_column = \"Horsepower\"\n", + "numeric_feats = [\"Displacement\"] \n", + "categorical_feats = [] # No categorical features in this case\n", + "drop_feats = None # No columns to drop\n", + "random_state = 123\n", + "\n", + "model_results = run_linear_regression(\n", + " dataframe=dataframe,\n", + " target_column=target_column,\n", + " numeric_feats=numeric_feats,\n", + " categorical_feats=categorical_feats,\n", + " drop_feats=drop_feats,\n", + " random_state=random_state\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Checking Normality and Homoscedasticity of Residuals\n", + "\n", + "A linear regression model assumes that residuals are normally distributed and have constant variance (homoscedasticity). To check whether these assumptions are met, we use the `qq_and_residuals_plot` function. This function generates:\n", + "\n", + "1. A Quantile-Quantile (Q-Q) plot to assess the normality of residuals.\n", + "2. A Residuals vs. Fitted Values plot to check for homoscedasticity." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `qq_and_residuals_plot` function takes two parameters: `y_actual` and `y_predicted`. These values were extracted from the linear regression model we previously created." + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "# Unpack the returned values correctly\n", + "best_model, X_train, X_test, y_train, y_test, scores = model_results\n", + "\n", + "# y_actual is y_test (true labels)\n", + "y_actual = y_test\n", + "\n", + "# y_predicted is obtained by predicting on X_test\n", + "y_predicted = best_model.predict(X_test)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that `y_actual` and `y_predicted` have been extracted, let's pass these parameters to the `qq_and_residuals_plot` function." + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.HConcatChart(...)" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from linreg_ally.plotting import qq_and_residuals_plot\n", + "\n", + "qq_and_residuals_plot(y_actual, y_predicted)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Interpreting the Q-Q Plot\n", + "\n", + "If the Q-Q plot shows a significant deviation from the red dashed line (which represents perfect normality), the residuals are not normally distributed. In our plot, a few points deviate from the line at the tails, suggesting potential skewness or outliers. However, since these deviations are minor, we can conclude that the residuals are approximately normal." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Interpreting the Residuals vs. Fitted Values Plot\n", + "\n", + "For the homoscedasticity assumption to hold, residuals should be randomly scattered around the red dashed line in the Residuals vs. Fitted Values plot. This would indicate that residual variance remains constant across all fitted values (homoscedasticity).\n", + "\n", + "However, in our case, residuals cluster at different fitted value ranges, suggesting that the variance is not constant (heteroscedasticity)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Implications of Assumption Violations\n", + "\n", + "If the normality assumption is violated:\n", + "Ordinary Least Squares (OLS) regression still produces best linear unbiased estimates (BLUE) as long as independence and homoscedasticity hold. However, hypothesis tests and confidence intervals may be misleading if residuals deviate significantly from normality.\n", + "\n", + "If the homoscedasticity assumption is violated:\n", + "You can still fit a linear regression model, but you should interpret results with caution. The estimated coefficients remain unbiased, but standard errors and p-values become unreliable, affecting statistical inference." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "The `qq_and_residuals_plot` function is a valuable tool for assessing the normality and homoscedasticity assumptions in linear regression. If these assumptions are violated, you should consider corrective measures such as:\n", + "\n", + "- Transforming variables (e.g., logarithmic transformation),\n", + "- Using robust standard errors, or\n", + "- Exploring alternative models (e.g., weighted least squares, generalized least squares)." 
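"\n",
"As a rough sketch of the first option, the example below log-transforms the target and re-runs `run_linear_regression` on the same two columns used earlier. It is a minimal illustration only (the variable names are hypothetical, and it reuses exactly the arguments shown above):\n",
"\n",
"```python\n",
"import numpy as np\n",
"from vega_datasets import data\n",
"from linreg_ally.models import run_linear_regression\n",
"\n",
"# Reload the two columns used earlier and drop rows with missing values\n",
"cars = data.cars()[['Horsepower', 'Displacement']].dropna()\n",
"\n",
"# Log-transform the target; this often helps stabilise residual variance\n",
"cars['Log_Horsepower'] = np.log(cars['Horsepower'])\n",
"\n",
"log_model_results = run_linear_regression(\n",
"    dataframe=cars.drop(columns=['Horsepower']),\n",
"    target_column='Log_Horsepower',\n",
"    numeric_feats=['Displacement'],\n",
"    categorical_feats=[],\n",
"    drop_feats=None,\n",
"    random_state=123\n",
")\n",
"```\n",
"\n",
"After refitting on the transformed scale, `qq_and_residuals_plot` can be rerun on the new predictions to check whether the residuals look closer to normal and show a more constant spread. If a transformation is not appropriate, robust standard errors or weighted least squares from a library such as `statsmodels` are common alternatives."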
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 4 } From fd0a46404c5223cf7ab14c1b503eb0c71cc0c6d2 Mon Sep 17 00:00:00 2001 From: Merari Santana Date: Fri, 24 Jan 2025 21:55:18 -0800 Subject: [PATCH 2/4] added comment to delete my first code chunk if Cheng uses the same arguments for his run_linear_regression function --- docs/example.ipynb | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/example.ipynb b/docs/example.ipynb index ecd5d2e..f830e37 100644 --- a/docs/example.ipynb +++ b/docs/example.ipynb @@ -88,6 +88,7 @@ } ], "source": [ + "# DELETE THIS CODE CHUNK IF CHENG USES THE SAME ARGUMENTS FOR HIS RUN_LINEAR_REGRESSION FUNCTION\n", "# Merari - plot (using Cheng's function outputs)\n", "from vega_datasets import data\n", "from linreg_ally.models import run_linear_regression\n", From eddb01dbfdcda9da1091f6b20d51b75e7499a95c Mon Sep 17 00:00:00 2001 From: Merari Santana Date: Sat, 25 Jan 2025 00:02:57 -0800 Subject: [PATCH 3/4] modified my code and explanations to fit with Chengs outputs. Plots should show up once integrated with Chengs code blocks --- docs/example.ipynb | 141 +++++---------------------------------------- 1 file changed, 15 insertions(+), 126 deletions(-) diff --git a/docs/example.ipynb b/docs/example.ipynb index f830e37..96da9c3 100644 --- a/docs/example.ipynb +++ b/docs/example.ipynb @@ -71,49 +71,6 @@ "# Cheng - model fitting" ] }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model Summary\n", - "------------------------\n", - "Test r2: 0.785\n", - "Test neg_mean_squared_error: 345.987\n" - ] - } - ], - "source": [ - "# DELETE THIS CODE CHUNK IF CHENG USES THE SAME ARGUMENTS FOR HIS RUN_LINEAR_REGRESSION FUNCTION\n", - "# Merari - plot (using Cheng's function outputs)\n", - "from vega_datasets import data\n", - "from linreg_ally.models import run_linear_regression\n", - "\n", - "df = data.cars()\n", - "df = df[['Horsepower', 'Displacement']].dropna()\n", - "\n", - "# Define parameters for run_linear_regression\n", - "dataframe = df\n", - "target_column = \"Horsepower\"\n", - "numeric_feats = [\"Displacement\"] \n", - "categorical_feats = [] # No categorical features in this case\n", - "drop_feats = None # No columns to drop\n", - "random_state = 123\n", - "\n", - "model_results = run_linear_regression(\n", - " dataframe=dataframe,\n", - " target_column=target_column,\n", - " numeric_feats=numeric_feats,\n", - " categorical_feats=categorical_feats,\n", - " drop_feats=drop_feats,\n", - " random_state=random_state\n", - ")\n" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -139,14 +96,13 @@ "metadata": {}, "outputs": [], "source": [ - "# Unpack the returned values correctly\n", - "best_model, X_train, X_test, y_train, y_test, scores = model_results\n", + "#Using Cheng's outputs! 
Tested my `qq_and_residuals_plot` function on his branch\n", "\n", "# y_actual is y_test (true labels)\n", "y_actual = y_test\n", "\n", "# y_predicted is obtained by predicting on X_test\n", - "y_predicted = best_model.predict(X_test)\n" + "y_predicted = best_model.predict(X_test)" ] }, { @@ -158,91 +114,24 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 2, "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.HConcatChart(...)" - ] - }, - "execution_count": 50, - "metadata": {}, - "output_type": "execute_result" + "ename": "NameError", + "evalue": "name 'y_actual' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[2], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mlinreg_ally\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mplotting\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m qq_and_residuals_plot\n\u001b[0;32m----> 3\u001b[0m qq_and_residuals_plot(\u001b[43my_actual\u001b[49m, y_predicted)\n", + "\u001b[0;31mNameError\u001b[0m: name 'y_actual' is not defined" + ] } ], "source": [ + "#move this import to the top \n", + "\n", "from linreg_ally.plotting import qq_and_residuals_plot\n", "\n", "qq_and_residuals_plot(y_actual, y_predicted)" @@ -265,7 +154,7 @@ "\n", "For the homoscedasticity assumption to hold, residuals should be randomly scattered around the red dashed line in the Residuals vs. Fitted Values plot. This would indicate that residual variance remains constant across all fitted values (homoscedasticity).\n", "\n", - "However, in our case, residuals cluster at different fitted value ranges, suggesting that the variance is not constant (heteroscedasticity)." + "However, in our case, the residuals cluster at different fitted value ranges, and the spread increases as the fitted values increase, suggesting that the variance is not constant (heteroscedasticity)." ] }, { From f5c1ce8916aacd90f7258244fee498fd5e6dd242 Mon Sep 17 00:00:00 2001 From: Merari Santana Date: Sat, 25 Jan 2025 18:00:04 -0800 Subject: [PATCH 4/4] merged Chengs changes to my example.ipynb. His outputs are my inputs. --- docs/example.ipynb | 855 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 835 insertions(+), 20 deletions(-) diff --git a/docs/example.ipynb b/docs/example.ipynb index 96da9c3..7e1b572 100644 --- a/docs/example.ipynb +++ b/docs/example.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -30,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -46,7 +46,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -55,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -64,13 +64,763 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Cheng - model fitting" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Running Linear Regression Tutorial\n", + "\n", + "In this tutorial, you will learn a streamlined way to preprocess data, run linear regression and output with scoring metrics.\n", + "\n", + "First, ensure you have the `models` package imported." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from linreg_ally.models import run_linear_regression" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will be using the `cars` dataset provided by `vega_datasets`. This dataset contains various features related to cars, including both numerical and categorical variables, making it ideal for demonstrating the full capabilities of our linear regression function." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
[HTML table rendering omitted; the text/plain preview below shows the same first five rows]
" ], "text/plain": [ "                        Name  Miles_per_Gallon  Cylinders  Displacement  \\\n", "0  chevrolet chevelle malibu              18.0          8         307.0   \n", "1          buick skylark 320              15.0          8         350.0   \n", "2         plymouth satellite              18.0          8         318.0   \n", "3              amc rebel sst              16.0          8         304.0   \n", "4                ford torino              17.0          8         302.0   \n", "\n", "   Horsepower  Weight_in_lbs  Acceleration       Year Origin  \n", "0       130.0           3504          12.0 1970-01-01    USA  \n", "1       165.0           3693          11.5 1970-01-01    USA  \n", "2       150.0           3436          11.0 1970-01-01    USA  \n", "3       150.0           3433          12.0 1970-01-01    USA  \n", "4       140.0           3449          10.5 1970-01-01    USA  " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from vega_datasets import data\n", "\n", "df = data.cars()\n", "df.head()" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "As shown above, the dataset includes data about different car models, featuring attributes such as `Miles_per_Gallon`, `Cylinders`, and `Displacement`. We will utilize these attributes to build a linear regression model, predicting the target variable `Horsepower`.\n", "\n", "We will first perform some data cleaning by dropping rows that contain `NA` values." ] },
{ "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "df = df.dropna()" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "With the dataset loaded, you're all set to move forward to the next step: using our package's `run_linear_regression` function to prepare the data, fit a model, and evaluate its performance.\n", "\n", "We will specify the `target_column`, `numeric_feats`, `categorical_feats` and `drop_feats`. In this case, `target_column` will be `Horsepower` since we are trying to predict its value. `numeric_feats` will be all the numeric features that we want to scale using scikit-learn's `StandardScaler`. `categorical_feats` will be the categorical features (in this case only `Origin`) that we want to one-hot encode using scikit-learn's `OneHotEncoder`. `drop_feats` will be the columns that we do not want to include in the analysis, which in this case is `Name` since it does not provide any meaningful information for the analysis.\n", "\n", "For the `scoring_metrics`, we will specify `r2` to evaluate the performance of the model on test data."
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model Summary\n", + "------------------------\n", + "Test r2: 0.846\n" + ] + } + ], + "source": [ + "from vega_datasets import data\n", + "from linreg_ally.models import run_linear_regression\n", + "\n", + "df = data.cars()\n", + "df = df.dropna()\n", + "\n", + "# Define parameters for run_linear_regression\n", + "target_column = \"Horsepower\"\n", + "numeric_feats = [\"Miles_per_Gallon\", \"Cylinders\", \"Displacement\", \"Weight_in_lbs\", \"Acceleration\"] \n", + "categorical_feats = [\"Origin\"]\n", + "drop_feats = [\"Name\"]\n", + "random_state = 123\n", + "scoring_metrics = [\"r2\"]\n", + "\n", + "best_model, X_train, X_test, y_train, y_test, scores = run_linear_regression(\n", + " dataframe=df,\n", + " target_column=target_column,\n", + " numeric_feats=numeric_feats,\n", + " categorical_feats=categorical_feats,\n", + " drop_feats=drop_feats,\n", + " random_state=random_state,\n", + " scoring_metrics=scoring_metrics\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`best_model` provides a visual summary of the steps used in the entire linear regression pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
[scikit-learn HTML pipeline diagram omitted; the text/plain representation below shows the same pipeline]
" + ], + "text/plain": [ + "Pipeline(steps=[('preprocessor',\n", + " ColumnTransformer(transformers=[('standardscaler',\n", + " StandardScaler(),\n", + " ['Miles_per_Gallon',\n", + " 'Cylinders', 'Displacement',\n", + " 'Weight_in_lbs',\n", + " 'Acceleration']),\n", + " ('onehotencoder',\n", + " OneHotEncoder(), ['Origin']),\n", + " ('drop', 'drop', ['Name'])])),\n", + " ('model', LinearRegression())])" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "best_model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Scores give the R² and negative mean squared error scores that we are interested in finding out in order to understand how the model performs on the test data." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'r2': 0.8463952369304465}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scores" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As shown above, an R² score of 85% indicates that 85% of the variance in the dependent variable can be explained by the independent variables included in the model, showing that the model provides a good fit to the data.\n", + "\n", + "However, R² alone does not tell the whole story, for example if there might be multicollinearity or other issues. You might also want to consider other metrics like Mean Squared Error (MSE), Root Mean Squared Error (RMSE), or visually inspect residual plots to gain a more comprehensive understanding of model performance." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is the end of this tutorial where you have seen how we use the `run_linear_regression` function in our package to preprocess data, run linear regression and output with scoring metrics." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -92,12 +842,10 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ - "#Using Cheng's outputs! Tested my `qq_and_residuals_plot` function on his branch\n", - "\n", "# y_actual is y_test (true labels)\n", "y_actual = y_test\n", "\n", @@ -114,24 +862,91 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 13, "metadata": {}, "outputs": [ { - "ename": "NameError", - "evalue": "name 'y_actual' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[2], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mlinreg_ally\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mplotting\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m qq_and_residuals_plot\n\u001b[0;32m----> 3\u001b[0m qq_and_residuals_plot(\u001b[43my_actual\u001b[49m, y_predicted)\n", - "\u001b[0;31mNameError\u001b[0m: name 'y_actual' is not defined" - ] + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.HConcatChart(...)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "#move this import to the top \n", - "\n", "from linreg_ally.plotting import qq_and_residuals_plot\n", "\n", "qq_and_residuals_plot(y_actual, y_predicted)"