From f5c1ce8916aacd90f7258244fee498fd5e6dd242 Mon Sep 17 00:00:00 2001 From: Merari Santana Date: Sat, 25 Jan 2025 18:00:04 -0800 Subject: [PATCH] merged Chengs changes to my example.ipynb. His outputs are my inputs. --- docs/example.ipynb | 855 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 835 insertions(+), 20 deletions(-) diff --git a/docs/example.ipynb b/docs/example.ipynb index 96da9c3..7e1b572 100644 --- a/docs/example.ipynb +++ b/docs/example.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -30,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -46,7 +46,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -55,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -64,13 +64,763 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Cheng - model fitting" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Running Linear Regression Tutorial\n", + "\n", + "In this tutorial, you will learn a streamlined way to preprocess data, run linear regression and output with scoring metrics.\n", + "\n", + "First, ensure you have the `models` package imported." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from linreg_ally.models import run_linear_regression" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will be using the `cars` dataset provided by `vega_datasets`. 
This dataset contains various features related to cars, including both numerical and categorical variables, making it ideal for demonstrating the full capabilities of our linear regression function." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameMiles_per_GallonCylindersDisplacementHorsepowerWeight_in_lbsAccelerationYearOrigin
0chevrolet chevelle malibu18.08307.0130.0350412.01970-01-01USA
1buick skylark 32015.08350.0165.0369311.51970-01-01USA
2plymouth satellite18.08318.0150.0343611.01970-01-01USA
3amc rebel sst16.08304.0150.0343312.01970-01-01USA
4ford torino17.08302.0140.0344910.51970-01-01USA
\n", + "
" + ], + "text/plain": [ + " Name Miles_per_Gallon Cylinders Displacement \\\n", + "0 chevrolet chevelle malibu 18.0 8 307.0 \n", + "1 buick skylark 320 15.0 8 350.0 \n", + "2 plymouth satellite 18.0 8 318.0 \n", + "3 amc rebel sst 16.0 8 304.0 \n", + "4 ford torino 17.0 8 302.0 \n", + "\n", + " Horsepower Weight_in_lbs Acceleration Year Origin \n", + "0 130.0 3504 12.0 1970-01-01 USA \n", + "1 165.0 3693 11.5 1970-01-01 USA \n", + "2 150.0 3436 11.0 1970-01-01 USA \n", + "3 150.0 3433 12.0 1970-01-01 USA \n", + "4 140.0 3449 10.5 1970-01-01 USA " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from vega_datasets import data\n", + "\n", + "df = data.cars()\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As shown above, the dataset includes data about different car models, featuring attributes such as `Miles_per_Gallon`, `Cylinders`, `Displacement`, etc. We will utilize these attributes to build a linear regression model, predicting the target variable `Horsepower`.\n", + "\n", + "We will first perform some data cleaning by removing rows that contain `NA` values." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "df = df[['Horsepower', 'Displacement']].dropna()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With the dataset loaded, you're all set to move forward to the next step: using our package's `run_linear_regression` function to prepare the data, fit a model, and evaluate its performance.\n", + "\n", + "We will specify the `target_column`, `numeric_feats`, `categorical_feats` and `drop_feats`. In this case, `target_column` will be `Horsepower` since we are trying to predict its value. `numeric_feats` will be all the numeric features that we want to scale using scikit-learn's `StandardScaler`. 
`categorical_feats` will be the categorical features (in this case only `Origin`) that we want to one-hot encode using scikit-learn's `OneHotEncoder`. `drop_feats` will be the columns that we do not want to include in the analysis, which in this case is `Name`, since it does not provide any meaningful information to the analysis.\n", + "\n", + "For the `scoring_metrics`, we will specify `r2` to evaluate the performance of the model on test data." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model Summary\n", + "------------------------\n", + "Test r2: 0.846\n" + ] + } + ], + "source": [ + "from vega_datasets import data\n", + "from linreg_ally.models import run_linear_regression\n", + "\n", + "df = data.cars()\n", + "df = df.dropna()\n", + "\n", + "# Define parameters for run_linear_regression\n", + "target_column = \"Horsepower\"\n", + "numeric_feats = [\"Miles_per_Gallon\", \"Cylinders\", \"Displacement\", \"Weight_in_lbs\", \"Acceleration\"] \n", + "categorical_feats = [\"Origin\"]\n", + "drop_feats = [\"Name\"]\n", + "random_state = 123\n", + "scoring_metrics = [\"r2\"]\n", + "\n", + "best_model, X_train, X_test, y_train, y_test, scores = run_linear_regression(\n", + " dataframe=df,\n", + " target_column=target_column,\n", + " numeric_feats=numeric_feats,\n", + " categorical_feats=categorical_feats,\n", + " drop_feats=drop_feats,\n", + " random_state=random_state,\n", + " scoring_metrics=scoring_metrics\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`best_model` provides a visual summary of the steps used in the entire linear regression pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Pipeline(steps=[('preprocessor',\n",
+       "                 ColumnTransformer(transformers=[('standardscaler',\n",
+       "                                                  StandardScaler(),\n",
+       "                                                  ['Miles_per_Gallon',\n",
+       "                                                   'Cylinders', 'Displacement',\n",
+       "                                                   'Weight_in_lbs',\n",
+       "                                                   'Acceleration']),\n",
+       "                                                 ('onehotencoder',\n",
+       "                                                  OneHotEncoder(), ['Origin']),\n",
+       "                                                 ('drop', 'drop', ['Name'])])),\n",
+       "                ('model', LinearRegression())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "Pipeline(steps=[('preprocessor',\n", + " ColumnTransformer(transformers=[('standardscaler',\n", + " StandardScaler(),\n", + " ['Miles_per_Gallon',\n", + " 'Cylinders', 'Displacement',\n", + " 'Weight_in_lbs',\n", + " 'Acceleration']),\n", + " ('onehotencoder',\n", + " OneHotEncoder(), ['Origin']),\n", + " ('drop', 'drop', ['Name'])])),\n", + " ('model', LinearRegression())])" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "best_model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`scores` holds the metrics that we requested through `scoring_metrics` (here, only the R² score), which tell us how the model performs on the test data." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'r2': 0.8463952369304465}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scores" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As shown above, an R² score of about 0.85 indicates that roughly 85% of the variance in the dependent variable on the test data can be explained by the independent variables included in the model, showing that the model provides a good fit to the data.\n", + "\n", + "However, R² alone does not tell the whole story; for example, it cannot reveal multicollinearity or other issues. You might also want to consider other metrics like Mean Squared Error (MSE), Root Mean Squared Error (RMSE), or visually inspect residual plots to gain a more comprehensive understanding of model performance." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is the end of this tutorial, in which you have seen how to use the `run_linear_regression` function in our package to preprocess data, run linear regression, and report scoring metrics." 
+ ] + }, { "cell_type": "markdown", "metadata": {}, @@ -92,12 +842,10 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ - "#Using Cheng's outputs! Tested my `qq_and_residuals_plot` function on his branch\n", - "\n", "# y_actual is y_test (true labels)\n", "y_actual = y_test\n", "\n", @@ -114,24 +862,91 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 13, "metadata": {}, "outputs": [ { - "ename": "NameError", - "evalue": "name 'y_actual' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[2], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mlinreg_ally\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mplotting\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m qq_and_residuals_plot\n\u001b[0;32m----> 3\u001b[0m qq_and_residuals_plot(\u001b[43my_actual\u001b[49m, y_predicted)\n", - "\u001b[0;31mNameError\u001b[0m: name 'y_actual' is not defined" - ] + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.HConcatChart(...)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "#move this import to the top \n", - "\n", "from linreg_ally.plotting import qq_and_residuals_plot\n", "\n", "qq_and_residuals_plot(y_actual, y_predicted)"