UBC-MDS · jim-franklin · Feb 1, 2025 · Feb 1, 2025 · Feb 1, 2025 · Feb 1, 2025
diff --git a/docs/example.ipynb b/docs/example.ipynb
@@ -26,7 +26,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -50,7 +50,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -154,7 +154,7 @@
        "4     Geramny  "
       ]
      },
-     "execution_count": 2,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -185,7 +185,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -251,7 +251,7 @@
        "average_revenue_per_customer  1778.571429"
       ]
      },
-     "execution_count": 3,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -266,13 +266,13 @@
    "source": [
     "## Get Revenue Share for each Product Category\n",
     "\n",
-    "Another feature of `saleanalyzer`, the `segment_revenue_share()` function, segments products into three categories (cheap < medium < expensive) — based on their price, and calculates the respective share of total revenue contributed by each segment. By default, the price thresholds are set automatically, but users can define custom thresholds to categorize products according to their specific business needs. This function is particularly useful for analyzing product sales data and understanding revenue distribution across different pricing tiers.\n",
+    "Another feature of `salesanalyzer_mds`, the `segment_revenue_share()` function, segments products into three categories (cheap < medium < expensive) — based on their price, and calculates the respective share of total revenue contributed by each segment. By default, the price thresholds are set automatically, but users can define custom thresholds to categorize products according to their specific business needs. This function is particularly useful for analyzing product sales data and understanding revenue distribution across different pricing tiers.\n",
     "> Use help(sales_summary_statistics) for more information about the function"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -331,7 +331,7 @@
        "2    expensive          7000             56.22"
       ]
      },
-     "execution_count": 4,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -344,7 +344,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
@@ -403,7 +403,7 @@
        "2    expensive          7700             61.85"
       ]
      },
-     "execution_count": 11,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -420,23 +420,28 @@
    "source": [
     "## Predict Future Sales\n",
     "\n",
-    "Now that you have a good summary of your **past** sales, say, you want to peek into the **future** and predict how your products will sell in a month, 2 months or even a year? You can do this with `predict_sales()` function. This function uses a Random Forest machine learning model to make predictions on your specified target (e.g. quantity sold). The output will be a data frame with predicted values, and the model's performance score (Mean Squared Error).\n",
+    "Now that you have a good summary of your **past** sales, say, you want to peek into the **future** and predict how your products will sell in a month, 2 months or even a year? You can do this with the `predict_sales()` function. This function uses a Random Forest machine learning model to make predictions on your specified target (e.g. quantity sold). The output will be a data frame with predicted values, along with the model's printed performance scores (Mean Squared Error and R Squared).\n",
     "\n",
     "> **Important** <br>\n",
     "> `predict_sales()` checks for duplicate entries, and only considers unique data points <br>\n",
-    "> By default the function uses 70% data for training and 30% for testing, to change that you can pass test_size = 0.2 increase the ratio, if your data size is small "
+    "> By default the function uses 70% of the data for training and 30% for testing. To change the split, pass a different value such as `test_size = 0.2`, which increases the training share — useful if your data size is small. \n",
+    "<br><br>\n",
+    "> **Model Performance Scores:**<br>\n",
+    "> - Mean Squared Error: average squared difference between predicted values and the actual values \n",
+    "> - Coefficient of Determination $(R^2)$: how well the observed results are reproduced by the model, measured as the proportion of the total variation in the results that the model explains.\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "MSE of the model: 6.7\n"
+      "MSE of the model: 6.7\n",
+      "R_squared of the model: -6.54\n"
      ]
     },
     {
@@ -482,7 +487,7 @@
        "1              1.33"
       ]
      },
-     "execution_count": 6,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -514,14 +519,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "MSE of the model: 1.72\n"
+      "MSE of the model: 1.72\n",
+      "R_squared of the model: 0.0\n"
      ]
     },
     {
@@ -567,7 +573,7 @@
        "1              1.88"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 14,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -586,7 +592,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "salesanalyzer",
+   "display_name": "salesanalyzser",
    "language": "python",
    "name": "python3"
   },
@@ -600,7 +606,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.9"
+   "version": "3.12.2"
   }
  },
  "nbformat": 4,

diff --git a/src/salesanalyzer_mds/predict_sales.py b/src/salesanalyzer_mds/predict_sales.py
@@ -4,7 +4,7 @@
 from sklearn.compose import make_column_transformer
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.ensemble import RandomForestRegressor
-from sklearn.metrics import mean_squared_error
+from sklearn.metrics import mean_squared_error, r2_score
 
 def predict_sales(sales_data, new_data, numeric_features, categorical_features, target, date_feature=None, test_size=0.3):
     """
@@ -32,7 +32,7 @@ def predict_sales(sales_data, new_data, numeric_features, categorical_features,
     Returns:
     --------
     pd.DataFrame:
-        A data frame with prediction values, and a printed out MSE score.
+        A data frame with prediction values, and a printed out MSE score and R^2 score.
 
     Examples:
     ---------
@@ -99,10 +99,13 @@ def predict_sales(sales_data, new_data, numeric_features, categorical_features,
 
     y_pred = model.predict(X_test)
     mse = mean_squared_error(y_test, y_pred)
+    r2 = r2_score(y_test, y_pred)
 
     new_pred = model.predict(X_new)
 
     print("MSE of the model:", round(mse, 2))
+    print("R_squared of the model:", round(r2, 2))
+
     result = pd.DataFrame({
         "Predicted values": [round(value, 2) for value in new_pred]
     })

diff --git a/tests/test_data.csv b/tests/test_data.csv
diff --git a/tests/test_predict_sales.py b/tests/test_predict_sales.py
@@ -2,15 +2,33 @@
 import pytest
 import pandas as pd
 
-test_data = pd.read_csv('tests/test_data.csv')
-test_new_data = test_data.copy()
+@pytest.fixture
+def test_data():
+    """Sample data for function testing"""
+    test_data = pd.DataFrame({
+    "product_name": [
+        "Laptop", "Monitor", "Headphones", "Laptop", "Headphones", "Laptop", "Monitor"
+    ],
+    "unit_price": [1200, 800, 150, 3000, 200, 2000, 500],
+    "invoice_date": ['2023-11-03', '2024-12-08', '2024-06-26', '2024-05-11', '2024-02-14', '2024-01-20', '2024-05-30'],
+    "city": ['Vancouver', 'Toronto', 'Calgary', 'Vancouver', 'Calgary', 'Vancouver', 'Toronto'],
+    "quantity": [1, 2, 2, 4, 4, 2, 5]
+    })
+    return test_data
+
+
+@pytest.fixture
+def test_new_data(test_data):
+    """Creates a copy of test data to use for new predictions"""
+    return test_data.copy()
+
 valid_cat_features = ['product_name', 'city']
 valid_num_features = ['unit_price']
 valid_date_feature = 'invoice_date'
 valid_target = 'quantity'
 
 
-def test_input_type():
+def test_input_type(test_data, test_new_data):
     """Test that predict_sales() detects the wrong input data types correctly"""
     with pytest.raises(ValueError, match="sales_data parameter should be a pandas DataFrame"):
         predict_sales("not_a_dataframe", test_new_data, valid_num_features,
@@ -41,7 +59,7 @@ def test_input_type():
                       valid_cat_features, valid_target, valid_date_feature)
 
 
-def test_output_no_date_feature():
+def test_output_no_date_feature(test_data, test_new_data):
     """Tests that predict_sales() round and returns a dictionary without a date feature"""
     result = predict_sales(test_data, test_new_data, valid_num_features, 
                            valid_cat_features, valid_target)
@@ -50,7 +68,7 @@ def test_output_no_date_feature():
     assert all(abs(value - round(value, 2)) < 1e-6 for value in result["Predicted values"])
 
 
-def test_output_with_date_feature():
+def test_output_with_date_feature(test_data, test_new_data):
     """Tests that predict_sales() round and returns a dictionary with a date feature"""
     result = predict_sales(test_data, test_new_data, valid_num_features, 
                            valid_cat_features, valid_target, valid_date_feature)
@@ -59,7 +77,7 @@ def test_output_with_date_feature():
     assert all(abs(value - round(value, 2)) < 1e-6 for value in result["Predicted values"])
 
 
-def test_missing_input():
+def test_missing_input(test_data):
     """Tests if predict_sales() raises a ValueError when there is missing input"""
     with pytest.raises(Exception):
-        predict_sales(test_data, new_data)
+        predict_sales(test_data)