From 1162e45ba98650fa0db7dd7f5570a8e111579079 Mon Sep 17 00:00:00 2001 From: Daria Date: Sat, 1 Feb 2025 14:55:24 -0800 Subject: [PATCH 1/4] fix: feedback addressed by DongchunChen, add other metrics to predict_sales --- src/salesanalyzer_mds/predict_sales.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/salesanalyzer_mds/predict_sales.py b/src/salesanalyzer_mds/predict_sales.py index 5a1fc7a..1347a02 100644 --- a/src/salesanalyzer_mds/predict_sales.py +++ b/src/salesanalyzer_mds/predict_sales.py @@ -4,7 +4,7 @@ from sklearn.compose import make_column_transformer from sklearn.preprocessing import OneHotEncoder from sklearn.ensemble import RandomForestRegressor -from sklearn.metrics import mean_squared_error +from sklearn.metrics import mean_squared_error, r2_score def predict_sales(sales_data, new_data, numeric_features, categorical_features, target, date_feature=None, test_size=0.3): """ @@ -32,7 +32,7 @@ def predict_sales(sales_data, new_data, numeric_features, categorical_features, Returns: -------- pd.DataFrame: - A data frame with prediction values, and a printed out MSE score. + A data frame with prediction values, and a printed out MSE score and R^2 score. 
Examples: --------- @@ -99,10 +99,13 @@ def predict_sales(sales_data, new_data, numeric_features, categorical_features, y_pred = model.predict(X_test) mse = mean_squared_error(y_test, y_pred) + r2 = r2_score(y_test, y_pred) new_pred = model.predict(X_new) print("MSE of the model:", round(mse, 2)) + print("R_squared of the model:", round(r2, 2)) + result = pd.DataFrame({ "Predicted values": [round(value, 2) for value in new_pred] }) From 93a1aa4f7e27fbb97b27401c5f9c94cfd614aa84 Mon Sep 17 00:00:00 2001 From: Daria Date: Sat, 1 Feb 2025 14:57:27 -0800 Subject: [PATCH 2/4] fix: feedback addressed by jenson-chang, #4 add pytest_fixture to test_predict_sales --- tests/test_predict_sales.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/tests/test_predict_sales.py b/tests/test_predict_sales.py index d698291..e597b0c 100644 --- a/tests/test_predict_sales.py +++ b/tests/test_predict_sales.py @@ -2,15 +2,33 @@ import pytest import pandas as pd -test_data = pd.read_csv('tests/test_data.csv') -test_new_data = test_data.copy() +@pytest.fixture +def test_data(): + """Sample data for function testing""" + test_data = pd.DataFrame({ + "product_name": [ + "Laptop", "Monitor", "Headphones", "Laptop", "Headphones", "Laptop", "Monitor" + ], + "unit_price": [1200, 800, 150, 3000, 200, 2000, 500], + "invoice_date": ['2023-11-03', '2024-12-08', '2024-06-26', '2024-05-11', '2024-02-14', '2024-01-20', '2024-05-30'], + "city": ['Vancouver', 'Toronto', 'Calgary', 'Vancouver', 'Calgary', 'Vancouver', 'Toronto'], + "quantity": [1, 2, 2, 4, 4, 2, 5] + }) + return test_data + + +@pytest.fixture +def test_new_data(test_data): + """Creates a copy of test data to use for new predictions""" + return test_data.copy() + valid_cat_features = ['product_name', 'city'] valid_num_features = ['unit_price'] valid_date_feature = 'invoice_date' valid_target = 'quantity' -def test_input_type(): +def test_input_type(test_data, test_new_data): """Test that 
predict_sales() detects the wrong input data types correctly""" with pytest.raises(ValueError, match="sales_data parameter should be a pandas DataFrame"): predict_sales("not_a_dataframe", test_new_data, valid_num_features, @@ -41,7 +59,7 @@ def test_input_type(): valid_cat_features, valid_target, valid_date_feature) -def test_output_no_date_feature(): +def test_output_no_date_feature(test_data, test_new_data): """Tests that predict_sales() round and returns a dictionary without a date feature""" result = predict_sales(test_data, test_new_data, valid_num_features, valid_cat_features, valid_target) @@ -50,7 +68,7 @@ def test_output_no_date_feature(): assert all(abs(value - round(value, 2)) < 1e-6 for value in result["Predicted values"]) -def test_output_with_date_feature(): +def test_output_with_date_feature(test_data, test_new_data): """Tests that predict_sales() round and returns a dictionary with a date feature""" result = predict_sales(test_data, test_new_data, valid_num_features, valid_cat_features, valid_target, valid_date_feature) @@ -59,7 +77,7 @@ def test_output_with_date_feature(): assert all(abs(value - round(value, 2)) < 1e-6 for value in result["Predicted values"]) -def test_missing_input(): +def test_missing_input(test_data): """Tests if predict_sales() raises a ValueError when there is missing input""" with pytest.raises(Exception): - predict_sales(test_data, new_data) + predict_sales(test_data) From 22e0b9803d48b0d7e6a314e4c43ce34efa5b8ba7 Mon Sep 17 00:00:00 2001 From: Daria Date: Sat, 1 Feb 2025 14:58:44 -0800 Subject: [PATCH 3/4] fix: feedback addressed by Lukman-Lateef, #3 added definitions of MSE to vignette --- docs/example.ipynb | 46 ++++++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/docs/example.ipynb b/docs/example.ipynb index 4361d41..7a68b31 100644 --- a/docs/example.ipynb +++ b/docs/example.ipynb @@ -26,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": 1, + 
"execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -50,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -154,7 +154,7 @@ "4 Geramny " ] }, - "execution_count": 2, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -185,7 +185,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -251,7 +251,7 @@ "average_revenue_per_customer 1778.571429" ] }, - "execution_count": 3, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -266,13 +266,13 @@ "source": [ "## Get Revenue Share for each Product Category\n", "\n", - "Another feature of `saleanalyzer`, the `segment_revenue_share()` function, segments products into three categories (cheap < medium < expensive) — based on their price, and calculates the respective share of total revenue contributed by each segment. By default, the price thresholds are set automatically, but users can define custom thresholds to categorize products according to their specific business needs. This function is particularly useful for analyzing product sales data and understanding revenue distribution across different pricing tiers.\n", + "Another feature of `salesanalyzer_mds`, the `segment_revenue_share()` function, segments products into three categories (cheap < medium < expensive) — based on their price, and calculates the respective share of total revenue contributed by each segment. By default, the price thresholds are set automatically, but users can define custom thresholds to categorize products according to their specific business needs. 
This function is particularly useful for analyzing product sales data and understanding revenue distribution across different pricing tiers.\n", "> Use help(sales_summary_statistics) for more information about the function" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -331,7 +331,7 @@ "2 expensive 7000 56.22" ] }, - "execution_count": 4, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -344,7 +344,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -403,7 +403,7 @@ "2 expensive 7700 61.85" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -420,23 +420,28 @@ "source": [ "## Predict Future Sales\n", "\n", - "Now that you have a good summary of your **past** sales, say, you want to peek into the **future** and predict how your products will sell in a month, 2 months or even a year? You can do this with `predict_sales()` function. This function uses a Random Forest machine learning model to make predictions on your specified target (e.g. quantity sold). The output will be a data frame with predicted values, and the model's performance score (Mean Squared Error).\n", + "Now that you have a good summary of your **past** sales, say, you want to peek into the **future** and predict how your products will sell in a month, 2 months or even a year? You can do this with `predict_sales()` function. This function uses a Random Forest machine learning model to make predictions on your specified target (e.g. quantity sold). The output will be a data frame with predicted values, and the model's performance score (Mean Squared Error and R Squared).\n", "\n", "> **Important**
\n", "> `predict_sales()` checks for duplicate entries, and only considers unique data points
\n", - "> By default the function uses 70% data for training and 30% for testing, to change that you can pass test_size = 0.2 increase the ratio, if your data size is small " + "> By default the function uses 70% of the data for training and 30% for testing; to change that, pass e.g. test_size = 0.2 to increase the training share if your data set is small \n", + "

\n", + "> **Model Performance Scores:**
\n", + "> - Mean Squared Error: average squared difference between predicted values and the actual values \n", + "> - Coefficient of Determination $(R^2)$: how well the observed results are reproduced by the model, measured as the proportion of the total variation in the results that the model explains.\n" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "MSE of the model: 6.7\n" + "MSE of the model: 6.7\n", + "R_squared of the model: -6.54\n" ] }, { @@ -482,7 +487,7 @@ "1 1.33" ] }, - "execution_count": 6, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -514,14 +519,15 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "MSE of the model: 1.72\n" + "MSE of the model: 1.72\n", + "R_squared of the model: 0.0\n" ] }, { @@ -567,7 +573,7 @@ "1 1.88" ] }, - "execution_count": 7, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -586,7 +592,7 @@ ], "metadata": { "kernelspec": { - "display_name": "salesanalyzer", + "display_name": "salesanalyzser", "language": "python", "name": "python3" }, @@ -600,7 +606,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.2" } }, "nbformat": 4, From f8b4fab69a22202939c8a7d97aa68475847d7302 Mon Sep 17 00:00:00 2001 From: Daria Date: Sat, 1 Feb 2025 15:00:00 -0800 Subject: [PATCH 4/4] fix: removed test_data.csv as per feedback from jenson-chang --- tests/test_data.csv | 8 -------- 1 file changed, 8 deletions(-) delete mode 100644 tests/test_data.csv diff --git a/tests/test_data.csv b/tests/test_data.csv deleted file mode 100644 index 39920d2..0000000 --- a/tests/test_data.csv +++ /dev/null @@ -1,8 +0,0 @@ -product_name,unit_price,invoice_date,city,quantity -Laptop,1200,2023-11-03,Vancouver,1 
-Monitor,800,2024-12-08,Toronto,2 -Headphones,150,2024-06-26,Calgary,2 -Laptop,3000,2024-05-11,Vancouver,4 -Headphones,200,2024-02-14,Calgary,4 -Laptop,2000,2024-01-20,Vancouver,2 -Monitor,500,2024-05-30,Toronto,5