Write run_linear_regression function #18

Merged
merged 3 commits on Jan 16, 2025
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -9,6 +9,9 @@ readme = "README.md"
[tool.poetry.dependencies]
python = "^3.11"

[tool.poetry.group.dev.dependencies]
pytest = "^8.3.4"

[tool.semantic_release]
version_toml = [
"pyproject.toml:tool.poetry.version",
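
With pytest added to the dev dependency group, the new test suite can be run locally with Poetry, e.g. poetry install followed by poetry run pytest (assuming Poetry drives the workflow here, as this file suggests).
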
76 changes: 71 additions & 5 deletions src/linreg_ally/models.py
@@ -1,4 +1,13 @@
def run_linear_regression(dataframe):
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import get_scorer, get_scorer_names

def run_linear_regression(dataframe, target_column, numeric_feats, categorical_feats, drop_feats=None, test_size=0.2, random_state=None, scoring_metrics=['r2', 'neg_mean_squared_error']):
"""
    Performs linear regression with scikit-learn preprocessing and reports evaluation metrics on a held-out test split.

@@ -19,7 +28,7 @@ def run_linear_regression(dataframe):
random_state: `int`, optional
controls the shuffling applied to the data before the split (default None).
scoring_metrics: `list`, optional
scoring metrics to evaluate the model (default 'r2', 'mean_squared_error').
scoring metrics to evaluate the model (default 'r2', 'neg_mean_squared_error').

Returns
-------
@@ -44,9 +53,66 @@ def run_linear_regression(dataframe):
>>> categorical_feats = ['category']
>>> drop_feats = []
>>> best_model, X_train, X_test, y_train, y_test, scores = run_linear_regression(
... df, target_column, numeric_feats, categorical_feats, drop_feats, metrics=['r2', 'mean_squared_error']
... df, target_column, numeric_feats, categorical_feats, drop_feats, scoring_metrics=['r2', 'neg_mean_squared_error']
... )
>>> scores
{'r2': 0.52, 'mean_squared_error': 1.23}
    {'r2': 0.52, 'neg_mean_squared_error': -1.23}
"""
pass

    # --- Input validation ---
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("dataframe must be a pandas DataFrame.")

    if dataframe.shape[1] <= 1:
        raise ValueError("dataframe must contain more than one column.")

    if target_column not in dataframe.columns:
        raise ValueError(f"target_column '{target_column}' is not in the dataframe.")

    if not (0.0 < test_size < 1.0):
        raise ValueError("test_size must be between 0.0 and 1.0.")

    if random_state is not None and not isinstance(random_state, int):
        raise TypeError("random_state must be an integer.")

    if not isinstance(scoring_metrics, list) or not all(isinstance(metric, str) for metric in scoring_metrics):
        raise TypeError("scoring_metrics must be a list of strings.")

    invalid_metrics = [metric for metric in scoring_metrics if metric not in get_scorer_names()]
    if invalid_metrics:
        raise ValueError(f"The following scoring metrics are not valid: {', '.join(invalid_metrics)}")

    drop_feats = drop_feats if drop_feats is not None else []

    # Separate the features from the target.
    X = dataframe.drop(columns=[target_column])
    y = dataframe[target_column]

    # Scale numeric features, one-hot encode categorical features, and drop the listed columns.
    # Note: OneHotEncoder raises on categories unseen during fit unless handle_unknown="ignore" is set.
    preprocessor = make_column_transformer(
        (StandardScaler(), numeric_feats),
        (OneHotEncoder(), categorical_feats),
        ('drop', drop_feats)
    )

    # Chain preprocessing and the linear model so both are fit on the training split only.
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('model', LinearRegression())
    ])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    pipe.fit(X_train, y_train)

    best_model = pipe

    # Evaluate each requested metric on the test split via sklearn's scorer interface.
    # Scorers follow the greater-is-better convention, so error metrics such as
    # 'neg_mean_squared_error' are returned negated.
    scores = {}
    for metric in scoring_metrics:
        scorer = get_scorer(metric)
        scores[metric] = scorer(best_model, X_test, y_test)

    print("Model Summary")
    print("------------------------")
    for metric, score in scores.items():
        print(f"Test {metric}: {score:.3f}")

    return best_model, X_train, X_test, y_train, y_test, scores
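
For reference, a minimal end-to-end sketch of calling the new function and reusing the fitted pipeline it returns. The six-row frame and random_state=123 are illustrative values, not taken from this PR; the import path follows the test file below.

import pandas as pd
from linreg_ally.models import run_linear_regression

df = pd.DataFrame({
    "feature_1": [1, 2, 3, 4, 5, 6],
    "feature_2": [0.5, 0.1, 0.4, 0.9, 0.6, 0.7],
    "category": ["a", "b", "a", "b", "a", "b"],
    "target": [1.0, 2.5, 3.4, 4.3, 5.1, 6.2],
})

best_model, X_train, X_test, y_train, y_test, scores = run_linear_regression(
    df, "target",
    numeric_feats=["feature_1", "feature_2"],
    categorical_feats=["category"],
    random_state=123,
)

# best_model is a fitted sklearn Pipeline, so it can predict on new rows directly;
# the preprocessing step can also be inspected, e.g. via
# best_model.named_steps['preprocessor'].get_feature_names_out().
new_rows = pd.DataFrame({"feature_1": [7], "feature_2": [0.3], "category": ["a"]})
print(best_model.predict(new_rows))
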
90 changes: 90 additions & 0 deletions tests/test_models.py
@@ -0,0 +1,90 @@
from linreg_ally.models import run_linear_regression
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd

# Function to create a sample DataFrame for testing
def create_sample_dataframe():
    return pd.DataFrame({
        "feature_1": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        "feature_2": [0.5, 0.1, 0.4, 0.9, 0.6, 0.7, 0.3, 0.2, 0.8, 1.0],
        "category": ["a", "b", "a", "b", "c", "a", "b", "c", "a", "b"],
        "target": [1.0, 2.5, 3.4, 4.3, 5.1, 6.2, 7.1, 8.3, 9.5, 10.0]
    })

# Shared inputs for the tests below
df = create_sample_dataframe()
target_column = 'target'
numeric_feats = ['feature_1', 'feature_2']
categorical_feats = ['category']
drop_feats = []

# Test 1: Check if the function result has correct return type and values
def test_return_type_and_length():
result = run_linear_regression(df, target_column, numeric_feats, categorical_feats, drop_feats)
assert isinstance(result, tuple), "The return type should be a tuple."
assert len(result) == 6, "The tuple should have 6 elements."
assert isinstance(result[0], Pipeline), "The first element should be a Pipeline."
assert isinstance(result[1], pd.DataFrame), "The second element should be a DataFrame (X_train)."
assert isinstance(result[2], pd.DataFrame), "The third element should be a DataFrame (X_test)."
assert isinstance(result[3], pd.Series), "The fourth element should be a Series (y_train)."
assert isinstance(result[4], pd.Series), "The fifth element should be a Series (y_test)."
assert isinstance(result[5], dict), "The sixth element should be a dictionary (scores)."
assert "r2" in result[5], "r2 score should be in the scores dictionary."
assert "neg_mean_squared_error" in result[5], "neg_mean_squared_error score should be in the scores dictionary."

# Test 2: Check if TypeError is raised when a non-DataFrame is provided
def test_invalid_dataframe():
try:
run_linear_regression("not_a_dataframe", target_column, numeric_feats, categorical_feats, drop_feats)
except TypeError as e:
assert str(e) == "dataframe must be a pandas DataFrame."
else:
assert False, "TypeError not raised for non-DataFrame input."

# Test 3: Check if ValueError is raised when the target column is not in the DataFrame
def test_target_column_present():
try:
run_linear_regression(df.drop(columns=[target_column]), target_column, numeric_feats, categorical_feats, drop_feats)
except ValueError as e:
assert f"target_column '{target_column}' is not in the dataframe." in str(e)
else:
assert False, "ValueError not raised for missing target column."

# Test 4: Check if an invalid test_size raises a ValueError
def test_invalid_test_size():
try:
run_linear_regression(df, target_column, numeric_feats, categorical_feats, drop_feats, test_size=1.5)
except ValueError as e:
assert str(e) == "test_size must be between 0.0 and 1.0."
else:
assert False, "ValueError not raised for invalid test_size."

# Test 5: Check if an invalid random_state raises a TypeError
def test_invalid_random_state():
try:
run_linear_regression(df, target_column, numeric_feats, categorical_feats, drop_feats, random_state="not_an_int")
except TypeError as e:
assert str(e) == "random_state must be an integer."
else:
assert False, "TypeError not raised for non-integer random_state."

# Test 6: Check if invalid scoring_metrics raises a ValueError
def test_invalid_scoring_metric():
try:
run_linear_regression(df, target_column, numeric_feats, categorical_feats, drop_feats, scoring_metrics=['invalid_metric'])
except ValueError as e:
assert "are not valid" in str(e)
else:
assert False, "ValueError not raised for invalid scoring_metric."