Skip to content

Commit

Permalink
Merge pull request #77 from UBC-MDS/test_summary_doc
Browse files Browse the repository at this point in the history
Test summary docs and readme updated
  • Loading branch information
yasmin2424 authored Feb 2, 2025
2 parents bc43463 + 90e2795 commit 902db6c
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 57 deletions.
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@ $ pip install tidylinreg

The `tidylinreg` package fits a linear model to a dataset, and can be used to carry out regression.
`tidylinreg` computes and returns a list of summary statistics of the fitted linear model, including standard error, confidence intervals, and p-values.
These summary statistics are ouput as a [Pandas DataFrame](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html). This is advantageous as it allows for fast and convenient manipulation of large regression models,
These summary statistics are output as a [Pandas DataFrame](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html). This is advantageous as it allows for fast and convenient manipulation of large regression models,
where, for example, insignificant parameters can easily be filtered out!

## Functions
## Functions

`tidylinreg` is built around the `LinearModel` object, which offers three useful methods:

Expand All @@ -32,10 +32,10 @@ where, for example, insignificant parameters can easily be filtered out!
the object must be fitted to the data before anything else!
- Please be advised that at the current state of development, `fit` only accepts continuous regressors. If your data is categorical,
first transform it into dummy variables with an encoding technique such as [One-Hot Encoding](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html)
- Watch out for collinearity! `tidlinreg` will let you know if there is any linear dependence in your data
- Watch out for collinearity! `tidylinreg` will let you know if there is any linear dependence in your data
before fitting.
provided by Scikit-Learn.
- For convenience, the intercept is automatically included into the regression model. No need to modify your data to accomodate this!
- For convenience, the intercept is automatically included into the regression model. No need to modify your data to accommodate this!
- `predict`:
- Predict the response using given test regressor data. Remember to fit the model first!
- `summary`:
Expand Down Expand Up @@ -87,7 +87,7 @@ Once `tidylinreg` is installed, you can import the `LinearModel` object to begin
my_linear_model.summary(ci=True)
```

The default significance level is 0.05, giving 95% confidence intervals. We can change this by modifying the `alpha` arguument. For example, if we want wider 99% confidence intervals, we can set `alpha` to 0.01:
The default significance level is 0.05, giving 95% confidence intervals. We can change this by modifying the `alpha` argument. For example, if we want wider 99% confidence intervals, we can set `alpha` to 0.01:

```python
my_linear_model.summary(ci=True, alpha=0.01)
Expand Down
78 changes: 26 additions & 52 deletions tests/test_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,83 +4,64 @@
import statsmodels.api as sm
from tidylinreg.tidylinreg import LinearModel


@pytest.fixture
def linear_model():
    """Provide a newly constructed ``LinearModel`` that has not been fitted."""
    model = LinearModel()
    return model


@pytest.fixture
def example_data():
    """Build a small regression dataset: ``y = 3x + 2`` plus Gaussian noise.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is a one-column DataFrame named ``'x'`` with
        five rows and ``y`` is the noisy response computed from that column.
    """
    # Fixed seed so the noise (and therefore every dependent test) is repeatable.
    np.random.seed(524)
    features = pd.DataFrame({'x': [-2, -1, 0, 1, 2]})
    noise = np.random.normal(0, 0.5, features.shape[0])
    response = 3 * features.squeeze() + 2 + noise
    return features, response


def test_summary_no_fit(linear_model):
    """Check that calling summary() on an unfitted model raises ValueError."""
    expected_error = ValueError
    with pytest.raises(expected_error):
        linear_model.summary()


def test_summary_basic(linear_model, example_data):
    """Check that summary() on a fitted model yields a DataFrame with the core columns."""
    features, response = example_data
    linear_model.fit(features, response)
    result = linear_model.summary()

    assert isinstance(result, pd.DataFrame), "Summary output should be a DataFrame."

    required_columns = ("Parameter", "Estimate", "Std. Error", "T-Statistic", "P-Value")
    missing = [name for name in required_columns if name not in result.columns]
    assert not missing, "Summary should contain the expected columns."

def test_summary_with_ci(linear_model, example_data):
    """Check that summary(ci=True) adds the confidence-interval columns to the output."""
    features, response = example_data
    linear_model.fit(features, response)
    result = linear_model.summary(ci=True, alpha=0.05)

    assert isinstance(result, pd.DataFrame), "Summary output should be a DataFrame."

    required_columns = (
        "Parameter",
        "Estimate",
        "Std. Error",
        "T-Statistic",
        "P-Value",
        "CI Lower",
        "CI Upper",
    )
    missing = [name for name in required_columns if name not in result.columns]
    assert not missing, "Summary should include confidence intervals when requested."

def test_summary_output_without_ci(example_data):
    """Check the summary(ci=False) output: core columns present, CI columns absent."""
    features, response = example_data
    fitted = LinearModel()
    fitted.fit(features, response)
    # NOTE(review): these are invoked explicitly before summary(); whether
    # summary() depends on them is not visible from this test file.
    fitted.get_std_error()
    fitted.get_test_statistic()
    report = fitted.summary(ci=False)

    assert isinstance(report, pd.DataFrame), "Summary should return a DataFrame."

    # (column, failure message) pairs for columns that must be present.
    presence_checks = (
        ("Parameter", "Summary should include 'Parameter' column."),
        ("Estimate", "Summary should include 'Estimate' column."),
        ("Std. Error", "Summary should include 'Std. Error' column."),
        ("T-Statistic", "Summary should include 't-value' column."),
        ("P-Value", "Summary should include 'p-value' column."),
    )
    for column, message in presence_checks:
        assert column in report.columns, message

    for bound in ("CI Lower", "CI Upper"):
        assert bound not in report.columns, "Confidence intervals should not be included when ci=False."

def test_summary_invalid_alpha(linear_model, example_data):
"""Test that an error is raised for invalid alpha values."""
"""Ensure summary() raises errors for invalid alpha values."""
X, y = example_data
linear_model.fit(X, y)

with pytest.raises(TypeError):
linear_model.summary(ci=True, alpha="hello")
linear_model.summary(ci=True, alpha="hello")
with pytest.raises(ValueError):
linear_model.summary(ci=True, alpha=0)
with pytest.raises(ValueError):
Expand All @@ -90,23 +71,16 @@ def test_summary_invalid_alpha(linear_model, example_data):
with pytest.raises(ValueError):
linear_model.summary(ci=True, alpha=1.1)


def test_summary_values(linear_model, example_data):
    """Compare summary() coefficient estimates against a statsmodels OLS fit.

    The model is fitted on the example data and its ``"Estimate"`` column is
    checked against ``statsmodels`` parameters within an absolute tolerance
    of 0.1. Only the estimates are compared; p-values are not checked here.
    """
    X, y = example_data
    linear_model.fit(X, y)
    summary_df = linear_model.summary(ci=True, alpha=0.05)

    # statsmodels OLS does not add an intercept by itself, so prepend the
    # constant column to obtain intercept + slope for the comparison.
    reference = sm.OLS(y, sm.add_constant(X)).fit()
    expected_params = reference.params.values

    assert np.allclose(summary_df["Estimate"].values, expected_params, atol=0.1), \
        f"Parameter estimates are incorrect: {summary_df['Estimate'].values}."

0 comments on commit 902db6c

Please sign in to comment.