-
-
Notifications
You must be signed in to change notification settings - Fork 46.1k
/
polynomial_regression.py
213 lines (170 loc) · 7.64 KB
/
polynomial_regression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
"""
Polynomial regression is a type of regression analysis that models the relationship
between a predictor x and the response y as an mth-degree polynomial:
y = β₀ + β₁x + β₂x² + ... + βₘxᵐ + ε
By treating x, x², ..., xᵐ as distinct variables, we see that polynomial regression is a
special case of multiple linear regression. Therefore, we can use ordinary least squares
(OLS) estimation to estimate the vector of model parameters β = (β₀, β₁, β₂, ..., βₘ)
for polynomial regression:
β = (XᵀX)⁻¹Xᵀy = X⁺y
where X is the design matrix, y is the response vector, and X⁺ denotes the Moore-Penrose
pseudoinverse of X. In the case of polynomial regression, the design matrix is
|1 x₁ x₁² ⋯ x₁ᵐ|
X = |1 x₂ x₂² ⋯ x₂ᵐ|
|⋮ ⋮ ⋮ ⋱ ⋮ |
|1 xₙ xₙ² ⋯ xₙᵐ|
In OLS estimation, inverting XᵀX to compute X⁺ can be very numerically unstable. This
implementation sidesteps this need to invert XᵀX by computing X⁺ using singular value
decomposition (SVD):
β = VΣ⁺Uᵀy
where UΣVᵀ is an SVD of X.
References:
- https://en.wikipedia.org/wiki/Polynomial_regression
- https://en.wikipedia.org/wiki/Moore%E2%80%93Penrose_inverse
- https://en.wikipedia.org/wiki/Numerical_methods_for_linear_least_squares
- https://en.wikipedia.org/wiki/Singular_value_decomposition
"""
import matplotlib.pyplot as plt
import numpy as np
class PolynomialRegression:
__slots__ = "degree", "params"
def __init__(self, degree: int) -> None:
"""
@raises ValueError: if the polynomial degree is negative
"""
if degree < 0:
raise ValueError("Polynomial degree must be non-negative")
self.degree = degree
self.params = None
@staticmethod
def _design_matrix(data: np.ndarray, degree: int) -> np.ndarray:
"""
Constructs a polynomial regression design matrix for the given input data. For
input data x = (x₁, x₂, ..., xₙ) and polynomial degree m, the design matrix is
the Vandermonde matrix
|1 x₁ x₁² ⋯ x₁ᵐ|
X = |1 x₂ x₂² ⋯ x₂ᵐ|
|⋮ ⋮ ⋮ ⋱ ⋮ |
|1 xₙ xₙ² ⋯ xₙᵐ|
Reference: https://en.wikipedia.org/wiki/Vandermonde_matrix
@param data: the input predictor values x, either for model fitting or for
prediction
@param degree: the polynomial degree m
@returns: the Vandermonde matrix X (see above)
@raises ValueError: if input data is not N x 1
>>> x = np.array([0, 1, 2])
>>> PolynomialRegression._design_matrix(x, degree=0)
array([[1],
[1],
[1]])
>>> PolynomialRegression._design_matrix(x, degree=1)
array([[1, 0],
[1, 1],
[1, 2]])
>>> PolynomialRegression._design_matrix(x, degree=2)
array([[1, 0, 0],
[1, 1, 1],
[1, 2, 4]])
>>> PolynomialRegression._design_matrix(x, degree=3)
array([[1, 0, 0, 0],
[1, 1, 1, 1],
[1, 2, 4, 8]])
>>> PolynomialRegression._design_matrix(np.array([[0, 0], [0 , 0]]), degree=3)
Traceback (most recent call last):
...
ValueError: Data must have dimensions N x 1
"""
rows, *remaining = data.shape
if remaining:
raise ValueError("Data must have dimensions N x 1")
return np.vander(data, N=degree + 1, increasing=True)
def fit(self, x_train: np.ndarray, y_train: np.ndarray) -> None:
"""
Computes the polynomial regression model parameters using ordinary least squares
(OLS) estimation:
β = (XᵀX)⁻¹Xᵀy = X⁺y
where X⁺ denotes the Moore-Penrose pseudoinverse of the design matrix X. This
function computes X⁺ using singular value decomposition (SVD).
References:
- https://en.wikipedia.org/wiki/Moore%E2%80%93Penrose_inverse
- https://en.wikipedia.org/wiki/Singular_value_decomposition
- https://en.wikipedia.org/wiki/Multicollinearity
@param x_train: the predictor values x for model fitting
@param y_train: the response values y for model fitting
@raises ArithmeticError: if X isn't full rank, then XᵀX is singular and β
doesn't exist
>>> x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
>>> y = x**3 - 2 * x**2 + 3 * x - 5
>>> poly_reg = PolynomialRegression(degree=3)
>>> poly_reg.fit(x, y)
>>> poly_reg.params
array([-5., 3., -2., 1.])
>>> poly_reg = PolynomialRegression(degree=20)
>>> poly_reg.fit(x, y)
Traceback (most recent call last):
...
ArithmeticError: Design matrix is not full rank, can't compute coefficients
Make sure errors don't grow too large:
>>> coefs = np.array([-250, 50, -2, 36, 20, -12, 10, 2, -1, -15, 1])
>>> y = PolynomialRegression._design_matrix(x, len(coefs) - 1) @ coefs
>>> poly_reg = PolynomialRegression(degree=len(coefs) - 1)
>>> poly_reg.fit(x, y)
>>> np.allclose(poly_reg.params, coefs, atol=10e-3)
True
"""
X = PolynomialRegression._design_matrix(x_train, self.degree) # noqa: N806
_, cols = X.shape
if np.linalg.matrix_rank(X) < cols:
raise ArithmeticError(
"Design matrix is not full rank, can't compute coefficients"
)
# np.linalg.pinv() computes the Moore-Penrose pseudoinverse using SVD
self.params = np.linalg.pinv(X) @ y_train
def predict(self, data: np.ndarray) -> np.ndarray:
"""
Computes the predicted response values y for the given input data by
constructing the design matrix X and evaluating y = Xβ.
@param data: the predictor values x for prediction
@returns: the predicted response values y = Xβ
@raises ArithmeticError: if this function is called before the model
parameters are fit
>>> x = np.array([0, 1, 2, 3, 4])
>>> y = x**3 - 2 * x**2 + 3 * x - 5
>>> poly_reg = PolynomialRegression(degree=3)
>>> poly_reg.fit(x, y)
>>> poly_reg.predict(np.array([-1]))
array([-11.])
>>> poly_reg.predict(np.array([-2]))
array([-27.])
>>> poly_reg.predict(np.array([6]))
array([157.])
>>> PolynomialRegression(degree=3).predict(x)
Traceback (most recent call last):
...
ArithmeticError: Predictor hasn't been fit yet
"""
if self.params is None:
raise ArithmeticError("Predictor hasn't been fit yet")
return PolynomialRegression._design_matrix(data, self.degree) @ self.params
def main() -> None:
"""
Fit a polynomial regression model to predict fuel efficiency using seaborn's mpg
dataset
>>> pass # Placeholder, function is only for demo purposes
"""
import seaborn as sns
mpg_data = sns.load_dataset("mpg")
poly_reg = PolynomialRegression(degree=2)
poly_reg.fit(mpg_data.weight, mpg_data.mpg)
weight_sorted = np.sort(mpg_data.weight)
predictions = poly_reg.predict(weight_sorted)
plt.scatter(mpg_data.weight, mpg_data.mpg, color="gray", alpha=0.5)
plt.plot(weight_sorted, predictions, color="red", linewidth=3)
plt.title("Predicting Fuel Efficiency Using Polynomial Regression")
plt.xlabel("Weight (lbs)")
plt.ylabel("Fuel Efficiency (mpg)")
plt.show()
if __name__ == "__main__":
import doctest
doctest.testmod()
main()