import numpy as np


class Optimizer:
    """Base class: stores the learning rate and defines the `step` interface."""

    def __init__(self, lr=0.01):
        self.lr = lr

    def step(self, param, dparam):
        """Return the updated parameter given the current gradient `dparam`."""
        pass


class SGD(Optimizer):
    """Stochastic gradient descent with optional classical momentum."""

    def __init__(self, lr=0.01, momentum=0.0):
        super().__init__(lr=lr)
        self.momentum = momentum
        self.v = None  # velocity buffer, lazily initialised on the first step

    def step(self, param, dparam):
        if self.v is None:
            self.v = np.zeros_like(dparam)
        # v_t = momentum * v_{t-1} + lr * grad;  param_t = param_{t-1} - v_t
        self.v = self.momentum * self.v + self.lr * dparam
        return param - self.v


class Adagrad(Optimizer):
    """Adagrad: scale the step by the accumulated sum of squared gradients."""

    def __init__(self, lr=0.01, eps=1e-8):
        super().__init__(lr=lr)
        self.eps = eps
        self.squared_grad = None  # running sum of squared gradients

    def step(self, param, dparam):
        if self.squared_grad is None:
            self.squared_grad = np.zeros_like(dparam)
        self.squared_grad = self.squared_grad + dparam ** 2
        return param - self.lr * dparam / (np.sqrt(self.squared_grad) + self.eps)


class Adadelta(Optimizer):
    """Adadelta: replace the global learning rate with a ratio of running RMS values."""

    def __init__(self, lr=0.01, rho=0.9, eps=1e-8):
        super().__init__(lr=lr)
        self.rho = rho
        self.eps = eps
        self.acc_update = None   # running average of squared parameter updates
        self.squared_avg = None  # running average of squared gradients

    def step(self, param, dparam):
        if self.acc_update is None:
            self.acc_update = np.zeros_like(dparam)
            self.squared_avg = np.zeros_like(dparam)
        # Accumulate the squared gradients.
        self.squared_avg = self.rho * self.squared_avg + (1 - self.rho) * dparam * dparam
        std = np.sqrt(self.squared_avg + self.eps)
        # Step size is RMS(previous updates) / RMS(gradients); `lr` is not used here.
        delta = (np.sqrt(self.acc_update + self.eps) / std) * dparam
        param = param - delta
        # Accumulate the squared updates for the next step.
        self.acc_update = self.rho * self.acc_update + (1 - self.rho) * delta * delta
        return param


class RMSProp(Optimizer):
    """RMSProp: scale the step by an exponential moving average of squared gradients."""

    def __init__(self, lr=0.01, rho=0.9, eps=1e-8):
        super().__init__(lr=lr)
        self.rho = rho
        self.eps = eps
        self.squared_avg = None  # moving average of squared gradients

    def step(self, param, dparam):
        if self.squared_avg is None:
            self.squared_avg = np.zeros_like(dparam)
        self.squared_avg = self.rho * self.squared_avg + (1 - self.rho) * dparam * dparam
        return param - self.lr * dparam / (np.sqrt(self.squared_avg) + self.eps)


class Adam(Optimizer):
    """Adam: bias-corrected first and second moment estimates of the gradient."""

    def __init__(self, lr=0.01, beta1=0.9, beta2=0.999, eps=1e-8):
        super().__init__(lr=lr)
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps
        self.t = 0     # time step, used for bias correction
        self.m = None  # first moment (mean) of the gradients
        self.v = None  # second moment (uncentered variance) of the gradients

    def step(self, param, dparam):
        self.t += 1
        if self.m is None:
            self.m = np.zeros_like(dparam)
            self.v = np.zeros_like(dparam)
        self.m = self.beta1 * self.m + (1 - self.beta1) * dparam
        self.v = self.beta2 * self.v + (1 - self.beta2) * (dparam * dparam)
        # Bias-corrected moment estimates.
        mt = self.m / (1 - self.beta1 ** self.t)
        vt = self.v / (1 - self.beta2 ** self.t)
        return param - self.lr * mt / (np.sqrt(vt) + self.eps)


class AdaMax(Optimizer):
    """AdaMax: Adam variant that tracks the second moment with the infinity norm."""

    def __init__(self, lr=0.01, beta1=0.9, beta2=0.999, eps=1e-8):
        super().__init__(lr=lr)
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps
        self.t = 0     # time step, used for bias correction of the first moment
        self.m = None  # first moment (mean) of the gradients
        self.u = None  # exponentially weighted infinity norm of the gradients

    def step(self, param, dparam):
        self.t += 1
        if self.m is None:
            self.m = np.zeros_like(dparam)
            self.u = np.zeros_like(dparam)
        self.m = self.beta1 * self.m + (1 - self.beta1) * dparam
        self.u = np.maximum(self.beta2 * self.u, np.abs(dparam))
        # Bias-correct only the first moment; the max-based `u` needs no correction.
        mt = self.m / (1 - self.beta1 ** self.t)
        return param - self.lr * mt / (self.u + self.eps)
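

# A minimal usage sketch (not part of the original module): assuming each
# optimizer's `step` returns the updated parameters rather than mutating them
# in place, the caller reassigns `param` after every call. The quadratic
# objective and the hyperparameter values below are illustrative assumptions.
if __name__ == "__main__":
    optimizers = [
        SGD(lr=0.1, momentum=0.9),
        Adagrad(lr=0.5),
        Adadelta(rho=0.95),
        RMSProp(lr=0.05),
        Adam(lr=0.1),
        AdaMax(lr=0.1),
    ]
    for opt in optimizers:
        # Minimize f(w) = 0.5 * ||w||^2, whose gradient is simply w.
        w = np.array([5.0, -3.0])
        for _ in range(200):
            grad = w
            w = opt.step(w, grad)
        print(f"{type(opt).__name__:>8s}: {w}")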