import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy
import typing as tp
from collections import OrderedDict, deque
import matplotlib.animation as anim
import random
from dataclasses import dataclass
from itertools import count
import torch
from torch import nn, Tensor
ENV_NAME = "InvertedDoublePendulum-v5"
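# Note on running this script (assumptions about the setup, not enforced by the code):
# it expects `gymnasium[mujoco]` for the MuJoCo environment, plus `torch`, `numpy` and `matplotlib`,
# and it writes the trained-agent GIF and the reward plot into an `images/` directory,
# which must exist beforehand.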
def update_scene(num, frames, patch):
    patch.set_data(frames[num])
    return patch,

def plot_animation(frames:list, save_path:tp.Optional[str]=None, repeat=False, interval=40, title:tp.Optional[str]=None):
    fig = plt.figure()
    patch = plt.imshow(frames[0])
    plt.axis('off')
    if title:
        plt.title(title)
    animation = anim.FuncAnimation(
        fig, update_scene, fargs=(frames, patch),
        frames=len(frames), repeat=repeat, interval=interval)
    if save_path is not None:
        animation.save(save_path, writer="pillow", fps=20)
    return animation
def show_one_episode(action_sampler:tp.Callable, save_path:tp.Optional[str]=None, repeat=False, title:tp.Optional[str]=None):
    frames = []
    env = gym.make(ENV_NAME, render_mode="rgb_array")
    obs, info = env.reset()
    sum_rewards = 0.0
    with torch.no_grad():
        for step in count(0):
            frames.append(env.render())
            action = action_sampler(obs)
            obs, reward, done, truncated, info = env.step(action)
            sum_rewards += reward
            if done or truncated:
                print("Sum of Rewards:", sum_rewards)
                print("done at step", step+1)
                break
    env.close()
    return plot_animation(frames, repeat=repeat, save_path=save_path, title=title)
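# Example usage (hypothetical, not part of the training flow): `action_sampler` only has to map an
# observation (np.ndarray) to a valid action array, so a random policy works as a smoke test:
#
#   rng_env = gym.make(ENV_NAME)
#   show_one_episode(lambda obs: rng_env.action_space.sample(), title="Random policy")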
@dataclass
class xonfig:
    num_episodes:int = 1000
    gamma:float = 0.99
    device:torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    num_updates:int = 1 # gradient updates per environment step; 10 updates per step gave noticeably worse results in practice
    adaptive_alpha:bool = True
    alpha:float = 0.12 # initial entropy temperature; overwritten every update when adaptive_alpha is True
    tau: float = 0.005
    buffer_size:int = 50_000
    batch_size:int = 64
    dqn_lr:float = 5e-4
    actor_lr:float = 5e-4
    alpha_lr:float = 5e-4
    weight_decay:float = 0.0
    hidden_dim:int = 64
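# Note: `xonfig` is used as a namespace of class attributes and is never instantiated,
# so overriding a hyperparameter is just an attribute assignment, e.g. `xonfig.num_episodes = 200`
# before the training loop for a quick run. `alpha` is the only field mutated during training
# (see the temperature update in sac_train_step).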
class ActorPolicy(nn.Module):
    def __init__(
        self,
        state_dims:int,
        hidden_dim:int,
        action_dims:int,
    ):
        super().__init__()
        self.l1 = nn.Linear(state_dims, hidden_dim); self.relu1 = nn.ReLU()
        self.l2 = nn.Linear(hidden_dim, hidden_dim); self.relu2 = nn.ReLU()
        self.mu_mean = nn.Linear(hidden_dim, action_dims)
        self.sigma_log_std = nn.Linear(hidden_dim, action_dims)

    def forward(self, state:Tensor):
        x = self.relu1(self.l1(state))
        x = self.relu2(self.l2(x))
        mu = self.mu_mean(x)
        # If log_std is too small (e.g. log_std < -20), the standard deviation collapses towards zero,
        # giving extremely peaked distributions and numerical issues (exploding gradients, division by
        # near-zero values) during backpropagation.
        # If log_std is too large (e.g. log_std > 2), the standard deviation becomes excessively large,
        # giving very high-variance policies, excessive exploration, and poor convergence.
        # The range [-20, 2] has been found to work well across a variety of continuous control tasks.
        std = torch.clip(self.sigma_log_std(x), -20, 2).exp() # after exp, std is bounded to (2.06e-09, 7.389)
        return mu, std
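# Quick illustrative check (never called anywhere in this script): verifies the output shapes and the
# std bounds claimed above. The dimensions below are placeholders, not tied to any particular environment.
def _policy_sanity_check(state_dims:int = 9, action_dims:int = 1, batch:int = 4):
    net = ActorPolicy(state_dims, hidden_dim=64, action_dims=action_dims)
    mu, std = net(torch.randn(batch, state_dims))
    assert mu.shape == (batch, action_dims) and std.shape == (batch, action_dims)
    # std is strictly positive and can never exceed exp(2) because log_std is clipped to [-20, 2]
    assert torch.all(std > 0) and torch.all(std <= float(np.exp(2)) + 1e-4)
    return mu, std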
class CriticActionValue(nn.Module):
    def __init__(self, state_dims:int, hidden_dim:int, action_dims:int):
        super().__init__()
        self.l1 = nn.Linear(state_dims + action_dims, hidden_dim); self.relu1 = nn.ReLU()
        self.l2 = nn.Linear(hidden_dim, hidden_dim); self.relu2 = nn.ReLU()
        self.l3 = nn.Linear(hidden_dim, 1)

    def forward(
        self,
        state:Tensor, # (B, state_dims)
        action:Tensor # (B, action_dims)
    ):
        x = torch.cat([state, action], dim=-1) # (B, state_dims + action_dims)
        x = self.relu1(self.l1(x))
        x = self.relu2(self.l2(x))
        q_value = self.l3(x)
        return q_value # (B, 1)
@torch.no_grad()
def update_ema(ema_model:nn.Module, model:nn.Module, decay:float):
    ema_params = OrderedDict(ema_model.named_parameters())
    model_params = OrderedDict(model.named_parameters())
    for name, param in model_params.items():
        # ema = decay * ema + (1 - decay) * param
        ema_params[name].mul_(decay).add_(param.data, alpha=1-decay)
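# The soft ("Polyak") target update used by SAC is usually written as
#   θ_target ← (1 - τ) * θ_target + τ * θ
# so calling update_ema(..., decay=1 - xonfig.tau) in the training loop below is the same thing,
# just expressed with decay = 1 - τ. An equivalent in-place form would be
#   ema_params[name].lerp_(param.data, 1 - decay)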
def sample_actions(state:Tensor, actions_max_bound:float):
    mu, std = policy_net(state)
    dist = torch.distributions.Normal(mu, std)
    unbound_action = dist.rsample() # (B, action_dims); rsample() keeps gradients via the reparameterization trick (dist.sample() would not)
    tanh_action = torch.tanh(unbound_action) # squash into [-1, 1]
    action = tanh_action * actions_max_bound # [-1, 1] * max => [-max, max]
    # Tanh change-of-variables correction (see the note below): the log-density of the squashed action
    # picks up a -log(1 - tanh(u)^2) term per dimension, plus a constant -log(max_bound) for the
    # rescaling (zero here, since max_bound == 1).
    log_prob = dist.log_prob(unbound_action) - torch.log(1 - tanh_action.pow(2) + 1e-6) - float(np.log(actions_max_bound))
    log_prob:Tensor = log_prob.sum(dim=-1, keepdim=True) # sum over action dimensions => (B, 1)
    return action, log_prob
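# Why the correction term: if u ~ Normal(mu, std) and a = c * tanh(u), then by the change-of-variables
# formula for densities,
#   log pi(a) = log rho(u) - sum_i log |d a_i / d u_i|
#             = log rho(u) - sum_i [ log(1 - tanh(u_i)^2) + log(c) ]
# because d/du (c * tanh(u)) = c * (1 - tanh(u)^2). The 1e-6 inside the log only guards against log(0)
# when tanh saturates. This matches the "Enforcing Action Bounds" appendix of the original SAC paper.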
@torch.compile()
def sac_train_step(
    states:Tensor,
    actions:Tensor,
    next_states:Tensor,
    rewards:Tensor,
    is_terminal:Tensor
):
    """
    * `states`: `(B, state_dim)`
    * `actions`: `(B, action_dim)`
    * `next_states`: `(B, state_dim)`
    * `rewards`: `(B,)`
    * `is_terminal`: `(B,)`
    """
    rewards, is_terminal = rewards.unsqueeze(-1), is_terminal.unsqueeze(-1) # (B,) => (B, 1)
    # Optimize the twin Q-networks (critics)
    ## a_next ~ π(s_next)
    ## target Q values: y = r + γ * ( min(Q_target1, Q_target2)(s_next, a_next) - α * log π(a_next|s_next) ) * (1 - is_terminal)
    ## L1 = MSE(Q1(s, a), y), L2 = MSE(Q2(s, a), y); optimize both losses
    with torch.no_grad():
        actions_next, log_prob_next = sample_actions(next_states, ACTION_BOUNDS)
        q_next1, q_next2 = dqn_target1(next_states, actions_next), dqn_target2(next_states, actions_next) # (B, 1), (B, 1)
        # log_prob_next is already summed over action dims inside sample_actions, so it is (B, 1)
        # taking the min of the two target Q values reduces maximization (overestimation) bias, see https://arxiv.org/abs/1812.05905
        q_next:Tensor = torch.min(q_next1, q_next2) - xonfig.alpha * log_prob_next # (B, 1)
        q_next_target:Tensor = rewards + xonfig.gamma * q_next * (1 - is_terminal) # (B, 1)
    dqn1_loss = nn.functional.mse_loss(dqn1(states, actions), q_next_target, reduction="mean")
    dqn2_loss = nn.functional.mse_loss(dqn2(states, actions), q_next_target, reduction="mean")
    (dqn1_loss + dqn2_loss).backward()
    dqn1_optimizer.step(); dqn2_optimizer.step()
    dqn1_optimizer.zero_grad(); dqn2_optimizer.zero_grad()

    # Optimize the policy (actor)
    dqn1.requires_grad_(False); dqn2.requires_grad_(False)
    actions_pi, log_probs = sample_actions(states, ACTION_BOUNDS)
    ## maximize entropy by minimizing α * log π, and maximize the Q value by minimizing its negative;
    ## this only tweaks the policy weights (through the sampled actions): the Q-networks stay frozen here
    pi_loss:Tensor = (xonfig.alpha * log_probs - torch.min(dqn1(states, actions_pi), dqn2(states, actions_pi))).mean()
    pi_loss.backward()
    policy_optimizer.step()
    policy_optimizer.zero_grad()
    dqn1.requires_grad_(True); dqn2.requires_grad_(True)

    # Optimize the temperature α
    if xonfig.adaptive_alpha:
        # the temperature loss uses the detached log-probs of the *current* policy actions
        alpha_loss = -(log_alpha * (log_probs.detach() + target_entropy)).mean()
        alpha_loss.backward()
        alpha_optimizer.step()
        alpha_optimizer.zero_grad()
        xonfig.alpha = log_alpha.exp().item()
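# Summary of the three updates above (Q_t_i are the target critics, H_target = target_entropy = -action_dim):
#   critics:     minimize E[ (Q_i(s, a) - y)^2 ],  y = r + γ (1 - done) ( min_i Q_t_i(s', a') - α log π(a'|s') ),  a' ~ π(·|s')
#   actor:       minimize E[ α log π(ã|s) - min_i Q_i(s, ã) ],  ã ~ π(·|s)   (i.e. maximize the soft Q value plus entropy)
#   temperature: minimize E[ -log(α) (log π(ã|s) + H_target) ],  so α rises when the policy entropy drops
#                below H_target and falls when it is above it (see https://arxiv.org/abs/1812.05905).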
if __name__ == "__main__":
    env = gym.make(ENV_NAME)
    STATE_DIMS = env.observation_space.shape[0] # observation dimensionality of InvertedDoublePendulum
    ACTION_DIMS = env.action_space.shape[0] # 1: a single force applied to the cart
    ACTION_BOUNDS = [env.action_space.low, env.action_space.high] # [array([-1.], dtype=float32), array([1.], dtype=float32)]
    ACTION_BOUNDS = ACTION_BOUNDS[1][0] # 1.0; the action space is symmetric, so a single scalar bound suffices
    if xonfig.adaptive_alpha:
        log_alpha = torch.nn.Parameter(
            torch.tensor(np.log(xonfig.alpha), device=xonfig.device, dtype=torch.float32),
            requires_grad=True
        )
        # standard heuristic for the target entropy: -dim(action space), see https://arxiv.org/abs/1812.05905
        target_entropy = torch.tensor(-ACTION_DIMS, device=xonfig.device, dtype=torch.float32)
        alpha_optimizer = torch.optim.AdamW([log_alpha], lr=xonfig.alpha_lr, weight_decay=0.0)
    dqn1 = CriticActionValue(STATE_DIMS, xonfig.hidden_dim, ACTION_DIMS).to(xonfig.device); dqn1.compile()
    # the second critic is initialized independently; a deepcopy of dqn1 would keep the twins identical
    # throughout training (same init, same targets, same data), defeating the min(Q1, Q2) trick
    dqn2 = CriticActionValue(STATE_DIMS, xonfig.hidden_dim, ACTION_DIMS).to(xonfig.device); dqn2.compile()
    dqn_target1 = deepcopy(dqn1).requires_grad_(False)
    dqn_target2 = deepcopy(dqn2).requires_grad_(False)
    dqn1_optimizer = torch.optim.AdamW(dqn1.parameters(), lr=xonfig.dqn_lr, weight_decay=xonfig.weight_decay)
    dqn2_optimizer = torch.optim.AdamW(dqn2.parameters(), lr=xonfig.dqn_lr, weight_decay=xonfig.weight_decay)
    policy_net = ActorPolicy(STATE_DIMS, xonfig.hidden_dim, ACTION_DIMS).to(xonfig.device); policy_net.compile()
    policy_optimizer = torch.optim.AdamW(policy_net.parameters(), lr=xonfig.actor_lr, weight_decay=xonfig.weight_decay)
    replay_buffer = deque(maxlen=xonfig.buffer_size)
    sum_rewards_list = []; num_timesteps_list = []
    try:
        for episode in range(1, xonfig.num_episodes+1):
            state, info = env.reset()
            state = torch.as_tensor(state, device=xonfig.device, dtype=torch.float32)
            sum_rewards = 0.0
            for tstep in count(1):
                # sample an action from the current policy
                with torch.no_grad():
                    action, _log_prob = sample_actions(state.unsqueeze(0), ACTION_BOUNDS) # (1, action_dims)
                    action = action.squeeze(0) # (action_dims,)
                # step the environment with the action and observe the next state and reward
                next_state, reward, done, truncated, info = env.step(action.cpu().detach().numpy())
                next_state = torch.as_tensor(next_state, dtype=torch.float32, device=xonfig.device)
                sum_rewards += reward
                # store the transition as (next_state, action, reward, state, done);
                # the unpacking order when sampling a batch below must match this order
                replay_buffer.append((
                    next_state.cpu(), action.cpu(), torch.as_tensor(reward).cpu(),
                    state.cpu(), torch.as_tensor(done).cpu()
                ))
                # optimize the networks once the buffer holds enough transitions
                if len(replay_buffer) >= xonfig.batch_size*5:
                    for _ in range(xonfig.num_updates):
                        batched_samples = random.sample(replay_buffer, xonfig.batch_size)
                        next_states, actions, rewards, states, dones = [
                            torch.as_tensor(np.asarray(inst), device=xonfig.device, dtype=torch.float32) for inst in list(zip(*batched_samples))
                        ] # (B, state_dim), (B, action_dim), (B,), (B, state_dim), (B,)
                        sac_train_step(states, actions, next_states, rewards, dones)
                    update_ema(dqn_target1, dqn1, decay=1 - xonfig.tau)
                    update_ema(dqn_target2, dqn2, decay=1 - xonfig.tau)
                if done or truncated:
                    break
                state = next_state
            sum_rewards_list.append(sum_rewards)
            num_timesteps_list.append(tstep)
            print(f"|| Episode: {episode} || Sum of Rewards: {sum_rewards:.4f} || Timesteps: {tstep} ||")
    except KeyboardInterrupt:
        print("Training Interrupted")
    adaptive_str = 'adaptive_alpha' if xonfig.adaptive_alpha else ''
    show_one_episode(
        lambda x: sample_actions(torch.as_tensor(x, dtype=torch.float32, device=xonfig.device).unsqueeze(0), ACTION_BOUNDS)[0].squeeze(0).cpu().numpy(),
        save_path=f"images/sac_{ENV_NAME.lower()}_.gif",
        title=f"SAC Trained Agent{f' {adaptive_str}' if adaptive_str else ''}"
    ); plt.close()
    plt.plot(sum_rewards_list, label="Sum of Rewards")
    plt.plot(num_timesteps_list, label="Timesteps")
    # guard the tick step against small reward maxima (a step of 0 would make np.arange raise)
    plt.yticks(np.arange(0, (mx := max(sum_rewards_list)), max(mx / 10, 1.0)).tolist())
    plt.xlabel("Episodes")
    plt.grid(True)
    plt.legend()
    plt.ylabel("Sum of Rewards")
    plt.title("Sum of Rewards per Episode")
    plt.savefig(f"images/sac_rewards_{adaptive_str}_{ENV_NAME}.png")
    plt.show()
    plt.close()