train.py
import argparse
import datetime
import os
import pickle
import random
import time

import numpy as np
import simpy
np_precision = np.float32
# Run TensorFlow on the CPU only (hide all GPUs)
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
import tensorflow as tf
# Show only error-level TensorFlow log messages
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# Enable this if the machine does not have enough memory
memory_restriction = False
if memory_restriction:
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession
config = ConfigProto()
# config.gpu_options.per_process_gpu_memory_fraction = 0.8
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)
# Import agent modules
from src.prod_environment import System
import src.util as u
import src.tfloss as loss
from src.tfmodel import ActiveInferenceModel
from src.tfutils import *
parser = argparse.ArgumentParser(description='Training script.')
parser.add_argument('-b', '--batch', type=int, default=1, help='Select batch size.')
parser.add_argument('-g', '--gamma', type=float, default=0.0, help='Select gamma for balance in the hybrid planning') # gamma_hybrid hyperparameter responsible for controlling the balance between short and long horizon in EFE
parser.add_argument('-s', '--steps', type=int, default=30, help='How many actions the transition considers')
parser.add_argument('--samples', type=int, default=10, help='How many samples should be used to calculate EFE')
parser.add_argument('--calc_mean', action='store_true', help='Whether mean should be considered during calculation of EFE')
args = parser.parse_args()
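# Example invocation (illustrative values only):
#   python train.py -b 1 -g 0.5 -s 30 --samples 10 --calc_mean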
'''
Sigmoid parameters for omega:
a: a + d is the maximum value of omega
b: the value of D_kl[pi] at which omega reaches half of its range (i.e. d + a/2)
c: controls the steepness of the sigmoid
d: the minimum value of omega (when the sigmoid is zero)
'''
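# A form consistent with the description above (an assumption for illustration; the
# actual computation lives in loss.compute_omega):
#   omega = a / (1.0 + np.exp((D_kl_pi - b) / c)) + d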
"""
Eight hours (one shift) of the manufacturing environment is roughly 3000 steps.
"""
var_a = 1.0; var_b = 25.0; var_c = 5.0; var_d = 1.5
s_dim = 512; pi_dim = 7; beta_s = 1.0; beta_o = 1.0;
gamma = 0.0; gamma_rate = 0.01; gamma_max = 0.8; gamma_delay = 30
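# gamma is annealed: after gamma_delay epochs it grows by gamma_rate per epoch up to gamma_max (see the epoch loop below)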
l_rate_top = 1e-04; l_rate_mid = 1e-04; l_rate_down = 1e-04
epochs = 30; ROUNDS = 1000;
replay_capacity = 500; replay_size = 200
discount = 0.99; update_target_frequency = 500;
test_frequency = 1; record_frequency = 1;
random_seed = 0
sim_interval = 0.5 #1 #4
o_dim = 10 + 1 + 6*5 + 3
po_dim = o_dim # all the times are removed in the observation and prediction
random_init_time = 1 * 24 * 60 * 60 # 1 day (in seconds)
test_time = 1 * 24 * 60 * 60 # 1 day (in seconds)
signature = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
signature += '_gamma-EFE-' +str(args.gamma) + '_' + str(args.steps) + '_' + str(args.samples) + '_' + str(s_dim) + '_' + str(args.calc_mean) + '_' + str(args.batch) + '_' + str(l_rate_down) + '_' + str(l_rate_top) + '_' + str(ROUNDS)
signature = signature.replace('.', '-')
folder = './results'
folder_results = folder + '/' + signature
folder_chp = folder + '/checkpoints'
# Create the output folders (existing folders are reused)
os.makedirs(folder, exist_ok=True)
os.makedirs(folder_results, exist_ok=True)
os.makedirs(folder_chp, exist_ok=True)
# fixing the random seeds before initializing the model
random.seed(random_seed)
np.random.seed(random_seed)
tf.random.set_seed(random_seed)
print(f'gamma_hybrid (balance between short and long horizon in hybrid EFE) = {args.gamma}')
print(f'Transition steps = {args.steps}')
model = ActiveInferenceModel(s_dim=s_dim, pi_dim=pi_dim, gamma=gamma, steps=args.steps, beta_s=beta_s, beta_o=beta_o, o_dim=o_dim, po_dim=po_dim, gamma_hybrid=args.gamma, samples=args.samples, calc_mean=args.calc_mean)
replay_buffer = u.ReplayBuffer(capacity=replay_capacity)
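# Interactions are stored here and re-sampled at every training round (see the ROUNDS loop below)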
# Training performance is recorded here; EFE terms are also retrieved from the model
stats_start = {
'F': {},
'F_top': {},
'F_mid': {},
'F_down': {},
'kl_div_s': {},
'kl_div_s_anal': {},
'kl_div_s_naive': {},
'kl_div_s_naive_anal': {},
'bin_cross_entr_o': {},
'kl_div_pi': {},
'reward_mse': {},
'buffer_recon': {},
'machine_state_recon': {},
'reward': {},
'test_G': {},
'test_q_term': {},
'test_term0': {},
'test_term1': {},
'test_term2': {},
'test_G_mean': {},
'test_q_term_mean': {},
'test_term0_mean': {},
'test_term1_mean': {},
'test_term2_mean': {},
'test_reward': {},
'test_reward_energy': {},
'test_reward_prod': {},
'test_reward_average': {},
'test_throughput_loss': {},
'test_energy_saving': {},
'test_improving_consum_part': {},
'best_test' : {},
}
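# Each entry maps epoch -> recorded values: train-level entries collect one value per round and are
# reduced to their epoch mean, while test-level entries hold a single value per tested epoch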
stats = stats_start
optimizers = {}
optimizers['top'] = tf.keras.optimizers.Adam(learning_rate=l_rate_top)
optimizers['mid'] = tf.keras.optimizers.Adam(learning_rate=l_rate_mid)
optimizers['down'] = tf.keras.optimizers.Adam(learning_rate=l_rate_down)
best_test_reward = 0
best_test_epoch = 0
start_epoch = 1
start_time = time.time()
for epoch in range(start_epoch, epochs + 1):
u.reset_recordings(model, batch_size=args.batch)
# Stats epoch init
stats['F'][epoch] = []
stats['F_top'][epoch] = []
stats['F_mid'][epoch] = []
stats['F_down'][epoch] = []
stats['bin_cross_entr_o'][epoch] = []
stats['kl_div_s'][epoch] = []
stats['kl_div_s_anal'][epoch] = []
stats['kl_div_s_naive'][epoch] = []
stats['kl_div_s_naive_anal'][epoch] = []
stats['kl_div_pi'][epoch] = []
stats['reward_mse'][epoch] = []
stats['buffer_recon'][epoch] = []
stats['machine_state_recon'][epoch] = []
stats['reward'][epoch] = []
stats['test_G'][epoch] = []
if epoch > gamma_delay and model.model_down.gamma < gamma_max:
model.model_down.gamma.assign(model.model_down.gamma+gamma_rate)
env=simpy.Environment()
env_test=simpy.Environment()
system = System(env=env, number_of_systems=args.batch, dmodel=model, warmup=False)
system_test = System(env=env_test, number_of_systems=1, dmodel=model, warmup=False)
# systems warmup, which also removes the profile
system.warmup()
system_test.warmup()
    # Switch decision making to the AIF module/algorithm instead of the system's default "ALL ON" policy
system.dmodel_from_now()
system_test.dmodel_from_now()
    # Initialize the systems with a random agent; this creates a reasonable starting preference (~0.67) to improve upon
model.decision = 'random'
env.run(until=env.now + random_init_time)
env_test.run(until=env_test.now + random_init_time)
model.decision = 'aif'
    print(f'Epoch {epoch}: Random init is now done! Decisions are now based on the AIF model!')
    # Pre-fill the replay buffer using the model (which is still random/untrained at this point)
while replay_buffer.size() < replay_size:
o0, o1, pi0, log_Ppi, r, os0, os1, pis = u.batch_observe(env, model, sim_interval, batch_size=args.batch, steps=args.steps)
replay_buffer.record_interaction(o0=o0, o1=o1, pi0=pi0, r=r, os0=os0, os1=os1, pis=pis)
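    # The buffer now holds at least replay_size interactions, so sampling replay_size - 1 of them in the training loop below never underflows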
# untrained testing and stat recording
if epoch == 1:
print('Initial testing')
        epoch = 0 # record the untrained agent's performance as epoch 0; restored to 1 below
u.reset_recordings(model, batch_size=args.batch)
env_test.run(until = env_test.now + test_time)
        # Query rewards and performance from the last time window of the testing system
rewards = system_test.systems[0].reward()
performances = system_test.systems[0].performance()
print(f'Testing in epoch {epoch} (untrained): Rewards -> {rewards} - Performances -> {performances}')
        # Record the EFE terms for this test run
        # There is a single test environment in the batch, so its id is 0
stats['test_G'][epoch] = np.mean(model.efe[0]['G'], axis=0)
stats['test_q_term'][epoch] = np.mean(model.efe[0]['q_term'], axis=0)
stats['test_term0'][epoch] = np.mean(model.efe[0]['term0'], axis=0)
stats['test_term1'][epoch] = np.mean(model.efe[0]['term1'], axis=0)
stats['test_term2'][epoch]= np.mean(model.efe[0]['term2'], axis=0)
stats['test_G_mean'][epoch] = np.mean(model.efe[0]['G'])
stats['test_q_term_mean'][epoch] = np.mean(model.efe[0]['q_term'])
stats['test_term0_mean'][epoch] = np.mean(model.efe[0]['term0'])
stats['test_term1_mean'][epoch] = np.mean(model.efe[0]['term1'])
stats['test_term2_mean'][epoch] = np.mean(model.efe[0]['term2'])
stats['test_reward'][epoch] = rewards[-1]
stats['test_reward_energy'][epoch] = rewards[-2]
stats['test_reward_prod'][epoch] = rewards[-3]
        stats['test_reward_average'][epoch] = np.mean(np.array(model.o_t[0]).reshape(-1, o_dim)[:, -1]) # model.o_t[0] was reset at the beginning of testing
stats['test_throughput_loss'][epoch] = performances[-3]
stats['test_energy_saving'][epoch] = performances[-2]
stats['test_improving_consum_part'][epoch] = performances[-1]
if stats['test_reward'][epoch] > best_test_reward:
best_test_reward = stats['test_reward'][epoch]
best_test_epoch = epoch
stats['best_test'][epoch] = epoch
stats['best_test'][epoch] = best_test_epoch
u.reset_recordings(model, batch_size=args.batch)
# saving the initial stats dictionary for further analysis during training
stats_path = folder_results + '/stats_epoch_' + str(epoch) + '.pkl'
# Save the dictionary to a binary file using pickle
with open(stats_path, 'wb') as file:
pickle.dump(stats, file)
        epoch = 1 # restore epoch to 1 for the first training round
for round in range(ROUNDS):
""""
This part of the code is responsible for having a new observation for all the systems in the batch
we check whether i_th element of observation is obtained for all
sim_interval is better to be small to prevent having more than one observation in either case before update
"""
o0, o1, pi0, log_Ppi, r, os0, os1, pis = u.batch_observe(env, model, sim_interval, batch_size=args.batch, steps=args.steps) # obtaining new observation for all systems in the batch
        batch = replay_buffer.sample(replay_size - 1)
        o0b, o1b, pi0b, rb, os0b, os1b, pisb = map(list, zip(*batch))
update_size = args.batch*replay_size
# ensuring the last interaction is added
o0b.append(o0)
o1b.append(o1)
pi0b.append(pi0)
rb.append(r)
os0b.append(os0)
os1b.append(os1)
pisb.append(pis)
# now recording the interaction in the replay buffer
replay_buffer.record_interaction(o0=o0, o1=o1, pi0=pi0, r=r, os0=os0, os1=os1, pis=pis)
# creating np arrays for training
o0b = np.array(o0b, dtype=np_precision).reshape(update_size,-1)
o1b = np.array(o1b, dtype=np_precision).reshape(update_size,-1)
pi0b = np.array(pi0b, dtype=np_precision).reshape(update_size,-1)
rb = np.array(rb, dtype=np_precision).reshape(update_size,-1)
os0b = np.array(os0b, dtype=np_precision).reshape(update_size,-1)
os1b = np.array(os1b, dtype=np_precision).reshape(update_size,-1)
pisb = np.array(pisb, dtype=np_precision).reshape(update_size,-1)
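        # Each array now has shape (args.batch * replay_size, feature_dim): replay_size - 1 sampled interactions plus the latest one, each covering args.batch systems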
# -- TRAIN TOP LAYER ---------------------------------------------------
""""
can be trained with replay buffer; however, omega can't be computed with replay buffer as it is following the current decision/policy (i.e., log_Ppi) and therefore should be computed with the latest policy
"""
        # DQN-style update of the top layer
F_top = loss.train_model_top(model_top=model.model_top, o0=o0b, o1=o1b, pi0=pi0b, r=rb, optimizer=optimizers['top'], gamma=discount)
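        # Synchronize the target network only every update_target_frequency rounds to keep the bootstrapped value targets stable (standard DQN practice)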
if round % update_target_frequency == 0:
synchronize_target_network(model.model_top)
current_omega, kl_div_pi = loss.compute_omega(model=model, o0=o0, log_Ppi=log_Ppi, a=var_a, b=var_b, c=var_c, d=var_d)
current_omega = current_omega.reshape(-1,1)
        current_omega_b = np.repeat(current_omega, repeats=(o0b.shape[0] // args.batch), axis=0) # broadcast the current omega over the replayed batch; omega must come from the latest policy (see above), so it cannot be taken from the replay buffer
# -- TRAIN MIDDLE LAYER ------------------------------------------------ can be trained with replay buffer
qss0b, _, _ = model.model_down.encoder_with_sample(os0b)
qss1b_mean, qss1b_logvar = model.model_down.encoder(os1b)
pss1b_mean, pss1b_logvar = loss.train_model_mid(model_mid=model.model_mid, s0=qss0b, qs1_mean=qss1b_mean, qs1_logvar=qss1b_logvar, Ppi_sampled=pisb, omega=current_omega_b, optimizer=optimizers['mid'])
# -- TRAIN DOWN LAYER -------------------------------------------------- can be trained with replay buffer
loss.train_model_down(model_down=model.model_down, o1=os1b, ps1_mean=pss1b_mean, ps1_logvar=pss1b_logvar, omega=current_omega_b, optimizer=optimizers['down'])
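        # As the arguments suggest, the middle layer models the latent transition s0 -> s1 under the sampled policy, and the down layer reconstructs observations from latent states; their loss terms are recomputed below for the training statistics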
# -- COMPUTING LOSS TERMS --------------------------------------------------
F_mid, loss_terms_mid, pss1b, pss1b_mean, pss1b_logvar = loss.compute_loss_mid(model_mid=model.model_mid, s0=qss0b, Ppi_sampled=pisb, qs1_mean=qss1b_mean, qs1_logvar=qss1b_logvar, omega=current_omega_b)
F_down, loss_terms, pos1b, qss1b = loss.compute_loss_down(model_down=model.model_down, o1=os1b, ps1_mean=pss1b_mean, ps1_logvar=pss1b_logvar, omega=current_omega_b)
        # train-level stats
stats['F'][epoch].append(np.mean(F_down) + np.mean(F_mid) + np.mean(F_top))
stats['F_top'][epoch].append(np.mean(F_top))
stats['F_mid'][epoch].append(np.mean(F_mid))
stats['F_down'][epoch].append(np.mean(F_down))
stats['bin_cross_entr_o'][epoch].append(np.mean(loss_terms[0]))
stats['kl_div_s'][epoch].append(np.mean(loss_terms[1]))
stats['kl_div_s_anal'][epoch].append(np.mean(loss_terms[2],axis=0))
stats['kl_div_s_naive'][epoch].append(np.mean(loss_terms[3]))
stats['kl_div_s_naive_anal'][epoch].append(np.mean(loss_terms[4],axis=0))
stats['kl_div_pi'][epoch].append(np.mean(kl_div_pi))
stats['reward_mse'][epoch].append(np.mean(u.compare_reward(o1b, pos1b)))
stats['buffer_recon'][epoch].append(u.buffer_recon(o1b, pos1b)[0]) # using Euclidean distance over batches
stats['machine_state_recon'][epoch].append(u.machine_state_recon(o1b, pos1b))
stats['reward'][epoch].append(np.mean(o1[:,-1])) # batch isn't necessary
    # Keep only the per-epoch mean of each train-level stat (to reduce the size of the stats file)
stats['F'][epoch] = np.mean(stats['F'][epoch])
stats['F_top'][epoch] = np.mean(stats['F_top'][epoch])
stats['F_mid'][epoch] = np.mean(stats['F_mid'][epoch])
stats['F_down'][epoch] = np.mean(stats['F_down'][epoch])
stats['bin_cross_entr_o'][epoch] = np.mean(stats['bin_cross_entr_o'][epoch])
stats['kl_div_s'][epoch] = np.mean(stats['kl_div_s'][epoch])
stats['kl_div_s_anal'][epoch] = np.mean(stats['kl_div_s_anal'][epoch] )
stats['kl_div_s_naive'][epoch] = np.mean(stats['kl_div_s_naive'][epoch])
stats['kl_div_s_naive_anal'][epoch] = np.mean(stats['kl_div_s_naive_anal'][epoch])
stats['kl_div_pi'][epoch] = np.mean(stats['kl_div_pi'][epoch])
stats['reward_mse'][epoch] = np.mean(stats['reward_mse'][epoch])
stats['buffer_recon'][epoch] = np.mean(stats['buffer_recon'][epoch])
stats['machine_state_recon'][epoch] = np.mean(stats['machine_state_recon'][epoch])
stats['reward'][epoch] = np.mean(stats['reward'][epoch])
# testing
if epoch % test_frequency == 0:
u.reset_recordings(model, batch_size=args.batch)
env_test.run(until = env_test.now + test_time)
        # Query rewards and performance from the last time window of the testing system
rewards = system_test.systems[0].reward()
performances = system_test.systems[0].performance()
print(f'Testing in epoch {epoch}: Rewards -> {rewards} - Performances -> {performances}')
        # Record the EFE terms for this test run
        # There is a single test environment in the batch, so its id is 0
# stats['test_G'][epoch].append(model.efe[0]['G']) # excluded due to the stats file size
stats['test_G'][epoch] = np.mean(model.efe[0]['G'], axis=0)
stats['test_q_term'][epoch] = np.mean(model.efe[0]['q_term'], axis=0)
stats['test_term0'][epoch] = np.mean(model.efe[0]['term0'], axis=0)
stats['test_term1'][epoch] = np.mean(model.efe[0]['term1'], axis=0)
stats['test_term2'][epoch]= np.mean(model.efe[0]['term2'], axis=0)
stats['test_G_mean'][epoch] = np.mean(model.efe[0]['G'])
stats['test_q_term_mean'][epoch] = np.mean(model.efe[0]['q_term'])
stats['test_term0_mean'][epoch] = np.mean(model.efe[0]['term0'])
stats['test_term1_mean'][epoch] = np.mean(model.efe[0]['term1'])
stats['test_term2_mean'][epoch] = np.mean(model.efe[0]['term2'])
stats['test_reward'][epoch] = rewards[-1]
stats['test_reward_energy'][epoch] = rewards[-2]
stats['test_reward_prod'][epoch] = rewards[-3]
        stats['test_reward_average'][epoch] = np.mean(np.array(model.o_t[0]).reshape(-1, o_dim)[:, -1]) # model.o_t[0] was reset at the beginning of testing
stats['test_throughput_loss'][epoch] = performances[-3]
stats['test_energy_saving'][epoch] = performances[-2]
stats['test_improving_consum_part'][epoch] = performances[-1]
if stats['test_reward'][epoch] > best_test_reward:
best_test_reward = stats['test_reward'][epoch]
best_test_epoch = epoch
stats['best_test'][epoch] = epoch
stats['best_test'][epoch] = best_test_epoch
u.reset_recordings(model, batch_size=args.batch)
if epoch % record_frequency == 0:
# save the stats dictionary for further analysis during training
stats_path = folder_results + '/stats_epoch_' + str(epoch) + '.pkl'
# save the dictionary to a binary file using pickle
with open(stats_path, 'wb') as file:
pickle.dump(stats, file)
# save the stats dictionary for further analysis
stats_path = folder_results + '/stats_final.pkl'
# save the dictionary to a binary file using pickle
with open(stats_path, 'wb') as file:
pickle.dump(stats, file)