
Commit cb6bc60
Issue #10: adjust demo hyperparameters
s-sd committed Aug 18, 2023
1 parent 18145cd commit cb6bc60
Showing 5 changed files with 18 additions and 17 deletions.
2 changes: 1 addition & 1 deletion demos/cartpole_single_agent.py
@@ -40,7 +40,7 @@
reinforce.learning_rate = 0.0001
reinforce.optimizer = tf.keras.optimizers.Adam(reinforce.learning_rate, epsilon=1e-6, clipnorm=1e1)

-reinforce = train(reinforce, trials=10, episodes_per_trial=26, epochs_per_trial=4, batch_size=32, verbose=True)
+reinforce = train(reinforce, trials=12, episodes_per_trial=26, epochs_per_trial=4, batch_size=32, verbose=True)

rewards, lengths = test(reinforce, trials=2, episodes_per_trial=16, deterministic=True)

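For orientation, the changed line above sits in the single-agent CartPole demo, which wires a Keras policy network and optimizer into spurl's train/test helpers. The sketch below is illustrative only: the gymnasium environment, the spurl import paths, the agent class name and constructor, and the plain Keras model standing in for the demo's build_policy_network helper are all assumptions; the hyperparameters mirror the diff.

# Hedged sketch, not the repository's demo code. Names marked "assumed" are not
# confirmed by this commit.
import gymnasium as gym                                    # assumed environment backend
import tensorflow as tf

from spurl.core import train, test                         # test() appears in spurl/core.py in this diff; train is assumed to sit beside it
from spurl.algorithms.reinforce.discrete import REINFORCE  # class name assumed; the module path appears in this diff

env = gym.make('CartPole-v1')                              # assumed; matches the demo's name

# plain Keras model standing in for the demo's build_policy_network helper
policy_network = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(env.action_space.n, activation='softmax'),
])

reinforce = REINFORCE(env, policy_network)                 # constructor arguments assumed

reinforce.learning_rate = 0.0001
reinforce.optimizer = tf.keras.optimizers.Adam(reinforce.learning_rate,
                                               epsilon=1e-6, clipnorm=1e1)

reinforce = train(reinforce, trials=12, episodes_per_trial=26,
                  epochs_per_trial=4, batch_size=32, verbose=True)
rewards, lengths = test(reinforce, trials=2, episodes_per_trial=16,
                        deterministic=True)

As the spurl/core.py hunk below shows, test returns the mean reward and mean episode length, which is why the demos unpack two values.
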
8 changes: 4 additions & 4 deletions demos/tictactoe_sequential_self_play.py
@@ -46,7 +46,7 @@ def invert_state(self, state):
policy_network = build_policy_network(state_space,
action_space,
policy_type = 'fcn',
-layers = [128, 64, 32])
+layers = [32, 16])

# =============================================================================
# Vanilla Self-Play (Best for TicTacToe)
@@ -68,7 +68,7 @@ def invert_state(self, state):
# last tested commit a657e502c0f2dae9eb8afee3853ed8cb1885f49e

# =============================================================================
-# Fictitious Self-Play
+# Fictitious Self-Play - could warm up with vanilla self play for best results
# =============================================================================

opponents_path = r'./temp/tictactoe_ops'
@@ -88,7 +88,7 @@ def invert_state(self, state):
print(f'\nMeta Trial: {meta_trial+1} / {meta_trials}\n')
reinforce = train(reinforce, trials=1, episodes_per_trial=16, epochs_per_trial=2, batch_size=32, verbose=True)
rewards, lengths = test(reinforce, trials=1, episodes_per_trial=4, deterministic=True)
-if lengths > 6.0: # keep training longer for better performance
+if lengths > 5.0: # keep training longer for better performance
break

# =============================================================================
@@ -133,6 +133,6 @@ def opponent_sampler(self, opponents_list):
print(f'\nMeta Trial: {meta_trial+1} / {meta_trials}\n')
reinforce = train(reinforce, trials=1, episodes_per_trial=16, epochs_per_trial=2, batch_size=32, verbose=True)
rewards, lengths = test(reinforce, trials=1, episodes_per_trial=4, deterministic=True)
-if lengths > 6.0: # keep training longer for better performance
+if lengths > 5.0: # keep training longer for better performance
break
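
Both meta-loops in this demo follow the same pattern: run one short training trial, test deterministically, and stop once the mean test game gets long enough. The comment added in this commit also suggests warming up with vanilla self-play before the fictitious stage. A hedged sketch of that combination follows; meta_trials and the warm-up trial count are illustrative assumptions, while train and test follow the signatures shown in this diff.

# Sketch only; assumes `reinforce` is already configured for the relevant self-play mode.
meta_trials = 32                                   # assumed value, not from the diff

# optional warm-up: a short stretch of vanilla self-play before switching the
# agent to fictitious self-play against sampled past opponents
reinforce = train(reinforce, trials=2, episodes_per_trial=16,
                  epochs_per_trial=2, batch_size=32, verbose=True)

for meta_trial in range(meta_trials):
    print(f'\nMeta Trial: {meta_trial+1} / {meta_trials}\n')
    reinforce = train(reinforce, trials=1, episodes_per_trial=16,
                      epochs_per_trial=2, batch_size=32, verbose=True)
    rewards, lengths = test(reinforce, trials=1, episodes_per_trial=4,
                            deterministic=True)
    if lengths > 5.0:   # games no longer end in quick losses; raise or drop the
        break           # threshold to keep training longer for better performance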

2 changes: 1 addition & 1 deletion spurl/algorithms/reinforce/discrete.py
@@ -13,7 +13,7 @@ def select_action(self, state, deterministic=False):
action_probs = self.policy_network(state)

if np.isnan(action_probs).any():
-raise ValueError(f'Network outputs contain NaN: {action_probs}, {state}')
+raise ValueError(f'Network outputs contain NaN: {action_probs}')
# suggestions: reduce network size, clip grads, scale states, add regularisation

if deterministic:
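
The comment retained in this hunk lists mitigations for NaN policy outputs: reduce network size, clip gradients, scale states, and add regularisation. A minimal illustration of those suggestions using standard Keras APIs follows; the layer sizes, regularisation strength, and scaling bounds are placeholders rather than values from the repository.

import numpy as np
import tensorflow as tf

num_actions = 9                    # placeholder, e.g. one action per TicTacToe cell

# clip gradients (the demos already pass clipnorm to Adam) and keep epsilon small
optimizer = tf.keras.optimizers.Adam(1e-4, epsilon=1e-6, clipnorm=10.0)

# smaller network with L2 regularisation
policy_network = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation='relu',
                          kernel_regularizer=tf.keras.regularizers.L2(1e-4)),
    tf.keras.layers.Dense(16, activation='relu',
                          kernel_regularizer=tf.keras.regularizers.L2(1e-4)),
    tf.keras.layers.Dense(num_actions, activation='softmax'),
])

# scale states into a small, bounded range before feeding them to the network
def scale_state(state, low, high):
    return (np.asarray(state, dtype=np.float32) - low) / (high - low)
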
2 changes: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ def select_action(self, policy, state, deterministic=False):
action_probs = policy(state)

if np.isnan(action_probs).any():
-raise ValueError('Network outputs contains NaN')
+raise ValueError(f'Network outputs contain NaN: {action_probs}')
# suggestions: reduce network size, clip grads, scale states, add regularisation

if deterministic:
21 changes: 11 additions & 10 deletions spurl/core.py
@@ -24,14 +24,15 @@ def test(algorithm, trials, episodes_per_trial, deterministic=False):
print(f' Mean Episode Length: {mean_episode_length}')
return mean_rewards, mean_episode_length

-def train_self_play(algorithm, trials, episodes_per_trial, epochs_per_trial, batch_size, verbose, self_play_type):
-    for trial in range(trials):
-        print(f'\nTrial: {trial+1}/{trials}')
-        print(' Running environment')
-        if trials % algorithm.opponent_save_frequency:
-            algorithm.policy_network.save(os.path.join(algorithm.opponents_path, f'{trial}'))
-        states, actions, rewards = algorithm.run(episodes_per_trial)
-        print(' Updating policy network')
-        algorithm.update(states, actions, rewards, epochs_per_trial, batch_size, verbose=verbose)
-    return algorithm
+# saving was implemented within self play run algo
+# def train_self_play(algorithm, trials, episodes_per_trial, epochs_per_trial, batch_size, verbose, self_play_type):
+#     for trial in range(trials):
+#         print(f'\nTrial: {trial+1}/{trials}')
+#         print(' Running environment')
+#         if trials % algorithm.opponent_save_frequency:
+#             algorithm.policy_network.save(os.path.join(algorithm.opponents_path, f'{trial}'))
+#         states, actions, rewards = algorithm.run(episodes_per_trial)
+#         print(' Updating policy network')
+#         algorithm.update(states, actions, rewards, epochs_per_trial, batch_size, verbose=verbose)
+#     return algorithm

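Note that the removed helper gated saving on trials % algorithm.opponent_save_frequency, i.e. on the constant trial count rather than the loop index, so it would save either every trial or never; the new comment says saving now happens inside the self-play run itself. For reference, a conventional periodic-checkpoint pattern looks like the sketch below, where run_one_trial is a hypothetical wrapper and the attribute names follow the removed code.

import os

def run_one_trial(algorithm, trial, episodes_per_trial):
    """Hypothetical wrapper: snapshot the current policy every N-th trial,
    then gather one trial's worth of self-play experience."""
    if trial % algorithm.opponent_save_frequency == 0:
        algorithm.policy_network.save(os.path.join(algorithm.opponents_path, f'{trial}'))
    return algorithm.run(episodes_per_trial)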
