
Commit cb6bc60
Issue #10: adjust demo hyperparameters
s-sd committed Aug 18, 2023
1 parent 18145cd commit cb6bc60
Showing 5 changed files with 18 additions and 17 deletions.
2 changes: 1 addition & 1 deletion demos/cartpole_single_agent.py
@@ -40,7 +40,7 @@
reinforce.learning_rate = 0.0001
reinforce.optimizer = tf.keras.optimizers.Adam(reinforce.learning_rate, epsilon=1e-6, clipnorm=1e1)

-reinforce = train(reinforce, trials=10, episodes_per_trial=26, epochs_per_trial=4, batch_size=32, verbose=True)
+reinforce = train(reinforce, trials=12, episodes_per_trial=26, epochs_per_trial=4, batch_size=32, verbose=True)

rewards, lengths = test(reinforce, trials=2, episodes_per_trial=16, deterministic=True)

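For orientation, the changed line above sits in the single-agent CartPole demo, which wires a Keras policy network and optimizer into spurl's train/test helpers. The sketch below is illustrative only: the gymnasium environment, the spurl import paths, the agent class name and constructor, and the plain Keras model standing in for the demo's build_policy_network helper are all assumptions; the hyperparameters mirror the diff.

# Hedged sketch, not the repository's demo code. Names marked "assumed" are not
# confirmed by this commit.
import gymnasium as gym                                    # assumed environment backend
import tensorflow as tf

from spurl.core import train, test                         # test() appears in spurl/core.py in this diff; train is assumed to sit beside it
from spurl.algorithms.reinforce.discrete import REINFORCE  # class name assumed; the module path appears in this diff

env = gym.make('CartPole-v1')                              # assumed; matches the demo's name

# plain Keras model standing in for the demo's build_policy_network helper
policy_network = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(env.action_space.n, activation='softmax'),
])

reinforce = REINFORCE(env, policy_network)                 # constructor arguments assumed

reinforce.learning_rate = 0.0001
reinforce.optimizer = tf.keras.optimizers.Adam(reinforce.learning_rate,
                                               epsilon=1e-6, clipnorm=1e1)

reinforce = train(reinforce, trials=12, episodes_per_trial=26,
                  epochs_per_trial=4, batch_size=32, verbose=True)
rewards, lengths = test(reinforce, trials=2, episodes_per_trial=16,
                        deterministic=True)

As the spurl/core.py hunk below shows, test returns the mean reward and mean episode length, which is why the demos unpack two values.
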
8 changes: 4 additions & 4 deletions demos/tictactoe_sequential_self_play.py
@@ -46,7 +46,7 @@ def invert_state(self, state):
policy_network = build_policy_network(state_space,
action_space,
policy_type = 'fcn',
-layers = [128, 64, 32])
+layers = [32, 16])

# =============================================================================
# Vanilla Self-Play (Best for TicTacToe)
@@ -68,7 +68,7 @@ def invert_state(self, state):
# last tested commit a657e502c0f2dae9eb8afee3853ed8cb1885f49e

# =============================================================================
-# Fictitious Self-Play
+# Fictitious Self-Play - could warm up with vanilla self play for best results
# =============================================================================

opponents_path = r'./temp/tictactoe_ops'
@@ -88,7 +88,7 @@ def invert_state(self, state):
print(f'\nMeta Trial: {meta_trial+1} / {meta_trials}\n')
reinforce = train(reinforce, trials=1, episodes_per_trial=16, epochs_per_trial=2, batch_size=32, verbose=True)
rewards, lengths = test(reinforce, trials=1, episodes_per_trial=4, deterministic=True)
-if lengths > 6.0: # keep training longer for better performance
+if lengths > 5.0: # keep training longer for better performance
break

# =============================================================================
@@ -133,6 +133,6 @@ def opponent_sampler(self, opponents_list):
print(f'\nMeta Trial: {meta_trial+1} / {meta_trials}\n')
reinforce = train(reinforce, trials=1, episodes_per_trial=16, epochs_per_trial=2, batch_size=32, verbose=True)
rewards, lengths = test(reinforce, trials=1, episodes_per_trial=4, deterministic=True)
-if lengths > 6.0: # keep training longer for better performance
+if lengths > 5.0: # keep training longer for better performance
break
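
Both meta-loops in this demo follow the same pattern: run one short training trial, test deterministically, and stop once the mean test game gets long enough. The comment added in this commit also suggests warming up with vanilla self-play before the fictitious stage. A hedged sketch of that combination follows; meta_trials and the warm-up trial count are illustrative assumptions, while train and test follow the signatures shown in this diff.

# Sketch only; assumes `reinforce` is already configured for the relevant self-play mode.
meta_trials = 32                                   # assumed value, not from the diff

# optional warm-up: a short stretch of vanilla self-play before switching the
# agent to fictitious self-play against sampled past opponents
reinforce = train(reinforce, trials=2, episodes_per_trial=16,
                  epochs_per_trial=2, batch_size=32, verbose=True)

for meta_trial in range(meta_trials):
    print(f'\nMeta Trial: {meta_trial+1} / {meta_trials}\n')
    reinforce = train(reinforce, trials=1, episodes_per_trial=16,
                      epochs_per_trial=2, batch_size=32, verbose=True)
    rewards, lengths = test(reinforce, trials=1, episodes_per_trial=4,
                            deterministic=True)
    if lengths > 5.0:   # games no longer end in quick losses; raise or drop the
        break           # threshold to keep training longer for better performance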

2 changes: 1 addition & 1 deletion spurl/algorithms/reinforce/discrete.py
@@ -13,7 +13,7 @@ def select_action(self, state, deterministic=False):
action_probs = self.policy_network(state)

if np.isnan(action_probs).any():
-raise ValueError(f'Network outputs contain NaN: {action_probs}, {state}')
+raise ValueError(f'Network outputs contain NaN: {action_probs}')
# suggestions: reduce network size, clip grads, scale states, add regularisation

if deterministic:
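
The comment retained in this hunk lists mitigations for NaN policy outputs: reduce network size, clip gradients, scale states, and add regularisation. A minimal illustration of those suggestions using standard Keras APIs follows; the layer sizes, regularisation strength, and scaling bounds are placeholders rather than values from the repository.

import numpy as np
import tensorflow as tf

num_actions = 9                    # placeholder, e.g. one action per TicTacToe cell

# clip gradients (the demos already pass clipnorm to Adam) and keep epsilon small
optimizer = tf.keras.optimizers.Adam(1e-4, epsilon=1e-6, clipnorm=10.0)

# smaller network with L2 regularisation
policy_network = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation='relu',
                          kernel_regularizer=tf.keras.regularizers.L2(1e-4)),
    tf.keras.layers.Dense(16, activation='relu',
                          kernel_regularizer=tf.keras.regularizers.L2(1e-4)),
    tf.keras.layers.Dense(num_actions, activation='softmax'),
])

# scale states into a small, bounded range before feeding them to the network
def scale_state(state, low, high):
    return (np.asarray(state, dtype=np.float32) - low) / (high - low)
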
2 changes: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ def select_action(self, policy, state, deterministic=False):
action_probs = policy(state)

if np.isnan(action_probs).any():
-raise ValueError('Network outputs contains NaN')
+raise ValueError(f'Network outputs contain NaN: {action_probs}')
# suggestions: reduce network size, clip grads, scale states, add regularisation

if deterministic:
21 changes: 11 additions & 10 deletions spurl/core.py
@@ -24,14 +24,15 @@ def test(algorithm, trials, episodes_per_trial, deterministic=False):
print(f' Mean Episode Length: {mean_episode_length}')
return mean_rewards, mean_episode_length

-def train_self_play(algorithm, trials, episodes_per_trial, epochs_per_trial, batch_size, verbose, self_play_type):
-    for trial in range(trials):
-        print(f'\nTrial: {trial+1}/{trials}')
-        print(' Running environment')
-        if trials % algorithm.opponent_save_frequency:
-            algorithm.policy_network.save(os.path.join(algorithm.opponents_path, f'{trial}'))
-        states, actions, rewards = algorithm.run(episodes_per_trial)
-        print(' Updating policy network')
-        algorithm.update(states, actions, rewards, epochs_per_trial, batch_size, verbose=verbose)
-    return algorithm
+# saving was implemented within self play run algo
+# def train_self_play(algorithm, trials, episodes_per_trial, epochs_per_trial, batch_size, verbose, self_play_type):
+#     for trial in range(trials):
+#         print(f'\nTrial: {trial+1}/{trials}')
+#         print(' Running environment')
+#         if trials % algorithm.opponent_save_frequency:
+#             algorithm.policy_network.save(os.path.join(algorithm.opponents_path, f'{trial}'))
+#         states, actions, rewards = algorithm.run(episodes_per_trial)
+#         print(' Updating policy network')
+#         algorithm.update(states, actions, rewards, epochs_per_trial, batch_size, verbose=verbose)
+#     return algorithm

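Note that the removed helper gated saving on trials % algorithm.opponent_save_frequency, i.e. on the constant trial count rather than the loop index, so it would save either every trial or never; the new comment says saving now happens inside the self-play run itself. For reference, a conventional periodic-checkpoint pattern looks like the sketch below, where run_one_trial is a hypothetical wrapper and the attribute names follow the removed code.

import os

def run_one_trial(algorithm, trial, episodes_per_trial):
    """Hypothetical wrapper: snapshot the current policy every N-th trial,
    then gather one trial's worth of self-play experience."""
    if trial % algorithm.opponent_save_frequency == 0:
        algorithm.policy_network.save(os.path.join(algorithm.opponents_path, f'{trial}'))
    return algorithm.run(episodes_per_trial)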
