diff --git a/demos/cartpole_single_agent.py b/demos/cartpole_single_agent.py
index d6a81df..67df534 100644
--- a/demos/cartpole_single_agent.py
+++ b/demos/cartpole_single_agent.py
@@ -10,31 +10,26 @@
 tf.random.set_seed(42)
 np.random.seed(42)
 
-<<<<<<< HEAD
-=======
-def build_policy_network(state_shape, num_actions):
-    inputs = tf.keras.layers.Input(shape=state_shape)
-    flat = tf.keras.layers.Flatten()(inputs)
-    dense1 = tf.keras.layers.Dense(64, activation='relu')(flat)
-    dropout1 = tf.keras.layers.Dropout(0.4)(dense1)
-    dense2 = tf.keras.layers.Dense(32, activation='relu')(dropout1)
-    dropout2 = tf.keras.layers.Dropout(0.4)(dense2)
-    dense3 = tf.keras.layers.Dense(14, activation='relu')(dropout2)
-    dense4 = tf.keras.layers.Dense(num_actions, activation='softmax')(dense3)
-    policy_network = tf.keras.Model(inputs=inputs, outputs=dense4)
-    return policy_network
-
->>>>>>> main
+# def build_policy_network(state_shape, num_actions):
+#     inputs = tf.keras.layers.Input(shape=state_shape)
+#     flat = tf.keras.layers.Flatten()(inputs)
+#     dense1 = tf.keras.layers.Dense(64, activation='relu')(flat)
+#     dropout1 = tf.keras.layers.Dropout(0.4)(dense1)
+#     dense2 = tf.keras.layers.Dense(32, activation='relu')(dropout1)
+#     dropout2 = tf.keras.layers.Dropout(0.4)(dense2)
+#     dense3 = tf.keras.layers.Dense(14, activation='relu')(dropout2)
+#     dense4 = tf.keras.layers.Dense(num_actions, activation='softmax')(dense3)
+#     policy_network = tf.keras.Model(inputs=inputs, outputs=dense4)
+#     return policy_network
+
 env = gym.make('CartPole-v1')
-state_shape = env.observation_space.shape
+state_space = env.observation_space
 action_space = env.action_space
-num_actions = action_space.n
 
 # Build policy network
-policy_network = build_policy_network(state_shape,
-                                      action_size = num_actions,
-                                      action_space = action_space,
+policy_network = build_policy_network(state_space,
+                                      action_space,
                                       policy_type = 'fcn',
                                       layers = [64, 32, 14])
diff --git a/demos/dummy_single_agent_continuous.py b/demos/dummy_single_agent_continuous.py
index f935bdd..8580a37 100644
--- a/demos/dummy_single_agent_continuous.py
+++ b/demos/dummy_single_agent_continuous.py
@@ -39,11 +39,10 @@ def reset(self):
 env = DummyEnv(action_size)
 action_space = env.action_space
-state_shape = env.observation_space.shape
+state_space = env.observation_space
 
-policy_network = build_policy_network(state_shape,
-                                      action_size = action_size,
-                                      action_space = action_space,
+policy_network = build_policy_network(state_space,
+                                      action_space,
                                       policy_type = 'fcn',
                                       layers = [128],
                                       activation_fn = 'linear')
diff --git a/demos/dummy_single_agent_discrete.py b/demos/dummy_single_agent_discrete.py
index 299f987..057531c 100644
--- a/demos/dummy_single_agent_discrete.py
+++ b/demos/dummy_single_agent_discrete.py
@@ -38,11 +38,10 @@ def reset(self):
 env = DummyEnv(num_actions)
 
 # Building policy network
-state_shape = env.observation_space.shape
+state_space = env.observation_space
 action_space = env.action_space
 
-policy_network = build_policy_network(state_shape,
-                                      action_size = num_actions,
-                                      action_space = action_space,
+policy_network = build_policy_network(state_space,
+                                      action_space,
                                       policy_type = 'fcn',
                                       layers = [128])
diff --git a/demos/pendulum_single_agent.py b/demos/pendulum_single_agent.py
index 5b957fd..7af2b88 100644
--- a/demos/pendulum_single_agent.py
+++ b/demos/pendulum_single_agent.py
@@ -10,33 +10,28 @@
 tf.random.set_seed(42)
 np.random.seed(42)
 
-<<<<<<< HEAD
-=======
-def build_policy_network(state_shape, action_size):
-    inputs = tf.keras.layers.Input(shape=state_shape)
-    flat = tf.keras.layers.Flatten()(inputs)
-    dense1 = tf.keras.layers.Dense(128, activation='relu')(flat)
-    dropout1 = tf.keras.layers.Dropout(0.4)(dense1)
-    dense2 = tf.keras.layers.Dense(64, activation='relu')(dropout1)
-    dropout2 = tf.keras.layers.Dropout(0.4)(dense2)
-    dense3 = tf.keras.layers.Dense(32, activation='relu')(dropout2)
-    dense4 = tf.keras.layers.Dense(np.prod(action_size), activation='tanh')(dense3)
+# def build_policy_network(state_shape, action_size):
+#     inputs = tf.keras.layers.Input(shape=state_shape)
+#     flat = tf.keras.layers.Flatten()(inputs)
+#     dense1 = tf.keras.layers.Dense(128, activation='relu')(flat)
+#     dropout1 = tf.keras.layers.Dropout(0.4)(dense1)
+#     dense2 = tf.keras.layers.Dense(64, activation='relu')(dropout1)
+#     dropout2 = tf.keras.layers.Dropout(0.4)(dense2)
+#     dense3 = tf.keras.layers.Dense(32, activation='relu')(dropout2)
+#     dense4 = tf.keras.layers.Dense(np.prod(action_size), activation='tanh')(dense3)
 
-    scaled_outputs = tf.keras.layers.Lambda(lambda x: (x + 1) * 2 - 2)(dense4)  # scale to action space
+#     scaled_outputs = tf.keras.layers.Lambda(lambda x: (x + 1) * 2 - 2)(dense4)  # scale to action space
 
-    policy_network = tf.keras.Model(inputs=inputs, outputs=scaled_outputs)
-    return policy_network
+#     policy_network = tf.keras.Model(inputs=inputs, outputs=scaled_outputs)
+#     return policy_network
 
->>>>>>> main
 env = gym.make("Pendulum-v1")
 action_space = env.action_space
-action_size = env.action_space.shape
-state_shape = env.observation_space.shape
+state_space = env.observation_space
 
-policy_network = build_policy_network(state_shape,
-                                      action_size = action_size,
-                                      action_space = action_space,
+policy_network = build_policy_network(state_space,
+                                      action_space,
                                       policy_type = 'fcn',
                                       layers = [128, 64, 32],
                                       activation_fn = 'tanh')
diff --git a/demos/tictactoe_sequential_selfplay.py b/demos/tictactoe_sequential_selfplay.py
index 95839e9..e472a3d 100644
--- a/demos/tictactoe_sequential_selfplay.py
+++ b/demos/tictactoe_sequential_selfplay.py
@@ -18,21 +18,18 @@
 tf.random.set_seed(42)
 np.random.seed(42)
 
-<<<<<<< HEAD
-=======
-def build_policy_network(state_shape, action_size):
-    inputs = tf.keras.layers.Input(shape=state_shape)
-    flat = tf.keras.layers.Flatten()(inputs)
-    dense1 = tf.keras.layers.Dense(128, activation='relu')(flat)
-    dropout1 = tf.keras.layers.Dropout(0.4)(dense1)
-    dense2 = tf.keras.layers.Dense(64, activation='relu')(dropout1);
-    dropout2 = tf.keras.layers.Dropout(0.4)(dense2)
-    dense3 = tf.keras.layers.Dense(32, activation='relu')(dropout2)
-    dense4 = tf.keras.layers.Dense(np.prod(action_size), activation='softmax')(dense3)
-    policy_network = tf.keras.Model(inputs=inputs, outputs=dense4)
-    return policy_network
-
->>>>>>> main
+# def build_policy_network(state_shape, action_size):
+#     inputs = tf.keras.layers.Input(shape=state_shape)
+#     flat = tf.keras.layers.Flatten()(inputs)
+#     dense1 = tf.keras.layers.Dense(128, activation='relu')(flat)
+#     dropout1 = tf.keras.layers.Dropout(0.4)(dense1)
+#     dense2 = tf.keras.layers.Dense(64, activation='relu')(dropout1);
+#     dropout2 = tf.keras.layers.Dropout(0.4)(dense2)
+#     dense3 = tf.keras.layers.Dense(32, activation='relu')(dropout2)
+#     dense4 = tf.keras.layers.Dense(np.prod(action_size), activation='softmax')(dense3)
+#     policy_network = tf.keras.Model(inputs=inputs, outputs=dense4)
+#     return policy_network
+
 env = TicTacToeEnv()
 
 class REINFORCE_TicTacToe(REINFORCE):
@@ -43,13 +40,11 @@ def invert_state(self, state):
         state[:, :, 0] *= -1
         return state
 
-state_shape = env.observation_space.shape
+state_space = env.observation_space
 action_space = env.action_space
-num_actions = (action_space.n,)
 
-policy_network = build_policy_network(state_shape,
-                                      action_size = num_actions,
-                                      action_space = action_space,
+policy_network = build_policy_network(state_space,
+                                      action_space,
                                       policy_type = 'fcn',
                                       layers = [128, 64, 32])
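
Reviewer note (not part of the patch): all five demos now call a shared build_policy_network helper with the same positional arguments (observation space, action space) and keyword arguments describing the architecture. A minimal sketch of that call convention follows; the import path and keyword defaults are assumptions for illustration only and should be replaced with the repo's actual module that defines build_policy_network.

import gym
# Assumed import path, not confirmed by this diff.
from networks import build_policy_network

env = gym.make('CartPole-v1')

# Convention adopted by this patch: pass the full spaces (not .shape / .n) and
# describe the network via keywords instead of hand-building a Keras model.
policy_network = build_policy_network(env.observation_space,
                                      env.action_space,
                                      policy_type = 'fcn',
                                      layers = [64, 32, 14])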