a3c.py
import gym
import os
import tensorflow as tf
import core
from scipy.misc import imresize
from tensorboardX import SummaryWriter


class A3CNetwork(object):
    def __init__(self, name, input_shape, output_dim, logdir=None):
        """Network structure is defined here

        Args:
            name (str): The name of the variable scope
            input_shape (list): The shape of the input image [H, W, C]
            output_dim (int): Number of actions
            logdir (str, optional): Directory to save summaries

        TODO: create a summary op
        """
        with tf.variable_scope(name):
            # Placeholders for a batch of transitions from a worker rollout
            self.states = tf.placeholder(tf.float32, shape=[None, *input_shape], name="states")
            self.actions = tf.placeholder(tf.uint8, shape=[None], name="actions")
            self.rewards = tf.placeholder(tf.float32, shape=[None], name="rewards")
            self.advantage = tf.placeholder(tf.float32, shape=[None], name="advantage")

            action_onehot = tf.one_hot(self.actions, output_dim, name="action_onehot")

            # Policy distribution and state-value estimates from the CNN in core.cnn_model
            self.action_prob, self.values = core.cnn_model(self.states, output_dim, tf.nn.relu, tf.nn.softmax)

            # Probability of the action that was actually taken, clipped for a stable log
            single_action_prob = tf.reduce_sum(self.action_prob * action_onehot, axis=1)
            clip_single_action_prob = tf.clip_by_value(single_action_prob, 1e-7, 1.0)

            # Policy entropy, used as an exploration bonus
            entropy = -self.action_prob * tf.log(self.action_prob + 1e-7)
            entropy = tf.reduce_sum(entropy, axis=1)

            # Actor loss: maximize log pi(a|s) * advantage plus the entropy bonus
            log_action_prob = tf.log(clip_single_action_prob)
            maximize_objective = log_action_prob * self.advantage + entropy * 0.01
            self.actor_loss = -tf.reduce_mean(maximize_objective)

            # Critic loss: mean squared error between the value estimates and self.rewards
            self.value_loss = tf.losses.mean_squared_error(labels=self.rewards, predictions=self.values)

            # tensorboardX diagnostics
            self.entropy = tf.reduce_mean(entropy)
            self.pi_loss = tf.reduce_mean(log_action_prob * self.advantage)
            self.mean_value_loss = tf.reduce_mean(self.value_loss)

            # optimization
            self.total_loss = self.actor_loss + self.value_loss * .5
            self.optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
            var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name)
            self.gradients = self.optimizer.compute_gradients(self.total_loss, var_list)

            # Gradients are fed in from the outside (e.g. from a worker network),
            # so build one placeholder per trainable variable
            self.gradients_placeholders = []
            for grad, var in self.gradients:
                self.gradients_placeholders.append((tf.placeholder(var.dtype, shape=var.get_shape()), var))
            self.apply_gradients = self.optimizer.apply_gradients(self.gradients_placeholders)
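

# --- Usage sketch (illustration only; not part of the original file) --------
# A3C keeps one shared "global" network plus one network per worker. Each
# worker computes gradients on its own rollout and pushes them into the
# global network through the gradient placeholders built above. Everything
# below is an assumption-laden sketch: the [84, 84, 4] input shape, the
# 4-action space, the dummy data, and the "logs/sketch" path are made up,
# and core.cnn_model must accept inputs of that shape for this to run.
if __name__ == "__main__":
    import numpy as np

    input_shape = [84, 84, 4]
    output_dim = 4

    global_net = A3CNetwork("global", input_shape, output_dim)
    worker_net = A3CNetwork("worker_0", input_shape, output_dim)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Dummy rollout of 5 transitions, just to exercise the ops.
        # (A real worker would first sync its weights from the global network.)
        states = np.random.rand(5, *input_shape).astype(np.float32)
        actions = np.random.randint(output_dim, size=5)
        returns = np.random.rand(5).astype(np.float32)      # fed into `rewards`
        advantages = np.random.rand(5).astype(np.float32)

        feed = {worker_net.states: states,
                worker_net.actions: actions,
                worker_net.rewards: returns,
                worker_net.advantage: advantages}

        # 1) Compute the worker's gradients on its own batch.
        grads = sess.run([grad for grad, _ in worker_net.gradients], feed_dict=feed)

        # 2) Feed those gradients into the global network's placeholders and
        #    apply them to the shared parameters. This assumes both networks
        #    list their trainable variables in the same (creation) order.
        grad_feed = {placeholder: grad
                     for (placeholder, _), grad in zip(global_net.gradients_placeholders, grads)}
        sess.run(global_net.apply_gradients, feed_dict=grad_feed)

        # 3) Log the diagnostics the class exposes, via tensorboardX.
        writer = SummaryWriter("logs/sketch")
        entropy, pi_loss, v_loss = sess.run(
            [worker_net.entropy, worker_net.pi_loss, worker_net.mean_value_loss],
            feed_dict=feed)
        writer.add_scalar("entropy", entropy, 0)
        writer.add_scalar("pi_loss", pi_loss, 0)
        writer.add_scalar("value_loss", v_loss, 0)
        writer.close()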