#coding=utf-8
import tensorflow as tf


def average_gradients(tower_grads):
    """Calculate the average gradient for each shared variable across all towers.

    Note that this function provides a synchronization point across all towers.

    Args:
        tower_grads: List of lists of (gradient, variable) tuples. The outer
            list is over individual towers; the inner list is over the
            (gradient, variable) pairs computed by that tower, one per variable.

    Returns:
        List of (gradient, variable) pairs where the gradient has been averaged
        across all towers.
    """
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # Each grad_and_vars looks like:
        #   ((grad0_gpu0, var0_gpu0), ..., (grad0_gpuN, var0_gpuN))
        grads = []
        for g, _ in grad_and_vars:
            # Add a leading 'tower' dimension to the gradient so the per-tower
            # gradients can be stacked and averaged below.
            expanded_g = tf.expand_dims(g, 0)
            grads.append(expanded_g)

        # Average over the 'tower' dimension.
        # (Note: tf.concat(axis, values) is the pre-TF-1.0 argument order.)
        grad = tf.concat(0, grads)
        grad = tf.reduce_mean(grad, 0)

        # The variables are redundant because they are shared across towers,
        # so just return the first tower's pointer to the variable.
        v = grad_and_vars[0][1]
        average_grads.append((grad, v))
    return average_grads


def tower_loss(cnn, input_x, input_y):
    """Build the forward pass for one tower and return its logits, total loss
    and accuracy ops."""
    logits = cnn.inference(input_x)
    total_loss = cnn.loss(logits, input_y)
    accuracy = cnn.accuracy(logits, input_y)
    return logits, total_loss, accuracy
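

# Usage sketch (an illustration, not the repo's actual training code): how
# tower_loss and average_gradients are typically combined for multi-GPU
# training. The `cnn` object with inference()/loss()/accuracy() methods, the
# per-GPU input lists and the Adam optimizer are assumptions for this example.
def _example_multi_gpu_train_op(cnn, inputs_x_per_gpu, inputs_y_per_gpu, learning_rate=1e-3):
    opt = tf.train.AdamOptimizer(learning_rate)
    tower_grads = []
    for i, (x, y) in enumerate(zip(inputs_x_per_gpu, inputs_y_per_gpu)):
        with tf.device("/gpu:%d" % i):
            # Build the forward pass and loss for this tower.
            _, loss, _ = tower_loss(cnn, x, y)
            # Reuse the model variables for the remaining towers.
            tf.get_variable_scope().reuse_variables()
            # compute_gradients returns the per-tower list of
            # (gradient, variable) pairs that average_gradients expects.
            tower_grads.append(opt.compute_gradients(loss))
    # Synchronization point: average the gradients variable-by-variable.
    grads = average_gradients(tower_grads)
    return opt.apply_gradients(grads)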


def cal_predictions(cnn, input_x):
    """Return the predicted class indices (argmax over the logits) for a batch."""
    logits = cnn.inference(input_x)
    predictions = tf.argmax(logits, 1, name="predictions")
    return predictions
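

# Usage sketch (illustrative only): wiring cal_predictions to a session for
# inference. `sess`, `cnn`, the `input_x` placeholder and `x_batch` are
# hypothetical names assumed for this example.
def _example_predict(sess, cnn, input_x, x_batch):
    predictions = cal_predictions(cnn, input_x)
    return sess.run(predictions, feed_dict={input_x: x_batch})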


def attention(inputs, attention_size, l2_reg_lambda):
    """
    Attention mechanism layer.

    :param inputs: outputs of an RNN/Bi-RNN layer (not the final state)
    :param attention_size: linear size of the attention weights
    :param l2_reg_lambda: L2 regularization strength for the attention weights
        (currently unused; see the disabled block at the end of this function)
    :return: outputs of the passed RNN/Bi-RNN reduced with the attention vector
    """
    # For Bi-RNN input, concatenate the outputs of the forward and backward parts.
    # (Note: tf.concat(axis, values) is the pre-TF-1.0 argument order.)
    if isinstance(inputs, tuple):
        inputs = tf.concat(2, inputs)

    sequence_length = inputs.get_shape()[1].value  # length of sequences processed by the preceding RNN layer
    hidden_size = inputs.get_shape()[2].value      # hidden size of the RNN layer

    # Attention mechanism: score each timestep with a small one-layer network,
    # normalize the scores with a softmax, and take the weighted sum over time.
    W_omega = tf.get_variable("W_omega", initializer=tf.random_normal([hidden_size, attention_size], stddev=0.1))
    b_omega = tf.get_variable("b_omega", initializer=tf.random_normal([attention_size], stddev=0.1))
    u_omega = tf.get_variable("u_omega", initializer=tf.random_normal([attention_size], stddev=0.1))

    v = tf.tanh(tf.matmul(tf.reshape(inputs, [-1, hidden_size]), W_omega) + tf.reshape(b_omega, [1, -1]))
    vu = tf.matmul(v, tf.reshape(u_omega, [-1, 1]))
    exps = tf.reshape(tf.exp(vu), [-1, sequence_length])
    alphas = exps / tf.reshape(tf.reduce_sum(exps, 1), [-1, 1])

    # The RNN/Bi-RNN output is reduced with the attention vector.
    output = tf.reduce_sum(inputs * tf.reshape(alphas, [-1, sequence_length, 1]), 1)

    # L2 regularization of the attention parameters is currently disabled; if
    # re-enabled, an l2_loss accumulator would also need to be defined.
    # if l2_reg_lambda > 0:
    #     l2_loss += tf.nn.l2_loss(W_omega)
    #     l2_loss += tf.nn.l2_loss(b_omega)
    #     l2_loss += tf.nn.l2_loss(u_omega)
    #     tf.add_to_collection('losses', l2_loss)
    return output
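

# Usage sketch (illustrative only): attention-pooling a batch of RNN outputs.
# The scope name "attention_example" and the default attention_size are
# assumptions; `rnn_outputs` must have a statically known sequence length and
# hidden size, because attention() reads both via get_shape().
def _example_attention_pooling(rnn_outputs, attention_size=64):
    # rnn_outputs: a [batch, sequence_length, hidden_size] tensor, or a
    # (forward, backward) tuple of such tensors from a bidirectional RNN.
    with tf.variable_scope("attention_example"):
        # Result shape: [batch, hidden_size] ([batch, 2 * hidden_size] for the
        # Bi-RNN tuple case), i.e. the timestep outputs weighted by the learned
        # attention distribution and summed over time.
        return attention(rnn_outputs, attention_size, l2_reg_lambda=0.0)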