__author__ = 'gpfinley'
"""
idea: train what is effectively a constituency parser using bi-directional LSTM that learns the boundaries of phrases
(ends going forward, beginnings going backward)
still need: data source (and munging code; big), batching function (small), grad descent step (small), and decoder (big)
"""
import numpy as np
import tensorflow as tf
from read_word2vec import get_dictionary
LSTM_UNITS = 100
BATCH_SIZE = 100
NUM_ITERATIONS = 10000000
VECTORS_PATH = "wiki-vectors_lc.bin"
# maximum number of words per sentence
maxlen = 30
# EMBEDDINGS
embeddings_dic = get_dictionary(VECTORS_PATH)
words, embeddings = zip(*embeddings_dic.items())
n_words = len(words)
embeddings_dimensionality = len(embeddings[0])
# "map" word indices to embeddings (perform mapping by multiplying a one-hot matrix by this matrix)
embeddings_matrix = np.array(embeddings, dtype=np.float32)  # float32 so it matches the one-hot tensor below
embeddings = None
# map words to integers
vocab = {word:i for (i, word) in enumerate(words)}
vocab_size = len(words)
# todo: load treebank data and convert (inputs to lists of word indices; outputs to n-hot phrase type vectors);
#       a rough per-sentence conversion sketch follows just below
# (or load embeddings directly rather than training on them!)
# number of types of phrases to detect
n_classes = 12
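
# A rough sketch (not the actual munging code) of converting one parsed sentence into the arrays used below.
# It assumes a hypothetical input format of tokens plus labeled (start, end, label) phrase spans and a
# hypothetical label->index map `phrase_index`; unknown words are mapped to index 0 here.
def sentence_to_arrays(tokens, spans, phrase_index):
    word_ids = np.zeros(maxlen, dtype=np.int32)
    classes_f = np.zeros((maxlen, n_classes), dtype=np.float32)  # phrase types ending at each token
    classes_b = np.zeros((maxlen, n_classes), dtype=np.float32)  # phrase types beginning at each token
    for i, token in enumerate(tokens[:maxlen]):
        word_ids[i] = vocab.get(token, 0)
    for start, end, label in spans:
        if end - 1 < maxlen:
            classes_f[end - 1, phrase_index[label]] = 1.0
        if start < maxlen:
            classes_b[start, phrase_index[label]] = 1.0
    # reverse the backward targets so they line up with the time-reversed inputs fed to the backward LSTM
    return word_ids, classes_f, classes_b[::-1]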
# num_sentences x max_seq_len; values in range(0, vocab_size)
this_batch_dense = tf.placeholder(dtype=tf.int32, shape=(None, maxlen))
# num_sentences x max_seq_len x vocab_size; one-hot
this_batch_onehot = tf.one_hot(this_batch_dense, depth=vocab_size)
# num_sentences x max_seq_len x embeddings_dimensionality
this_batch_embeddings = tf.einsum("abi,ic->abc", this_batch_onehot, embeddings_matrix)
# reverse this batch in time (along the sequence axis) for the backward pass
this_batch_backwards_embeddings = tf.reverse(this_batch_embeddings, axis=[1])
# per-token n-hot targets: which phrase types end (forward) / begin (backward) at each position;
# the backward targets are expected in reversed time order to match the reversed inputs
this_batch_true_classes_f = tf.placeholder(dtype=tf.float32, shape=(None, maxlen, n_classes))
this_batch_true_classes_b = tf.placeholder(dtype=tf.float32, shape=(None, maxlen, n_classes))
# separate cells so the forward and backward passes learn their own weights
lstm_cell_f = tf.nn.rnn_cell.LSTMCell(num_units=LSTM_UNITS)
lstm_cell_b = tf.nn.rnn_cell.LSTMCell(num_units=LSTM_UNITS)
forward_rnn, _ = tf.nn.dynamic_rnn(cell=lstm_cell_f, inputs=this_batch_embeddings, dtype=tf.float32, scope="forward_rnn")
backward_rnn, _ = tf.nn.dynamic_rnn(cell=lstm_cell_b, inputs=this_batch_backwards_embeddings, dtype=tf.float32, scope="backward_rnn")
# project lstm outputs to classes of phrase types for the forward pass
W_f = tf.Variable(initial_value=np.random.random(size=(LSTM_UNITS, n_classes)).astype(np.float32))
b_f = tf.Variable(initial_value=np.random.random(size=(1, n_classes)).astype(np.float32))
# ...for the backward pass
W_b = tf.Variable(initial_value=np.random.random(size=(LSTM_UNITS, n_classes)).astype(np.float32))
b_b = tf.Variable(initial_value=np.random.random(size=(1, n_classes)).astype(np.float32))
# per-token logits: batch x maxlen x n_classes
class_layer_f = tf.einsum("btu,uc->btc", forward_rnn, W_f) + b_f
class_layer_b = tf.einsum("btu,uc->btc", backward_rnn, W_b) + b_b
# sigmoid_cross_entropy_with_logits applies an element-wise sigmoid to the logits and scores each class
# independently, so n-hot (multi-label) targets on the output are exactly what it expects
loss_f = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=this_batch_true_classes_f, logits=class_layer_f))
loss_b = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=this_batch_true_classes_b, logits=class_layer_b))
# gradient descent step (one possible choice: Adam over the summed forward/backward losses)
update_step = tf.train.AdamOptimizer().minimize(loss_f + loss_b)
# todo: batch and feed data into graph
# denser representation: matrix: #_sentences x max_length (and values in domain of vocab_size)
# (hopefully this can hold most or all of the data in memory?)
# use tf.one_hot to get: representation of input data: 3d tensor: #_sentences x max_length x vocab_size
# matmul step to get embeddings by matrix: vocab_size x embedding_length
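
# a minimal batching sketch, assuming the whole corpus has already been converted into three arrays
# (all_dense, all_classes_f, all_classes_b) sharing a leading num_sentences axis; these names are hypothetical
def next_batch(all_dense, all_classes_f, all_classes_b):
    idx = np.random.randint(0, len(all_dense), size=BATCH_SIZE)
    return all_dense[idx], all_classes_f[idx], all_classes_b[idx]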
with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    for _ in range(NUM_ITERATIONS):
        # todo: get next batch (dense matrix of words; forward classes; backward classes)
        feed_dict = {this_batch_dense: next_batch_dense,
                     this_batch_true_classes_f: next_batch_forward_classes,
                     this_batch_true_classes_b: next_batch_backward_classes}
        val_loss_f, val_loss_b, _ = session.run([loss_f, loss_b, update_step], feed_dict=feed_dict)
# todo: decode and score
# how will decoding work? will be tricky with multiple (or zero) classes. todo: invent an algorithm for that?
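
# One naive possibility (an assumption, not a worked-out algorithm): threshold the per-token sigmoid
# probabilities for each phrase type, then greedily pair every predicted beginning with the nearest
# predicted end at or after it. `probs_begin` / `probs_end` are hypothetical arrays for a single sentence,
# shape (maxlen, n_classes), both in natural (unreversed) time order.
def greedy_decode(probs_begin, probs_end, threshold=0.5):
    spans = []
    for c in range(n_classes):
        begins = np.where(probs_begin[:, c] > threshold)[0]
        ends = np.where(probs_end[:, c] > threshold)[0]
        for b in begins:
            later_ends = ends[ends >= b]
            if len(later_ends) > 0:
                spans.append((int(b), int(later_ends[0]) + 1, c))  # (start, end, class), end exclusive
    return spans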