// min-char-lstm.js
const tf = require('@tensorflow/tfjs-node');
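// Usage: node min-char-lstm.js <textfile>
// Any plain-text file works; e.g. a Project Gutenberg dump whose header and
// footer readData() strips via the '***' markers.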
const data = readData();
const data_size = data.length;
// All unique characters / entities in the data set.
const chars = [...new Set(data)];
const vocab_size = chars.length;
const V = vocab_size;
console.log('data has %d characters, %d unique.', data_size, vocab_size);
// Each character in the vocabulary gets a unique integer index assigned, in
// the half-open interval [0, V). These indices are used to create one-hot
// encoded vectors that represent characters in numerical computations.
const char_to_ix = chars.reduce((accum, char, index) => {
accum[char] = index;
return accum;
}, {});
const ix_to_char = chars;
//console.log('char_to_ix', char_to_ix)
//console.log('ix_to_char', ix_to_char)
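// For example, if the data began "hello", the first few entries would be
// char_to_ix = {h: 0, e: 1, l: 2, o: 3, ...} and ix_to_char[0] === 'h'
// (Set preserves insertion order, so indices follow first occurrence).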
// Hyperparameters.
// Size of hidden state vectors; applies to h and c.
const hidden_size = 100;
const H = hidden_size;
const seq_length = 16; // number of steps to unroll the LSTM for
const learning_rate = 0.1;
// The input x is concatenated with the state h, and the joined vector is fed
// into most blocks within the LSTM cell. The combined height of this column
// vector is HV.
const HV = H + V;
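// For example, with H = 100 and an 80-character vocabulary, HV = 180.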
console.log('HV %d H %d V %d', HV, H, V);
// Stop training once the data pointer p passes this point. (Note p wraps back
// to 0 at the end of the data, so on inputs shorter than MAX_DATA the loop
// below runs until interrupted.)
const MAX_DATA = 1000000;
// Model parameters/weights -- these are shared among all steps. Weights
// initialized randomly; biases initialized to 0.
// Inputs are characters one-hot encoded in a vocab-sized vector.
// Dimensions: H = hidden_size, V = vocab_size, HV = hidden_size + vocab_size
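// In equations (a reference sketch: '*' is elementwise multiplication,
// '[x; h]' is vertical concatenation, and primes mark this step's outputs):
//   f  = sigmoid(Wf [x; h] + bf)      forget gate
//   i  = sigmoid(Wi [x; h] + bi)      input gate
//   cc = tanh(Wcc [x; h] + bcc)       candidate cell value
//   o  = sigmoid(Wo [x; h] + bo)      output gate
//   c' = f * c + i * cc               new cell state
//   h' = o * tanh(c')                 new hidden state
//   y  = Wy h' + by, p = softmax(y)   output distribution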
let Wf = tf.tidy(() => tf.randomUniform([H, HV]).mul(tf.scalar(0.01)));
let bf = tf.zeros([H, 1]);
let Wi = tf.tidy(() => tf.randomUniform([H, HV]).mul(tf.scalar(0.01)));
let bi = tf.zeros([H, 1]);
let Wcc = tf.tidy(() => tf.randomUniform([H, HV]).mul(tf.scalar(0.01)));
let bcc = tf.zeros([H, 1]);
let Wo = tf.tidy(() => tf.randomUniform([H, HV]).mul(tf.scalar(0.01)));
let bo = tf.zeros([H, 1]);
let Wy = tf.tidy(() => tf.randomUniform([V, H]).mul(tf.scalar(0.01)));
let by = tf.zeros([V, 1]);
// Uncomment this to run gradient checking instead of training (see the
// basicGradCheck sketch at the bottom of the file).
//basicGradCheck();
//process.exit();
// n is the iteration counter; p is the input sequence pointer, at the beginning
// of each step it points at the sequence in the input that will be used for
// training this iteration.
let n = 0;
let p = 0;
// Memory variables for Adagrad.
let mWf = tf.zerosLike(Wf);
let mbf = tf.zerosLike(bf);
let mWi = tf.zerosLike(Wi);
let mbi = tf.zerosLike(bi);
let mWcc = tf.zerosLike(Wcc);
let mbcc = tf.zerosLike(bcc);
let mWo = tf.zerosLike(Wo);
let mbo = tf.zerosLike(bo);
let mWy = tf.zerosLike(Wy);
let mby = tf.zerosLike(by);
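// Initial loss estimate: a model that is uniform over the V characters has
// per-character cross-entropy -ln(1/V); summed over seq_length characters
// that gives the value below, which seeds the running average.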
let smooth_loss = -Math.log(1.0 / V) * seq_length;
let hprev, cprev;
while (p < MAX_DATA) {
// Prepare inputs (we're sweeping from left to right in steps seq_length long).
if (n === 0 || p + seq_length + 1 >= data.length) {
// Reset RNN memory, disposing the previous state tensors to avoid a leak.
if (hprev) tf.dispose([hprev, cprev]);
hprev = tf.zeros([H, 1]);
cprev = tf.zeros([H, 1]);
p = 0; // go from start of data
}
// In each step we unroll the RNN for seq_length cells, and present it with
// seq_length inputs and seq_length target outputs to learn.
const inputs = data.substring(p, p + seq_length).split('').map(ch => char_to_ix[ch]);
const targets = data.substring(p + 1, p + seq_length + 1).split('').map(ch => char_to_ix[ch]);
// Sample from the model now and then.
if (n % 1000 === 0) {
const sample_ix = sample(hprev, cprev, inputs[0], 200);
const txt = sample_ix.map(ix => ix_to_char[ix]).join('');
console.log('----\n %s \n----', txt);
}
// Forward seq_length characters through the RNN and fetch gradient.
let loss, dWf, dbf, dWi, dbi, dWcc, dbcc, dWo, dbo, dWy, dby;
[loss, dWf, dbf, dWi, dbi, dWcc, dbcc, dWo, dbo, dWy, dby, hprev, cprev] = lossFun(inputs, targets, hprev, cprev);
smooth_loss = smooth_loss * 0.999 + loss * 0.001;
if (n % 200 === 0) {
console.log('iter %d (p=%d), loss %f', n, p, smooth_loss);
//console.log('tf.memory:', tf.memory());
}
// Perform parameter update with Adagrad.
[Wf, mWf] = paramUpdates(Wf, dWf, mWf);
[bf, mbf] = paramUpdates(bf, dbf, mbf);
[Wi, mWi] = paramUpdates(Wi, dWi, mWi);
[bi, mbi] = paramUpdates(bi, dbi, mbi);
[Wcc, mWcc] = paramUpdates(Wcc, dWcc, mWcc);
[bcc, mbcc] = paramUpdates(bcc, dbcc, mbcc);
[Wo, mWo] = paramUpdates(Wo, dWo, mWo);
[bo, mbo] = paramUpdates(bo, dbo, mbo);
[Wy, mWy] = paramUpdates(Wy, dWy, mWy);
[by, mby] = paramUpdates(by, dby, mby);
p += seq_length;
n += 1;
}
console.log('tf.memory:', tf.memory());
function paramUpdates(param, dparam, mem) {
  return tf.tidy(() => {
    // Adagrad update:
    //   mem += dparam * dparam
    //   param += -learning_rate * dparam / sqrt(mem + 1e-8)
    const mem1 = mem.add(dparam.mul(dparam));
    const param1 = param.add(tf.scalar(-learning_rate).mul(dparam).div(tf.sqrt(mem1.add(tf.scalar(1e-8)))));
    // Dispose the incoming tensors explicitly; tidy() only reclaims tensors
    // created inside its own scope, not the arguments passed in.
    tf.dispose([param, dparam, mem]);
    return [param1, mem1];
  });
}
// Sample a sequence of integers from the model.
// Runs the LSTM in forward mode for n steps; seed_ix is the seed letter for
// the first time step, h and c are the memory state. Returns a sequence of
// letters produced by the model (indices).
function sample(h, c, seed_ix, n) {
  return tf.tidy(() => {
    const ixes = [];
    let p;
    let ix = seed_ix;
    for (let t = 0; t < n; t++) {
      // Run the forward pass only.
      ({p, h, c} = forward(ix, h, c));
      // Sample from the distribution produced by softmax.
      ix = choice(p.flatten().arraySync());
      ixes.push(ix);
    }
    return ixes;
  });
}
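// Example usage (a sketch; assumes the character 'a' occurs in the data):
//   const ixs = sample(tf.zeros([H, 1]), tf.zeros([H, 1]), char_to_ix['a'], 200);
//   console.log(ixs.map(ix => ix_to_char[ix]).join(''));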
function forward(ix, h, c) {
return tf.tidy(() => {
// Prepare a one-hot encoded vector of shape (V, 1); ix is the index
// where the 1 goes.
const x = tf.buffer([V, 1]);
x.set(1, ix, 0);
// x and h are column vectors; concat them into a "taller" column vector:
// first the elements of x, then h.
const xh = tf.concat([x.toTensor(), h]);
// Gates f, i and o.
const fg = tf.sigmoid(tf.dot(Wf, xh).add(bf));
const ig = tf.sigmoid(tf.dot(Wi, xh).add(bi));
const og = tf.sigmoid(tf.dot(Wo, xh).add(bo));
// Candidate cc.
const cc = tf.tanh(tf.dot(Wcc, xh).add(bcc));
// This step's h and c.
c = fg.mul(c).add(ig.mul(cc));
h = tf.tanh(c).mul(og);
// Softmax for output.
const y = tf.dot(Wy, h).add(by);
const p = tf.exp(y).div(tf.sum(tf.exp(y)));
return {p, xh, h, c, fg, ig, og, cc, y};
});
}
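// Shapes, for reference: xh is [HV, 1]; fg, ig, og, cc, h and c are [H, 1];
// y and p are [V, 1], and p sums to 1 (the distribution over the next char).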
function choice(ps) {
  // Draw a random index according to the probability distribution ps.
  const random = Math.random();
  let ix = 0;
  let pAcum = 0;
  for (const p of ps) {
    pAcum = pAcum + p;
    if (random <= pAcum) {
      return ix;
    }
    ix++;
  }
  // Floating-point error can leave the cumulative sum just under 1; fall
  // back to the last index rather than returning undefined.
  return ps.length - 1;
}
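// For example, choice([0.2, 0.5, 0.3]) returns 0 with probability 0.2,
// 1 with probability 0.5, and 2 with probability 0.3.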
// Runs forward and backward passes through the RNN.
// inputs, targets: Lists of integers. For some i, inputs[i] is the input
//   character (encoded as an index into the ix_to_char map) and targets[i]
//   is the corresponding next character in the training data (similarly
//   encoded).
// hprev: Hx1 array of initial hidden state.
// cprev: Hx1 array of initial cell state.
// Returns: loss, gradients on model parameters, and the last hidden and cell
//   states.
function lossFun(inputs, targets, hprev, cprev) {
return tf.tidy(() => {
// Caches that keep values computed in the forward pass at each time step, to
// be reused in the backward pass.
const cache = [];
// Initial incoming states.
let h = hprev;
let c = cprev;
let loss = 0;
// Forward pass.
for (let t = 0; t < inputs.length; t++) {
// inputs[t] is the index where the 1 goes.
let run = forward(inputs[t], h, c);
cache.push(run); // run = {p, xh, h, c, fg, ig, og, cc, y}
h = run.h;
c = run.c;
// Cross-entropy loss.
loss += -Math.log(run.p.arraySync()[targets[t]][0]);
}
// Initialize gradients of all weights/biases to 0.
let dWf = tf.zerosLike(Wf);
let dbf = tf.zerosLike(bf);
let dWi = tf.zerosLike(Wi);
let dbi = tf.zerosLike(bi);
let dWcc = tf.zerosLike(Wcc);
let dbcc = tf.zerosLike(bcc);
let dWo = tf.zerosLike(Wo);
let dbo = tf.zerosLike(bo);
let dWy = tf.zerosLike(Wy);
let dby = tf.zerosLike(by);
// Incoming gradients for h and c; at each step of the backwards loop these
// represent dh[t] and dc[t]. We do truncated BPTT, so they start at 0: no
// gradient flows in across the chunk boundary, even though hprev and cprev
// themselves carry over between chunks.
let dhnext = tf.zerosLike(cache[0].h);
let dcnext = tf.zerosLike(cache[0].c);
// The backwards pass iterates over the input sequence backwards.
for (let t = inputs.length-1; t >= 0; t--) {
// Backprop through the gradients of loss and softmax.
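// For softmax followed by cross-entropy loss, the combined gradient with
// respect to y is p - onehot(target): copy p, then subtract 1 at the
// target index.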
let dy = cache[t].p.clone().bufferSync();
dy.set(dy.get(targets[t], 0) - 1, targets[t], 0);// dy[targets[t]] -= 1
dy = dy.toTensor();
// Compute gradients for the Wy and by parameters.
dWy = dWy.add(tf.dot(dy, cache[t].h.transpose()));
dby = dby.add(dy);
// Backprop through the fully-connected layer (Wy, by) to h. Also add up
// the incoming gradient for h from the next cell.
const dh = tf.dot(Wy.transpose(), dy).add(dhnext);
// Backprop through multiplication with the output gate; here dctanh is
// the gradient at the output of the tanh.
const dctanh = cache[t].og.mul(dh);
// Backprop through the tanh function; since c[t] branches in two
// directions we add dcnext too.
// dc = dctanh * (1 - tanh(c[t])^2) + dcnext
const dc = dctanh.mul(
tf.scalar(1).sub(tf.square(tf.tanh(cache[t].c)))
).add(dcnext);
// Backprop through the multiplication with tanh(c); here dhogs is the
// gradient at the output of the output gate's sigmoid. Then backprop
// through the sigmoid itself (og is its output).
const dhogs = dh.mul(tf.tanh(cache[t].c));
// dho = dhogs * og * (1 - og)
const dho = dhogs.mul(cache[t].og).mul(tf.scalar(1).sub(cache[t].og));
// Compute gradients for the output gate parameters.
dWo = dWo.add( tf.dot(dho, cache[t].xh.transpose()));
dbo = dbo.add(dho);
// Backprop dho to the xh input.
const dxh_from_o = tf.dot(Wo.transpose(), dho);
// Backprop through the forget gate: sigmoid and elementwise mul.
// dhf = c[t-1] * dc * fg * (1 - fg)
const prevC = cache[t-1] ? cache[t-1].c : cprev;
const dhf = prevC.mul(dc).mul(cache[t].fg).mul(tf.scalar(1).sub(cache[t].fg));
dWf = dWf.add(tf.dot(dhf, cache[t].xh.transpose()));
dbf = dbf.add(dhf);
const dxh_from_f = tf.dot(Wf.transpose(), dhf);
// Backprop through the input gate: sigmoid and elementwise mul.
const dhi = cache[t].cc.mul(dc).mul(cache[t].ig).mul(tf.scalar(1).sub(cache[t].ig));
dWi = dWi.add(tf.dot(dhi, cache[t].xh.transpose()));
dbi = dbi.add(dhi);
const dxh_from_i = tf.dot(Wi.transpose(), dhi);
// Backprop through the candidate: tanh and elementwise mul.
// dhcc = ig * dc * (1 - cc^2)
const dhcc = cache[t].ig.mul(dc).mul(tf.scalar(1).sub(cache[t].cc.square()));
dWcc = dWcc.add(tf.dot(dhcc, cache[t].xh.transpose()));
dbcc = dbcc.add(dhcc);
const dxh_from_cc = tf.dot(Wcc.transpose(), dhcc);
// Combine all contributions to dxh, and extract the gradient for the
// h part to propagate backwards as dhnext.
const dxh = dxh_from_o.add(dxh_from_f).add(dxh_from_i).add(dxh_from_cc);
dhnext = dxh.slice(V);
// dcnext from dc and the forget gate.
dcnext = cache[t].fg.mul(dc);
}
// Gradient clipping to the range [-5, 5].
dWf = dWf.clipByValue(-5, 5);
dbf = dbf.clipByValue(-5, 5);
dWi = dWi.clipByValue(-5, 5);
dbi = dbi.clipByValue(-5, 5);
dWcc = dWcc.clipByValue(-5, 5);
dbcc = dbcc.clipByValue(-5, 5);
dWo = dWo.clipByValue(-5, 5);
dbo = dbo.clipByValue(-5, 5);
dWy = dWy.clipByValue(-5, 5);
dby = dby.clipByValue(-5, 5);
let prevH, prevC;
if (cache[inputs.length-1]) {
prevH = cache[inputs.length-1].h;
prevC = cache[inputs.length-1].c;
tf.dispose([hprev, cprev]);
} else {
prevH = hprev;
prevC = cprev;
}
return [loss, dWf, dbf, dWi, dbi, dWcc, dbcc, dWo, dbo, dWy, dby, prevH, prevC];
});
}
function readData() {
  const fs = require('fs');
  const filename = process.argv[2];
  const data_raw = fs.readFileSync(filename, 'utf8');
  // Strip the header and footer, which are delimited by '***' markers (e.g.
  // Project Gutenberg's "*** START OF ..." / "*** END OF ..." lines): skip
  // past the second '***' and stop at the third.
  let head = data_raw.indexOf('***');
  head = data_raw.indexOf('***', head + 3) + 3;
  const tail = data_raw.indexOf('***', head);
  const data = data_raw.substring(head, tail).trim();
  // Print the first and last 30 characters as a sanity check.
  console.log(data.slice(0, 30));
  console.log('***');
  console.log(data.slice(-30));
  return data;
}
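// The basicGradCheck referenced near the top is not defined in this file; the
// sketch below is one plausible implementation (an assumption, not the
// author's code). It compares lossFun's analytic gradient for Wf against a
// centered numerical difference on a few random elements. Caveats: lossFun
// clips gradients to [-5, 5], float32 limits the achievable precision, and
// tensor disposal is ignored since the check runs once and exits.
function basicGradCheck() {
  const inputs = data.substring(0, seq_length).split('').map(ch => char_to_ix[ch]);
  const targets = data.substring(1, seq_length + 1).split('').map(ch => char_to_ix[ch]);
  // lossFun disposes the states passed to it, so give it fresh tensors each call.
  const dWf = lossFun(inputs, targets, tf.zeros([H, 1]), tf.zeros([H, 1]))[1];
  const dWfBuf = dWf.bufferSync();
  const delta = 1e-5;
  for (let k = 0; k < 5; k++) {
    const i = Math.floor(Math.random() * H);
    const j = Math.floor(Math.random() * HV);
    const wBuf = Wf.bufferSync();
    const orig = wBuf.get(i, j);
    // Loss with Wf[i][j] nudged up, then down.
    wBuf.set(orig + delta, i, j);
    Wf = wBuf.toTensor();
    const lossPlus = lossFun(inputs, targets, tf.zeros([H, 1]), tf.zeros([H, 1]))[0];
    wBuf.set(orig - delta, i, j);
    Wf = wBuf.toTensor();
    const lossMinus = lossFun(inputs, targets, tf.zeros([H, 1]), tf.zeros([H, 1]))[0];
    // Restore the original weight.
    wBuf.set(orig, i, j);
    Wf = wBuf.toTensor();
    const numeric = (lossPlus - lossMinus) / (2 * delta);
    console.log('Wf[%d][%d]: analytic %f, numeric %f', i, j, dWfBuf.get(i, j), numeric);
  }
}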