-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmain.js
407 lines (333 loc) · 14.7 KB
/
main.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
/*
* GREYCORTEX Research DomAIn
* Main file for node.js.
*
* Copyright (C) 2019 GreyCortex s.r.o.
* @author p3
*
* // TODO: output may use $DATE$ and $TIMESTAMP$ - see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Date
*
*/
// File System https://nodejs.org/docs/latest-v8.x/api/fs.html
const FS = require("fs");
// Path utilities https://nodejs.org/docs/latest-v8.x/api/path.html
const PATH = require("path");
// https://github.com/adaltas/node-csv
// csv.js https://github.com/avoidwork/csv.js
const CSV = require("./js/csv.js/csv");
// modules
const DOMAIN = require("./js/domain");
const {Domain, Stub} = DOMAIN;
const TRAIN = require("./js/train");
// default values
// `input` and `output` are scratch variables: each command section below
// (csv2json, json2json, train-bigrams) assigns both before using them.
var input = undefined;
var output = undefined;
// fallback training CSV for train-bigrams when neither --train nor --input is given
var train = "data/train.csv";
// within this file the following four paths are only echoed by printHelp()
var test = "data/test.csv";
var dict = DOMAIN.DICT; // DICT = "data/dict.json"
var suffix = DOMAIN.SUFFIX; // SUFFIX = "data/suffix.json";
var model = "data/model.json";
// CSV/JSON thingy...
// Column names used to read fields from decoded CSV rows. The active code in
// this file reads only keyDom (domain column) and valRep (reputation column),
// both in train-bigrams; the others are not referenced by any active code here.
var keySuf = "suffix"; // if (args.key) keySuf = args.key;
var keyDom = "domain"; // if (args.key) keyDom = args.key;
var valRep = "rep";
var valPop = "pop";
var valNeg = "neg";
var valPos = "pos";
/**
 * Writes the usage summary to stdout: example invocations for every command,
 * the default data file paths and the test commands.
 */
function printHelp() {
    // the default data files, listed on a single line of the help text
    const defaultFiles = " " + train + " " + test + " " + dict + " " + suffix + " " + model;

    // one entry per printed line; "" produces an empty separator line
    const helpLines = [
        "Run:",
        " node main.js -h --help",
        " node --max-old-space-size=4096 main.js train-bigrams --train=data/samples-train.csv [--rate=0.01] [--its=10000] [--states=720] [] [] [--debug] [--characters [--fullalpha]]",
        " node main.js train test --train=data/samples-train.csv --test=data/samples-test.csv --model=data/model.json",
        " node main.js csv2json --input=data/suffixes-v0-20190309.csv --key=suffix --output=data/suffix.json --array=popularity,cntNeg,cntPos --pretty --debug",
        " node main.js csv2json --input=data/words-bs.domain.csv --output=data/dict.json --key=word --array=pop,neg,pos --pretty --debug",
        " node main.js json2json --input=data.json --output=data-pretty.json --pretty",
        "",
        "--input represents input and less specific train/test file for the default data:",
        defaultFiles,
        "",
        "Test:",
        " node main.js test",
        " chrome greycortex.com test",
        "",
    ];

    helpLines.forEach(line => console.log(line));
}
/**
 * Shared holder for the parsed command line. getArgs() adds one property per
 * option/flag; `main` collects the commands to run (several may be combined).
 */
const args = {
    main: [],
};
/**
 * Parses the command line (process.argv without the node binary and the
 * script path) into the shared `args` object.
 *
 *  - `--name=value` stores the string value under `name`. Only the FIRST "="
 *    separates name and value, so `--input=dir/a=b.csv` keeps its whole value.
 *  - `--name` (no "=") stores `true` under `name`.
 *  - `-abc` stores `true` under each single-character flag `a`, `b`, `c`.
 *  - anything else is a command; all commands are appended to `args.main`
 *    in the order given.
 *
 * `-h` is expanded to `help`. Note that a long option literally named
 * `--main` overwrites the command list, because options and `main` share
 * the same object.
 *
 * @returns {Object} the shared `args` object that was filled in
 */
function getArgs() {
    process.argv.slice(2).forEach(arg => {
        // long option
        if (arg.slice(0, 2) === "--") {
            // split at the first "=" only; String.split("=") would drop
            // everything after a second "=" inside the value
            const eq = arg.indexOf("=");
            if (eq === -1) {
                args[arg.slice(2)] = true;
            }
            else {
                args[arg.slice(2, eq)] = arg.slice(eq + 1);
            }
        }
        // bundle of single-character flags
        else if (arg[0] === "-") {
            arg.slice(1).split("").forEach(flag => {
                args[flag] = true;
            });
        }
        // command to run (every command is kept)
        else {
            args.main.push(arg);
        }
    });
    // unify (extend) short inputs...
    if (args.h) args.help = true;
    return args;
}
////////////////////////////////////////////////////////////////////////////////
// main() //////////////////////////////////////////////////////////////////////
// (disabled) process.stdout._handle.setBlocking(true); // fix, see https://stackoverflow.com/questions/38085746/why-does-writing-to-stdout-in-a-hot-loop-cause-an-out-of-memory-shutdown#38086131
console.log("GREYCORTEX Research DomAIn");
// fill the shared `args` object from the command line
getArgs();
if (args.debug) {
    console.log(args);
}
console.log();
// usage text: shown when no command was given or help was requested
if (args.main.length === 0 || args.help) printHelp();
// test ////////////////////////////////////////////////////////////////////////
// "test" and "test-main" are handled identically by this section.
if (["test", "test-main"].some(command => args.main.includes(command))) {
    // per the original note, the required modules produce their own test output
    console.log();
    console.log("GREYCORTEX Research DomAIn test ... ");

    // room for additional tests; nothing else is executed here
    console.log("... all tests passed!");

    // sandbox area when running as test-main ///////////////////////////////////
    ////////////////////////////////////////////////////////////////////////////
}
// csv2json ////////////////////////////////////////////////////////////////////
// Converts the CSV file given by --input into JSON.
//   --output=<file>   target file (default: input path with its extension replaced by ".json")
//   --key=<column>    write an object keyed by that column instead of the array of rows
//   --array=<c1,c2>   with --key: store the listed columns as an array value;
//                     without it the row (minus the key column) is stored
//   --pretty          indent the JSON by two spaces
//   --debug           echo the first ten rows and their converted entries
// Any failure is logged with its stack and reported through a non-zero exit code.
if (args.main.includes("csv2json")) {
    try {
        // --input is mandatory; a bare "--input" parses as `true`, which is not a path
        if (typeof args.input !== "string" || args.input === "") {
            throw new Error("csv2json requires --input=<file.csv>");
        }
        input = args.input;
        // strip the actual extension (if any) rather than a fixed four characters
        output = input.slice(0, input.length - PATH.extname(input).length) +".json";
        if (args.output) output = args.output;
        console.log("csv2json "+ input + " >> "+ output +" ...");

        const inputCSV = CSV.decode(FS.readFileSync(input, "utf8"), ",");

        // without --key the decoded rows are written out unchanged
        let outputJSON = inputCSV;

        // transform the key if necessary
        if (args.key) {
            // the column list is the same for every row: split it once
            let columns = null;
            if (args.array) {
                if (typeof args.array !== "string") {
                    throw new Error("--array expects a comma-separated column list, e.g. --array=pop,neg,pos");
                }
                columns = args.array.split(",");
            }

            outputJSON = {};
            // inputCSV is an array(!) of row objects
            for (let i = 0; i < inputCSV.length; i++) {
                const obj = inputCSV[i];
                if (args.debug && i < 10) console.log(JSON.stringify(obj));

                // this is what to store
                const key = obj[args.key];
                let val;
                if (columns) {
                    // transform the value into an array of the requested columns
                    val = columns.map(column => obj[column]);
                }
                else {
                    // keep the whole row, just without the key column
                    delete obj[args.key];
                    val = obj;
                }

                // a later row with the same key replaces the earlier one
                outputJSON[key] = val;
                if (args.debug && i < 10) console.log("'"+ key +"': "+ JSON.stringify(outputJSON[key]));
            }
        }

        // save it (an undefined indent means compact output)
        FS.writeFileSync(output, JSON.stringify(outputJSON, null, args.pretty ? 2 : undefined), "utf8");
        console.log("... done.");
    } catch(e) {
        console.log(e.stack);
        // let calling scripts detect the failure
        process.exitCode = 1;
    }
}
// json2json ///////////////////////////////////////////////////////////////////
// Re-serialises the JSON file given by --input.
//   --output=<file>   target file (default: input path with its extension replaced by "-out.json")
//   --pretty          indent the JSON by two spaces, otherwise write it compact
// Any failure is logged with its stack and reported through a non-zero exit code.
if (args.main.includes("json2json")) {
    try {
        // --input is mandatory; a bare "--input" parses as `true`, which is not a path
        if (typeof args.input !== "string" || args.input === "") {
            throw new Error("json2json requires --input=<file.json>");
        }
        input = args.input;
        // strip the actual extension: cutting a fixed four characters turned
        // "data.json" into "data.-out.json"
        output = input.slice(0, input.length - PATH.extname(input).length) +"-out.json";
        if (args.output) output = args.output;
        console.log("json2json "+ input + " >> "+ output +" ...");

        const json = JSON.parse(FS.readFileSync(input, "utf8"));

        // save it (an undefined indent means compact output)
        FS.writeFileSync(output, JSON.stringify(json, null, args.pretty ? 2 : undefined), "utf8");
        console.log("... done.");
    } catch(e) {
        console.log(e.stack);
        // let calling scripts detect the failure
        process.exitCode = 1;
    }
}
// train-bigrams ///////////////////////////////////////////////////////////////
// Reads a CSV of samples, turns every row into a count vector, trains a model
// with DOMAIN.train() and writes two files:
//   <output>                         the trained model as JSON
//   <output minus extension>-eval.csv the result of DOMAIN.eval() on the training set
// Options:
//   --train=<file> | --input=<file>  training CSV (default: the `train` path)
//   --output=<file>                  model file (default: input with its extension replaced by ".model.json")
//   --key=<column>                   column holding the domain name (default: keyDom)
//   --characters [--fullalpha]       count single characters instead of bigrams
//   --pretty, --debug
// Any failure is logged with its stack and reported through a non-zero exit code.
if (args.main.includes("train-bigrams")) {
    let start = Date.now();

    try {
        // input: --train wins over --input, then the built-in default
        if (args.train) input = args.train;
        else if (args.input) input = args.input;
        else input = train;
        // a bare "--train"/"--input" parses as `true`, which is not a path
        if (typeof input !== "string") {
            throw new Error("train-bigrams requires --train=<file.csv> (or --input=<file.csv>)");
        }

        // output
        output = input.slice(0, input.length - PATH.extname(input).length) +".model.json";
        if (args.output) output = args.output;
        console.log("train-bigrams processing input "+ input + " ...");

        // domain,rep,pop,level,branches,leaves,cnt,pos,neg,psl,features
        const inputCSV = CSV.decode(FS.readFileSync(input, "utf8"), ",");

        // column with the domain name; --key overrides the default
        // (previously --key was assigned to an undeclared global and ignored)
        const domainKey = (typeof args.key === "string" && args.key !== "") ? args.key : keyDom;
        const bigrams = DOMAIN.genBigrams();

        // alphabet used with --characters --fullalpha (constant, so built once)
        const fullAlphabet = [".","-","_","0","1","2","3","4","5","6","7","8","9","a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","?"];

        // trainingSet = [ {id: "example.com", input: [0,0],output: [0]}, ... ]
        const trainSet = [];

        // inputCSV is an array(!) of row objects
        for (let i = 0; i < inputCSV.length; i++) {
            const obj = inputCSV[i];
            if (args.debug && i < 1) console.log(JSON.stringify(obj));

            // get the domain name and the raw reputation
            const dom = obj[domainKey];
            const rawRep = obj[valRep];
            // (x+1)/2 -- presumably maps a reputation in [-1, 1] onto [0, 1]; confirm against the data
            const rep = (Number(rawRep) + 1) / 2;
            // test the RAW value: after the arithmetic a missing reputation is NaN, never null
            if (dom == null || rawRep == null || rawRep === "" || Number.isNaN(rep)) {
                console.log("null at sample "+ (trainSet.length + 1) +": "+ dom +" "+ rawRep);
                continue;
            }

            // transform: reverse the label order ("a.b.c" -> "c.b.a");
            // String() guards against a CSV decoder that returns non-string cells
            const reversed = String(dom).split(".").reverse().join(".");
            let domain = DOMAIN.replaceChars(reversed);

            // feature vector
            let fvect = null;

            if (args.characters) {
                let alphabet = DOMAIN.alphabet;
                if (args.fullalpha) {
                    domain = DOMAIN.replaceBS(reversed);
                    alphabet = fullAlphabet;
                }

                // one counter per alphabet entry; typed arrays start zero-filled
                fvect = new Float64Array(alphabet.length);

                // count every (transformed) character of the domain
                for (let n = 0; n < domain.length; n++) {
                    const ch = domain.substring(n, n + 1);
                    const p = alphabet.indexOf(ch);
                    if (p >= 0) fvect[p] += 1;
                    else console.log("O-OU: "+ domain +"["+ n +"] = '"+ ch +"' > alphabet["+ p +"]");
                }
            }
            else {
                // one counter per known bigram
                fvect = new Array(bigrams.length).fill(0);

                // count every overlapping two-character window of the domain
                for (let n = 0; n < domain.length-1; n++) {
                    const bg = domain.substring(n, n + 2);
                    const p = bigrams.indexOf(bg);
                    if (p >= 0) fvect[p] += 1;
                    else console.log("O-OU: "+ domain +"["+ n +"] = '"+ bg +"' > bigrams["+ p +"]");
                }
            }

            const sample = {"id": dom, "input": fvect, "output": [rep]};
            trainSet.push(sample);
            if (args.debug && i < 1) console.log(JSON.stringify(sample));

            // progress: on the last row and after every 1000 accepted samples
            if (i === inputCSV.length-1 || trainSet.length % 1000 === 0) {
                if (args.debug) console.log("samples: "+ trainSet.length +"\t "+ (JSON.stringify(trainSet).length/(1024*1024)).toFixed(3) +"M");
                else console.log("samples: "+ trainSet.length);
            }
        }

        console.log("... loaded in "+ ((Date.now() - start)/1000).toFixed(3) +" train-bigrams training ...");
        start = Date.now();

        const network = DOMAIN.train(trainSet);

        // save the model (an undefined indent means compact output)
        FS.writeFileSync(output, JSON.stringify(network, null, args.pretty ? 2 : undefined), "utf8");

        const evalSet = DOMAIN.eval(trainSet, network);

        // the eval file sits next to the model; the name is derived from the
        // model path itself (it used to be cut with the length of the INPUT path)
        const evalOutput = output.slice(0, output.length - PATH.extname(output).length) +"-eval.csv";
        FS.writeFileSync(evalOutput, CSV.encode(evalSet), "utf8");

        console.log("... done in "+ ((Date.now() - start)/1000).toFixed(3) +"s. Model: "+ output);
    } catch(e) {
        console.log(e.stack);
        // let calling scripts detect the failure
        process.exitCode = 1;
    }
}
//// test ------------------------------------------------------------------------
//if (args.main.includes("test")) {
// if (args.test) input = args.test;
// else if (args.input) input = args.input;
// else input = test;
//
// output = input.slice(0, input.length-4) +".out";
// if (args.output) output = args.output;
// console.log("testing "+ input + " >> "+ output +" ...");
//
// try {
// // const inputData = FS.readFileSync(input, "utf8");
// const inputCSV = CSV.decode(FS.readFileSync(input, "utf8"), ",");
// // console.log("typeof inputCSV: "+ typeof inputCSV);
// // console.log(JSON.stringify(inputCSV));
//
// // output ARRAY
// const outputCSV = [];
//
// // domain,rep,pop,level,branches,leaves,cnt,pos,neg,psl,features
// var keyDom = "domain";
// if (args.key) key = args.key;
//
// // TODO: process args.array.split(",")
// var valPop = "pop";
// var valNeg = "neg";
// var valPos = "pos";
// var valRep = "rep";
//
// var absErr = 0;
//
// // for (var obj in inputCSV) ... inputCSV is an array(!)
// for(var i = 0; i < inputCSV.length; i++) {
// const obj = inputCSV[i];
// if (args.debug && i < 10) console.log(JSON.stringify(obj));
//
// const dom = obj[keyDom];
// const pop = obj[valPop];
// const neg = obj[valNeg];
// const pos = obj[valPos];
// const rep = obj[valRep];
//
// // process
//
//
// // should return something like this:
// const res = {};
// res[keyDom] = dom;
// res[valRep] = rep;
// res["diff"] = (rep - rep); // less then 1, do not square
// outputCSV.push(res);
//
// if (args.debug && i < 10) console.log(res); // JSON.stringify
// }
//
// // print it
// FS.writeFileSync(output +".json", JSON.stringify(outputCSV, null, 2), "utf8");
// FS.writeFileSync(output +".csv", CSV.encode(outputCSV), "utf8");
//
// console.log();
// console.log("... done. Mean Square Error: "+ absErr/i +" of "+ i +" samples.");
// } catch(e) {
// console.log(e.stack);
// }
//}