-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathdept_parser.js
117 lines (94 loc) · 2.81 KB
/
dept_parser.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/usr/bin/env node
/*
* This is a quick-and-dirty script that parses HTML files and generates JSON.
* I know it's dirty. Shame on me!
*
* Usage:
*
* node dept_parser.js -d mirror/prvCode_07/cityCode_000-areaCode_06-deptCode_023/
*/
var FS = require('fs'),
Path = require('path'),
Cheerio = require('cheerio'),
GetOpt = require('node-getopt');
/**
 * Read the column headings from the table's single `tr.title` row.
 *
 * Stores the text of every heading cell in `obj.title` (replacing any
 * previous value) and reports how many cells that row contains.
 *
 * @param {Object} obj - Accumulator; receives the `title` array.
 * @param {Function} $ - Cheerio-style selector function for the loaded page.
 * @param {string} code - Identifier of the page, used in the error message.
 * @returns {number} Number of cells in the title row.
 * @throws {Error} If the page does not contain exactly one title row.
 */
function parseTitle(obj, $, code) {
    var tr = $('table tr.title');
    if (tr.length !== 1) {
        // Throw a real Error (not a bare string) so the failure carries a
        // stack trace; the count covers both the "none" and "several" cases.
        throw new Error(
            "expected exactly one table title row, found " + tr.length + ": " + code
        );
    }

    var tds = $('td', tr);
    var titles = [];
    for (var idx = 0; idx < tds.length; idx++) {
        titles.push($(tds[idx]).text());
    }
    obj.title = titles;
    return tds.length;
}
/**
 * Convert one data row into an array of cell texts and append it to `array`.
 *
 * The appended array ends with `code`. When the row has fewer cells than
 * `fieldsLength`, the carried-over `firstField` is put in front of the cell
 * texts (presumably because the first column is spanned by an earlier row --
 * confirm against the source HTML).
 *
 * @param {Array} array - Destination list; one array is pushed per call.
 * @param {string} firstField - Label carried over from the previous row.
 * @param {number} fieldsLength - Cell count of a complete row.
 * @param {Function} $ - Cheerio-style selector function.
 * @param {Object} trDom - The row element to read cells from.
 * @param {string} code - Identifier appended as the last entry of the row.
 * @returns {string} The label to carry into the next row.
 */
function parseTrData(array, firstField, fieldsLength, $, trDom, code) {
    var cells = $('td', trDom);
    var cellCount = cells.length;
    var label = firstField;
    var row = [];

    if (cellCount < fieldsLength) {
        // Fewer cells than columns: lead with the label carried over.
        row.push(label);
    } else if (cellCount == fieldsLength) {
        // Complete row: its first cell becomes the label for later rows.
        label = $(cells[0]).text();
    }

    Array.prototype.forEach.call(cells, function (cell) {
        row.push($(cell).text());
    });
    row.push(code);

    array.push(row);
    return label;
}
/* find out each tr.data and parse its content.*/
/**
 * Feed every element matched by `table .data` through parseTrData, in order.
 *
 * The label returned for one row is handed to the next row; the first row
 * starts with an empty string. Parsed rows accumulate in `obj.data`.
 *
 * @param {Object} obj - Accumulator whose `data` array receives the rows.
 * @param {Function} $ - Cheerio-style selector function.
 * @param {number} fieldsLength - Cell count of a complete row.
 * @param {string} code - Identifier passed through to each parsed row.
 */
function parseData(obj, $, fieldsLength, code) {
    var rows = $('table .data');
    Array.prototype.reduce.call(rows, function (carriedLabel, row) {
        return parseTrData(obj.data, carriedLabel, fieldsLength, $, row, code);
    }, "");
}
/**
 * Extract the heading, date, column titles and data rows of one page.
 *
 * `obj.head` and `obj.date` are taken from the `.head` and `.date` elements
 * inside `.titlebox .title`; the table itself is handled by parseTitle and
 * parseData.
 *
 * @param {Object} obj - Accumulator that receives `head`, `date`, `title`
 *     and the rows appended to `data`.
 * @param {Function} $ - Cheerio-style selector function for the loaded page.
 * @param {string} code - Identifier of the page, stored with every row.
 */
function parseContent(obj, $, code) {
    // Keep these as declared locals: assigning to undeclared names would
    // create globals and throws a ReferenceError in strict mode.
    var titleBox = $('.titlebox .title');
    obj.head = $('.head', titleBox).text();
    obj.date = $('.date', titleBox).text();

    // Data rows are interpreted relative to the number of title columns.
    var fieldCount = parseTitle(obj, $, code);
    parseData(obj, $, fieldCount, code);
}
/**
 * Parse the HTML text of one file and merge the result into `obj`.
 *
 * The file name matters: its base name without the `.html` extension is
 * passed on as the code that gets stored with each data row.
 *
 * @param {Object} obj - Accumulator shared across all parsed files.
 * @param {string} filename - Path of the file the content came from.
 * @param {string} content - HTML text of the file.
 */
function parseFile(obj, filename, content) {
    var loadOptions = { decodeEntities: false };
    var page = Cheerio.load(content, loadOptions);
    parseContent(obj, page, Path.basename(filename, '.html'));
}
// -- main --
/*
 * Entry point: parse every .html file found directly inside the directory
 * given with -d and print the merged result as indented JSON on stdout.
 */
(function () {
    var getopt = new GetOpt([
        ['d' , '=ARG+' , 'Input dir'],
        ['h' , 'help'  , 'display this help']
    ]);
    var args = getopt.bindHelp().parseSystem();
    var opts = args.options;
    if (!opts.d) {
        getopt.showHelp();
        // A value returned from this wrapper is discarded, so signal the
        // usage error through the process exit code instead.
        process.exitCode = 1;
        return;
    }

    // store parsed data
    var obj = { data: [] };

    // opts.d is indexed as a list; only its first entry is processed.
    var dir = opts.d[0];
    FS.readdirSync(dir).forEach(function (name) {
        // Test the entry name (not the whole path) and anchor the pattern,
        // so a directory called "x.html" or a file like "a.html.bak" does
        // not cause non-HTML files to be parsed.
        if (!/\.html$/.test(name)) {
            return;
        }
        var fname = Path.join(dir, name);
        parseFile(obj, fname, FS.readFileSync(fname, "utf-8"));
    });

    console.log(JSON.stringify(obj, null, 4));
})();