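"""BiLSTM-CRF model for Chinese word segmentation on CTB6, trained on SBME tags."""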
import os
import glob
import itertools
import collections
import numpy as np
import tensorflow as tf
# physical_devices = tf.config.experimental.list_physical_devices("GPU")
# assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
# tf.config.experimental.set_memory_growth(physical_devices[0], True)
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Lambda, Embedding, LayerNormalization, Dense
from tensorflow.keras.preprocessing import sequence

from dataset import CharTokenizer
from dataset import load_ctb6_cws, build_sbme_tags
from layers import MaskBiLSTM
from crf import CRF, ModelWithCRFLoss


def load_dataset(file):
    """Load segmented sentences and build per-character SBME tag sequences."""
    sentences = load_ctb6_cws(file=file)
    X = ["".join(sentence) for sentence in sentences]
    y = build_sbme_tags(sentences, onehot=False)
    return X, y


def preprocess_dataset(X, y, maxlen, tokenizer):
    """Map characters to ids and pad/truncate inputs and tags to maxlen."""
    X = tokenizer.transform(X)
    X = sequence.pad_sequences(
        X,
        maxlen=maxlen,
        dtype="int32",
        padding="post",
        truncating="post",
        value=0
    )
    y = sequence.pad_sequences(
        y,
        maxlen=maxlen,
        dtype="float32",
        padding="post",
        truncating="post",
        value=0
    )
    return X, y


X_train, y_train = load_dataset("train.txt")
tokenizer = CharTokenizer(mintf=5)  # mintf: presumably the minimum character frequency kept in the vocabulary
tokenizer.fit(X_train)

maxlen = 128
hdims = 128
num_classes = 4  # one class per S/B/M/E tag
vocab_size = tokenizer.vocab_size

X_train, y_train = preprocess_dataset(X_train, y_train, maxlen, tokenizer)
X_val, y_val = load_dataset("dev.txt")
X_val, y_val = preprocess_dataset(X_val, y_val, maxlen, tokenizer)
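
# Expected result of preprocessing (a sketch, following pad_sequences semantics):
# X_* are int32 arrays of shape (num_samples, maxlen) and y_* are float32 arrays
# of the same shape, zero-padded past each sentence's end.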

inputs = Input(shape=(maxlen,))
mask = Lambda(lambda x: tf.not_equal(x, 0))(inputs)  # global padding mask
x = Embedding(input_dim=vocab_size, output_dim=hdims)(inputs)
x = LayerNormalization()(x)
x = MaskBiLSTM(hdims)(x, mask=mask)
x = Dense(hdims)(x)
x = Dense(num_classes)(x)
# The CRF needs the mask to handle variable-length sequences; here it is passed in manually.
# Setting mask_zero=True on the Embedding layer would avoid passing it by hand.
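# A minimal alternative sketch of that option (assuming MaskBiLSTM and CRF
# consume the implicit mask that Keras propagates from the Embedding layer):
#   x = Embedding(input_dim=vocab_size, output_dim=hdims, mask_zero=True)(inputs)
#   ...
#   outputs = crf(x)  # no explicit mask argument needed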
crf = CRF(trans_initializer="orthogonal")
outputs = crf(x, mask=mask)
base = Model(inputs=inputs, outputs=outputs)
model = ModelWithCRFLoss(base)
model.summary()
# No loss argument here: the ModelWithCRFLoss wrapper is presumed to supply the
# CRF negative log-likelihood internally.
model.compile(optimizer="adam")

batch_size = 32
epochs = 5
file = "weights/weights.bilstm.crf"
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val)
)
X_test, y_test = load_dataset("test.txt")
X_test, y_test = preprocess_dataset(X_test, y_test, maxlen, tokenizer)
model.evaluate(X_test, y_test)
model.save_weights(file)
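# To restore the trained weights later (assuming the same architecture is
# rebuilt first), the standard Keras call applies:
#   model.load_weights(file)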


if __name__ == "__main__":
    import dataset
    import evaluation
    from model_utils import CRFBasedTokenizer

    # Inspect the learned CRF transition matrix.
    trans = tf.convert_to_tensor(crf.trans)
    trans = np.array(trans, dtype=np.float32)
    print(trans)

    # Wrap the trained model and the char tokenizer into a word segmenter
    # (this rebinds the name `tokenizer`, which is not used afterwards).
    tokenizer = CRFBasedTokenizer(model, tokenizer, maxlen)
    for text in dataset.load_sentences():
        print(tokenizer.cut(text))