basic_segment_trie.py
from base import TokenizerBase

# Unique marker object: used both as the root value and as the stored
# value for keys added without an explicit value.
_sentinel = object()


class Node:

    def __init__(self, value):
        self._children = {}
        self._value = value

    def _add_child(self, char, value, overwrite=False):
        child = self._children.get(char)
        if child is None:
            child = Node(value)
            self._children[char] = child
        elif overwrite:
            child._value = value
        return child

    def find(self, key):
        # Walk down the trie one character at a time; returns None if
        # the path does not exist.
        state = self
        for char in key:
            state = state._children.get(char)
            if state is None:
                break
        return state


class Trie(Node):
    """A simple Trie implementation."""

    def __init__(self):
        super(Trie, self).__init__(_sentinel)

    def __contains__(self, key):
        return self[key] is not None

    def __getitem__(self, key):
        state = self
        for char in key:
            state = state._children.get(char)
            if state is None:
                return None
        return state._value

    def __setitem__(self, key, value):
        state = self
        for i, char in enumerate(key):
            if i < len(key) - 1:
                # Intermediate characters: create the path but keep any
                # existing value.
                state = state._add_child(char, None, False)
            else:
                # Last character: store (or overwrite) the value.
                state = state._add_child(char, value, True)

    def __delitem__(self, key):
        # Lazy deletion: clear the value but keep the nodes.
        self[key] = None

    def add(self, key):
        self[key] = _sentinel

    def pop(self, key):
        del self[key]

    def update(self, keys):
        for key in keys:
            self[key] = _sentinel

    def find_prefix(self, key):
        pass
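

# find_prefix() above is left as a stub. Below is a minimal sketch of what
# such a lookup could do, written as a standalone helper so the class itself
# is untouched: walk the trie once and collect every stored word that is a
# prefix of `key`. The function name and return format are assumptions, not
# part of the original API.
def find_prefixes_sketch(trie, key):
    matches = []
    state = trie
    for i, char in enumerate(key):
        state = state._children.get(char)
        if state is None:
            break
        if state._value is not None:
            # key[:i+1] is a complete dictionary entry.
            matches.append((key[:i + 1], state._value))
    return matches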


def test_trie():
    import random
    import dataset

    trie = Trie()
    words = dataset.load_words()
    words = random.sample(words, k=100)
    trie.update(words)
    trie.update(["广东省", "长假", "成绩单"])
    assert "广东省" in trie
    trie.pop("成绩单")
    assert "成绩单" not in trie
    assert trie["长假"] is _sentinel


class TrieTokenizer(TokenizerBase):
    """Builds the word graph on a Trie. When the vocabulary is large, this
    pure-Python implementation is relatively slow and memory-hungry."""

    def __init__(self, words, fully=False):
        self.trie = Trie()
        self.trie.update(words)
        self.fully = fully

    def find_word(self, sentence):
        if self.fully:
            words = self.fully_segment(sentence)
        else:
            words = self.forward_segment(sentence)
        for word in words:
            yield word[0]

    def fully_segment(self, text):
        # Enumerate every dictionary word occurring anywhere in the text.
        words = []
        size = len(text)
        for i in range(size):
            state = self.trie
            for j in range(i, size):
                state = state.find(text[j])
                if state:
                    if state._value is not None:
                        words.append((text[i:j+1], state._value, i, j+1))
                else:
                    break
        return words

    def forward_segment(self, text):
        # Forward maximum matching; see basic_forward_segment.py for the
        # underlying idea.
        i = 0
        size = len(text)
        words = []
        while i < size:
            state = self.trie.find(text[i])
            if state:
                j = i + 1
                k = j
                value = state._value
                for j in range(i + 1, size):
                    state = state.find(text[j])
                    if not state:
                        break
                    if state._value is not None:
                        # Remember the longest dictionary match so far.
                        value = state._value
                        k = j + 1
                if value is not None:
                    words.append((text[i:k], value, i, k))
                    i = k - 1
            i += 1
        return words

    def backward_segment(self, text):
        pass
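

# backward_segment() above is also a stub. A minimal sketch of backward
# maximum matching, mirroring forward_segment but scanning from the end of
# the text; written as a standalone helper under the assumption that the
# same (word, value, start, end) tuple format is wanted. Not part of the
# original implementation.
def backward_segment_sketch(trie, text):
    words = []
    i = len(text)
    while i > 0:
        # Try the longest candidate ending at position i first.
        for j in range(0, i):
            value = trie[text[j:i]]
            if value is not None:
                words.append((text[j:i], value, j, i))
                i = j
                break
        else:
            # No dictionary word ends here; step back one character.
            i -= 1
    words.reverse()
    return words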


if __name__ == "__main__":
    import dataset
    import evaluation

    words, total = dataset.load_freq_words()
    tokenizer = TrieTokenizer(words, fully=False)
    for text in dataset.load_sentences():
        print(tokenizer.cut(text))

    # Check that segmentation reproduces the original text.
    text = dataset.load_human_history()
    words = tokenizer.cut(text)
    assert "".join(words) == text
    evaluation.evaluate_speed(tokenizer.cut, text, rounds=5)
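

# A quick self-contained usage example with a tiny hard-coded vocabulary,
# assuming the word list below stands in for dataset.load_freq_words(); it
# only exercises the segmenters defined in this file, not TokenizerBase.cut.
# Kept as a plain function so importing or running the module does not
# execute it.
def demo_tiny_vocab():
    tokenizer = TrieTokenizer(["南京", "南京市", "市长", "长江", "长江大桥", "大桥"])
    text = "南京市长江大桥"
    # Every dictionary word found anywhere in the text.
    print([w for w, *_ in tokenizer.fully_segment(text)])
    # Greedy forward maximum matching.
    print([w for w, *_ in tokenizer.forward_segment(text)])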