-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathbasic_segment_shortest_path.py
91 lines (83 loc) · 2.27 KB
/
basic_segment_shortest_path.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import collections
import numpy as np
from base import TokenizerBase
# 最短路分词、N-最短路分词
def _create_DAG(sentence, words):
# 构建等权重DAG词图,使用邻接表的方式存储图
DAG = collections.defaultdict(list)
size = len(sentence)
for i in range(size):
j = i
word = sentence[i]
while j < size and word in words:
if words[word]:
DAG[i].append(j)
j += 1
word = sentence[i:j+1]
if not DAG[i]:
DAG[i].append(i)
return DAG
def create_DAG(sentence, words):
# 构建等权重DAG词图,使用邻接表的方式存储图
DAG = collections.defaultdict(list)
size = len(sentence)
i = 0
while i < size:
for j in range(i, size):
word = sentence[i:j+1]
if word in words and words[word]:
DAG[i].append(j)
if not DAG[i]:
DAG[i].append(i)
i += 1
return DAG
def __create_DAG(D, start=None, end=None):
size = len(sentence)
dp = {}
dp[size] = (0, 0)
for i in reversed(range(size)):
# 最大概率的词
dp[i] = max(
(self.word_logproba(sentence[i:j+1]) + dp[j+1][0], j) \
for j in DAG[i]
)
# i = 0
while i < size:
j = dp[i][1] + 1
# 逐词返回
word = sentence[i:j]
yield word
i = j
if __name__ == "__main__":
import dataset
import pprint
words, total = dataset.load_freq_words()
text = "独立自主和平等互利的原则"
DAG = create_DAG(text, words)
pprint.pprint(DAG)
# defaultdict(<class 'list'>,
# {0: [0, 1, 3],
# 1: [1],
# 2: [2, 3],
# 3: [3],
# 4: [4, 5],
# 5: [5, 6, 8],
# 6: [6],
# 7: [7, 8],
# 8: [8],
# 9: [9],
# 10: [10, 11],
# 11: [11]})
# defaultdict(<class 'list'>,
# {0: [0, 1, 3],
# 1: [1],
# 2: [2, 3],
# 3: [3],
# 4: [4, 5],
# 5: [5, 6, 8],
# 6: [6],
# 7: [7, 8],
# 8: [8],
# 9: [9],
# 10: [10, 11],
# 11: [11]})