-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtrain_tokenizer_jieba_statistics.py
64 lines (56 loc) · 1.61 KB
/
train_tokenizer_jieba_statistics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import pickle
import numpy as np
from itertools import chain
import jieba.posseg as pseg
from tqdm import tqdm
def isname(single_word_string):
    """Report whether jieba tags any segment of the input with the flag "nr".

    The string is segmented with ``jieba.posseg.lcut``; each resulting item
    unpacks into a ``(word, flag)`` pair. "nr" is the part-of-speech flag
    jieba uses for person names.

    Args:
        single_word_string: Text to segment and inspect.

    Returns:
        True as soon as one segment carries the flag "nr"; False when no
        segment does (including when segmentation yields nothing).
    """
    return any(flag == "nr" for _word, flag in pseg.lcut(single_word_string))
# Main table: keys are strings, values are numbers (presumably occurrence
# counts -- confirm against whatever writes this pickle).
# NOTE(review): pickle.load executes arbitrary code from the file; only run
# this against pickles you produced yourself.
with open('tmp_wSet.final.pkl', 'rb') as file:
    wSet = pickle.load(file)

# Separate table whose keys are excluded from the length buckets below and
# written to the output exactly once each. It is loaded from disk here; the
# commented-out code further down in this script suggests it was originally
# built in-process with isname() and then dumped to this file.
with open('tmp_wSet_n.pkl', 'rb') as file:
    wSet_n = pickle.load(file)

# One bucket per key length: 1, 2, 3, 4, and "everything else".
wSet_1, wSet_2, wSet_3, wSet_4, wSet_5 = {}, {}, {}, {}, {}
# Key length -> (threshold, destination bucket). An entry is kept only when
# its value is strictly greater than the threshold for its length.
_length_rules = {
    1: (1000, wSet_1),
    2: (2000, wSet_2),
    3: (3000, wSet_3),
    4: (2000, wSet_4),
}
# Every length not listed above (5 or more, and also 0) falls back to this.
_fallback_rule = (100, wSet_5)

for key, value in tqdm(wSet.items(), "Converting"):
    # A disabled variant of this loop filled wSet_n here instead of loading
    # it from disk: keys longer than one character that isname() accepted
    # were stored in wSet_n when their value exceeded 1000.
    if key in wSet_n:
        # Already handled through wSet_n; keep it out of the length buckets.
        continue
    threshold, bucket = _length_rules.get(len(key), _fallback_rule)
    if value > threshold:
        bucket[key] = value
# tmp = np.array(list(wSet_1.values()))
# np.percentile(tmp,25)
# np.mean(tmp) - np.std(tmp, ddof=1)/np.sqrt(len(tmp))
# with open('tmp_wSet_n.pkl', 'wb') as file:
# pickle.dump(wSet_n, file)
# Emit every selected key followed by the separator '§'.
with open('tmp_jieba.final.txt', 'w', encoding='utf-8') as file:
    # Bucketed keys are repeated: the repeat count grows with the key's value
    # raised to the power 0.33 (roughly a cube root), plus one so that every
    # key is written at least once.
    for x in chain(wSet_1, wSet_2, wSet_3, wSet_4, wSet_5):
        repeats = int(wSet[x]**(0.33)) + 1
        file.write((x + '§') * repeats)
    # Keys from wSet_n are written exactly once each, regardless of value.
    file.writelines(x + '§' for x in wSet_n)