-
Notifications
You must be signed in to change notification settings - Fork 2
/
get_wsd.py
66 lines (51 loc) · 1.66 KB
/
get_wsd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import sys
# The "dwsd-beta" directory must be on sys.path before `dotted_wsd` is
# imported below; the membership check avoids appending it twice.
if "dwsd-beta" not in sys.path:
    sys.path.append("dwsd-beta")
from dotted_wsd import DottedWsdTagger
# Module-level sense tagger shared by get_dotted_wsd(); built once at import time.
tagger = DottedWsdTagger()
import torch
import ckip_transformers
from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger
import re
import pandas as pd
# Device index handed to both CKIP drivers: 0 when CUDA is available, else -1
# (presumably -1 selects the CPU per the ckip_transformers convention -- confirm).
device = 0 if torch.cuda.is_available() else -1
# Word segmenter and part-of-speech tagger used by get_wsd(); built once at import time.
ws_driver = CkipWordSegmenter(device=device)
pos_driver = CkipPosTagger(device=device)
def get_dotted_wsd(text):
    """Sense-tag sentences and collect the result in a DataFrame.

    Each sentence in ``text`` is passed to ``tagger.sense_tag_per_sentence``.
    Every item it returns is read positionally: ``item[0]`` is stored as the
    lemma, ``item[1]`` as the part of speech and ``item[2]`` is treated as a
    sense string.  From a non-empty sense string the first ``[...]`` group is
    taken as the sense id, the first ``(...)`` group as the confidence, and
    the gloss is the string with bracketed/parenthesised tokens and all
    whitespace removed.

    Args:
        text: Iterable of sentences in the form expected by
            ``tagger.sense_tag_per_sentence`` (``get_wsd`` passes lists of
            ``(word, pos)`` tuples).

    Returns:
        pandas.DataFrame with the columns ``Lemma``, ``Part-of-Speech``,
        ``Sense_id``, ``Gloss`` and ``Confidence`` (one row per tagged item).
        Sense id, gloss and confidence are ``''`` when the item carries no
        sense string; sense id or confidence is ``''`` when the sense string
        lacks the corresponding bracketed part.
    """
    # Raw strings avoid invalid-escape warnings; compiled once, outside the loops.
    sense_id_re = re.compile(r'\[(.*?)\]')
    confidence_re = re.compile(r'\((.*?)\)')
    markup_re = re.compile(r'(\[|\()\S+(\]|\))|\s')

    lemma, pos, senseID, confidence, gloss = [], [], [], [], []
    for sentence in text:
        for item in tagger.sense_tag_per_sentence(sentence):
            lemma.append(item[0])
            pos.append(item[1])

            sense = item[2]
            if not sense:
                # No sense assigned to this token: keep the row, leave fields blank.
                senseID.append('')
                confidence.append('')
                gloss.append('')
                continue

            # A sense string without "[id]" or "(confidence)" yields a blank
            # field instead of failing on a None match.
            id_match = sense_id_re.search(sense)
            conf_match = confidence_re.search(sense)
            senseID.append(id_match.group(1) if id_match else '')
            confidence.append(conf_match.group(1) if conf_match else '')
            gloss.append(markup_re.sub('', sense))

    return pd.DataFrame({
        'Lemma': lemma,
        'Part-of-Speech': pos,
        'Sense_id': senseID,
        'Gloss': gloss,
        'Confidence': confidence,
    })
def get_wsd(data, save=False, output=None):
    """Segment, POS-tag and sense-tag raw texts, optionally saving to CSV.

    Every text in ``data`` has each run of non-word characters replaced by a
    space and is then split on whitespace; the resulting non-empty segments
    are fed to ``ws_driver`` and ``pos_driver``, paired into ``(word, pos)``
    tuples per segment, and handed to ``get_dotted_wsd``.

    Args:
        data: Iterable of strings to analyse.
        save: When true, write the resulting table to ``output`` as CSV
            (without the index column).
        output: Destination passed to ``DataFrame.to_csv``; required when
            ``save`` is true.

    Returns:
        The DataFrame produced by ``get_dotted_wsd``.

    Raises:
        ValueError: If ``save`` is true but ``output`` is ``None``; otherwise
            ``to_csv(None)`` would only return a string and nothing would be
            written.
    """
    # Fail fast, before the models are run.
    if save and output is None:
        raise ValueError("`output` must be provided when `save` is True")

    # str.split() with no argument drops the empty segments that
    # split(' ') produced for leading/trailing punctuation or whitespace.
    segments = [
        piece
        for doc in data
        for piece in re.sub(r'\W+', ' ', doc).split()
    ]

    if segments:
        ws = ws_driver(segments, show_progress=False)
        pos = pos_driver(ws, show_progress=False)
        sentences = [list(zip(words, tags)) for words, tags in zip(ws, pos)]
    else:
        # Nothing to analyse: skip the models and build an empty table.
        sentences = []

    tagged = get_dotted_wsd(sentences)
    if save:
        tagged.to_csv(output, index=False)
    return tagged