forked from MachineLP/TextMatch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathedit_sim.py
76 lines (66 loc) · 2.64 KB
/
edit_sim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# -*- coding:utf-8 -*-
'''
-------------------------------------------------
Description : edit sim
Author : machinelp
Date : 2020-06-03
-------------------------------------------------
'''
import os
import jieba
import pickle
import distance
import numpy as np
from textmatch.utils.logging import logging
from textmatch.config.config import Config as conf
from textmatch.config.constant import Constant as const
from textmatch.models.model_base.model_base import ModelBase
from textmatch.models.text_embedding.stop_words import StopWords
class EditDistance(ModelBase):
    '''Score candidate texts against a query by normalized edit distance.

    Usage sketch: ``EditDistance().init(candidates).predict(query)`` returns
    one similarity per candidate that survived segmentation.
    '''

    def __init__(self, stop_word=StopWords):
        '''Create the matcher.

        :param stop_word: zero-argument callable (the ``StopWords`` class by
            default). The object it returns is stored on ``self.stop_word``
            and its ``del_stopwords`` method is used when stop-word removal
            is enabled in ``_seg_word``.
        '''
        self.stop_word = stop_word()

    def init(self, words_list=None, update=True):
        '''Segment and store the candidate texts that ``predict`` compares to.

        :param words_list: iterable of candidate strings. ``None`` (the
            default) is treated as an empty collection instead of failing
            while iterating it.
        :param update: not used by this implementation; kept so the
            signature stays compatible with existing callers.
        :return: ``self``, so calls can be chained.
        '''
        candidates = [] if words_list is None else words_list
        self.word_list = self._seg_word(candidates)
        return self

    def _seg_word(self, words_list, jieba_flag=conf.JIEBA_FLAG, del_stopword=conf.DEL_STOPWORD):
        '''Turn each input text into a single space-joined token string.

        :param words_list: iterable of texts. Entries that are not ``str`` or
            are the empty string are skipped.
        :param jieba_flag: when true, tokens come from ``jieba.cut``;
            otherwise the text is iterated character by character.
        :param del_stopword: when true, the tokens (the ``jieba.cut`` result,
            or the raw string when ``jieba_flag`` is false) are passed
            through ``self.stop_word.del_stopwords`` first.
        :return: list of space-joined token strings, one per kept entry.

        NOTE(review): because invalid entries are dropped, the result can be
        shorter than ``words_list``, so positions in ``predict``'s output
        refer to the kept entries only - confirm callers account for this.
        '''
        segmented = []
        for words in words_list:
            # Same filter as before, written once instead of in every branch.
            if not isinstance(words, str) or words == '':
                continue
            tokens = jieba.cut(words) if jieba_flag else words
            if del_stopword:
                tokens = self.stop_word.del_stopwords(tokens)
            segmented.append(' '.join(tokens))
        return segmented

    def edit_sim(self, s1, s2):
        '''Return ``1 - levenshtein(s1, s2) / max(len(s1), len(s2))``.

        :param s1: first sequence (``predict`` passes space-joined strings).
        :param s2: second sequence.
        :return: float similarity; ``1.0`` means the inputs are identical.
        '''
        max_len = max(len(s1), len(s2))
        if max_len == 0:
            # Two empty inputs are identical; avoids dividing by zero.
            return 1.0
        dis = distance.levenshtein(s1, s2)
        return 1 - dis * 1.0 / max_len

    def predict(self, words):
        '''Compute the similarity of ``words`` to every stored candidate.

        :param words: query text.
        :return: list of floats in the same order as ``self.word_list``.

        A query that ``_seg_word`` filters out (empty or not a ``str``) is
        compared as the empty string rather than raising ``IndexError``.
        '''
        segmented = self._seg_word([words])
        query = segmented[0] if segmented else ''
        return [self.edit_sim(candidate, query) for candidate in self.word_list]