-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspellcheck.py
62 lines (53 loc) · 1.61 KB
/
spellcheck.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from nltk_lite.stem.porter import Porter
from nltk_lite.corpora import brown
from nltk_lite import tokenize
import sys
from collections import defaultdict
import operator
def sortby(nlist, n, reverse=0):
    """Sort ``nlist`` in place, ordering its elements by their ``n``-th item.

    nlist   -- a list of indexable elements (e.g. tuples); it is modified
               in place and nothing is returned.
    n       -- index (or key) of the item in each element to sort on.
    reverse -- truthy to sort in descending order.
    """
    pick_item = operator.itemgetter(n)
    nlist.sort(key=pick_item, reverse=reverse)
class mydict(dict):
    """A dict whose missing keys read as 0.

    Looking up an absent key yields 0 without storing the key, so
    ``d[key] += 1`` works as a counter while ``key in d`` stays False
    until a value has actually been assigned.
    """

    def __missing__(self, key):
        # Called by dict.__getitem__ when ``key`` is absent.
        default_count = 0
        return default_count
class DidYouMean:
    """Suggest spellings for a word by matching coarse spelling hashes.

    Words seen during ``learn()`` are grouped under the key returned by
    ``specialhash()``.  ``test()`` hashes a candidate word the same way and
    reports the learned words that share its key, most frequent first.
    """

    def __init__(self):
        self.stemmer = Porter()
        # Start with an empty database so that test() called before learn()
        # simply finds nothing instead of raising AttributeError.
        self.learned = {}

    def specialhash(self, s):
        """Return the key under which similar spellings of ``s`` are grouped.

        The word is lower-cased, "z" becomes "s", every "h" is dropped, each
        doubled letter is reduced to a single one, and the result is passed
        through the stemmer.
        """
        s = s.lower()
        s = s.replace("z", "s")
        s = s.replace("h", "")
        # One replace pass per letter: "ll" -> "l".  A run of three or more
        # identical letters is shortened but not necessarily to one letter.
        for letter in "abcdefghijklmnopqrstuvwxyz":
            s = s.replace(letter + letter, letter)
        return self.stemmer.stem(s)

    def test(self, token):
        """Return a human-readable verdict on the spelling of ``token``.

        Returns one of:
        - 'This word seems OK' when the (lower-cased) token was learned;
        - 'Did you mean "..." ?' with the most frequent learned word sharing
          the token's hash, followed by the other candidates if any;
        - a "not found" message when no learned word shares the hash.
        """
        # .get() avoids creating an entry when self.learned is a defaultdict.
        candidates = self.learned.get(self.specialhash(token))
        if not candidates:
            return "I can't found similar word in my learned db"

        # learn() stores words lower-cased, so compare case-insensitively;
        # otherwise "Bird" would be reported as a misspelling of "bird".
        if token.lower() in candidates:
            return 'This word seems OK'

        # sorted() works on both a Python 2 list and a Python 3 items view.
        ranked = sorted(candidates.items(),
                        key=operator.itemgetter(1), reverse=True)
        best = ranked[0][0]
        if len(ranked) == 1:
            return 'Did you mean "%s" ?' % best
        others = ", ".join('"%s"' % word for word, _count in ranked[1:])
        return 'Did you mean "%s" ? (or %s)' % (best, others)

    def learn(self, listofsentences=None, n=2000):
        """Rebuild the word database from the first ``n`` sentences.

        listofsentences -- iterable of sentences, each an iterable of word
                           strings.  When omitted (or an empty list) the
                           result of ``brown.raw()`` is used instead.
        n               -- maximum number of sentences to read.

        Any previously learned data is discarded.
        """
        self.learned = defaultdict(mydict)
        # None replaces the former mutable ``[]`` default; an explicit empty
        # list still falls back to the corpus, as before.
        if listofsentences is None or listofsentences == []:
            listofsentences = brown.raw()
        for i, sent in enumerate(listofsentences):
            if i >= n:  # Limit to the first n sentences of the corpus
                break
            for word in sent:
                self.learned[self.specialhash(word)][word.lower()] += 1
def demo():
    """Train on the default corpus and print a verdict for a few sample words."""
    d = DidYouMean()
    d.learn()
    # Sample words chosen to be relevant to the Brown corpus.
    for word in "birdd, oklaoma, emphasise, bird, carot".split(", "):
        # A single pre-formatted argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print("%s - %s" % (word, d.test(word)))
# Script entry point: run the demonstration only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    demo()