-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathngb.py
126 lines (107 loc) · 2.79 KB
/
ngb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import re
import nltk.tokenize
# Version string for this module.
__version__ = "0.1.0"
class Counter(dict):
    """A ``dict`` mapping n-gram strings to their occurrence counts.

    Besides the plain ``dict`` API it offers in-place merging of another
    count mapping (``add``) and pruning of redundant sub-phrases
    (``remove_subphrases``).
    """

    def add(self, other):
        """Merge the counts of ``other`` into this counter, in place.

        Args:
            other: A mapping of n-gram -> count.  Counts for n-grams already
                present are summed; new n-grams are inserted.

        Returns:
            None.  The counter is modified in place.
        """
        for ngram, count in other.items():
            self[ngram] = self.get(ngram, 0) + count

    def remove_subphrases(self):
        """Delete sub-phrases that occur exactly as often as a longer phrase.

        For every key, all shorter n-grams contained in it are generated with
        ``NgramBuilder.find_ngrams`` (built without stopwords, so nothing is
        stopword-filtered).  Any such sub-phrase that is itself a key with
        the same count as the containing phrase is removed.

        Returns:
            None.  The counter is modified in place.
        """
        builder = NgramBuilder()
        # Collect first, delete afterwards: removing keys while scanning
        # would change the outcome of later comparisons (and break iteration).
        redundant = set()
        for phrase, count in self.items():
            word_count = len(phrase.split(" "))
            # Only strictly shorter n-grams (1 .. word_count - 1 words).
            for length in range(1, word_count):
                for subphrase in builder.find_ngrams(phrase, length):
                    if subphrase in self and self[subphrase] == count:
                        redundant.add(subphrase)
        for subphrase in redundant:
            del self[subphrase]
class NgramBuilder(object):
    """Extract word n-grams from free text.

    Text is lowercased, split on whitespace, and each token is stripped of
    surrounding punctuation.  N-grams that start or end with a stopword can
    optionally be filtered out.
    """

    # Punctuation stripped from both ends of every whitespace-delimited token.
    _EDGE_PUNCTUATION = ",.!|&-_()[]<>{}/\"'"

    def __init__(self, stopwords=None):
        """Create a builder.

        Args:
            stopwords: Optional container of words; n-grams whose first or
                last word is in it are skipped.  ``None`` disables filtering.
        """
        self.stopwords = stopwords

    def find_ngrams(self, text, length):
        """Count all n-grams of ``length`` words in ``text``.

        Args:
            text: The input string; it is lowercased before tokenizing.
            length: Number of words per n-gram.  Values below 1 yield an
                empty result.

        Returns:
            A ``Counter`` mapping each space-joined n-gram to its number of
            occurrences.
        """
        counter = Counter()
        if length < 1:
            # A non-positive length would produce empty or nonsensical slices.
            return counter
        num_unigrams, unigrams = self.split_into_unigrams(text.lower())
        # The last valid window starts at num_unigrams - length; the range is
        # empty when the text has fewer than ``length`` unigrams.
        for start in range(num_unigrams - length + 1):
            unigram_group = unigrams[start:start + length]
            if not self.ngram_is_filtered(unigram_group):
                ngram = " ".join(unigram_group)
                counter[ngram] = counter.get(ngram, 0) + 1
        return counter

    def split_into_unigrams(self, text):
        """Tokenize ``text`` on whitespace and normalize every token.

        Tokens rejected by ``token_to_unigram`` are dropped.

        Returns:
            A ``(count, unigrams)`` tuple where ``unigrams`` is the list of
            accepted tokens and ``count`` is its length.
        """
        tokens = nltk.tokenize.WhitespaceTokenizer().tokenize(text)
        candidates = (self.token_to_unigram(token) for token in tokens)
        unigrams = [unigram for unigram in candidates if unigram]
        return len(unigrams), unigrams

    def token_to_unigram(self, token):
        """Strip surrounding whitespace/punctuation from ``token``.

        Returns:
            The cleaned token, or ``None`` when the result is a single
            character or contains no alphabetic character at all (which
            covers empty and digits-only tokens).
        """
        token = token.strip().strip(self._EDGE_PUNCTUATION).strip()
        if len(token) == 1 or not any(char.isalpha() for char in token):
            return None
        return token

    def ngram_starts_or_ends_in_stopword(self, unigrams):
        """Return True if the first or last word of ``unigrams`` is a stopword.

        Returns False when no stopwords are configured or when ``unigrams``
        is empty (an empty n-gram has no first or last word to test).
        """
        if self.stopwords is None or not unigrams:
            return False
        return unigrams[0] in self.stopwords or unigrams[-1] in self.stopwords

    def ngram_is_filtered(self, unigrams):
        """Return True if the n-gram given as a word list should be skipped."""
        return self.ngram_starts_or_ends_in_stopword(unigrams)
# Default English stopword list, suitable for passing to
# NgramBuilder(stopwords=...).
stopwords = {
    "a", "all", "an", "and", "are", "as", "at",
    "be", "but", "by",
    "can",
    "do",
    "for", "from",
    "had", "has", "have", "he", "his",
    "if", "in", "is", "it", "its", "it's",
    "my",
    "no", "not",
    "of", "on", "or", "our",
    "so",
    "that", "the", "their", "these", "they", "this", "to",
    "us",
    "was", "we", "were", "when", "where", "which", "who", "with", "would",
    "you",
}