-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpre_process_funcs.py
executable file
·56 lines (48 loc) · 1.61 KB
/
pre_process_funcs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# Features #
# 1) wordpunct tokenizer #
# 2) Porter stemmer #
# 3) Counts n-grams of every size from n_min to n_max (e.g. up to trigrams) #
from collections import defaultdict

from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

try:
    from nltk import ingrams
except ImportError:
    # NLTK 3 removed `ingrams`; `ngrams` is the lazy replacement there.
    from nltk.util import ngrams as ingrams
def _remove_stopwords_lowercase(words):
    """
    Lowercase tokens and drop English stopwords and non-alphabetic tokens.

    Arguments:
    `words` - iterable of string tokens

    Returns a list, in input order, of the lowercased tokens that are
    purely alphabetic and whose lowercase form is not in the English
    stopword list.
    """
    # Fetch the stopword list once and keep it as a set: calling
    # stopwords.words() inside the filter would repeat the call for every
    # token and then scan its result linearly.
    english_stopwords = set(stopwords.words('english'))
    kept = []
    for word in words:
        lowered = word.lower()
        # isalpha() is tested on the original token, exactly as before.
        if lowered not in english_stopwords and word.isalpha():
            kept.append(lowered)
    return kept
def _stemmer(words):
    """
    Return the Porter stem of every word, preserving order.

    Arguments:
    `words` - iterable of string tokens

    Returns a list with one stemmed string per input word.
    """
    # One stemmer instance is shared by all words in this call.
    stem = PorterStemmer().stem
    return list(map(stem, words))
def make_ngram_dict(string, n_min=1, n_max=1):
    """
    Build an n-gram count dictionary for a single string.

    The string is tokenized with wordpunct_tokenize, filtered and
    lowercased by _remove_stopwords_lowercase, and stemmed by _stemmer.
    Every n-gram of each size from `n_min` through `n_max` is then
    counted, keyed by its tokens joined with single spaces.

    Arguments:
    `string` - text to process
    `n_min` - smallest n-gram size to count
    `n_max` - largest n-gram size to count (inclusive)

    Returns a defaultdict(int) mapping each n-gram string to its count;
    it is empty when `n_min` is greater than `n_max`.
    """
    stems = _stemmer(
        _remove_stopwords_lowercase(wordpunct_tokenize(string)))
    counts = defaultdict(int)
    for size in range(n_min, n_max + 1):
        for gram in ingrams(stems, size):
            counts[' '.join(gram)] += 1
    return counts
def gen_feature_dict(list_of_strings, n_min=1, n_max=1):
    """
    Build one n-gram count dictionary per input string.

    Arguments:
    `list_of_strings` - iterable of strings to pre-process
    `n_min` - minimum size ngram
    `n_max` - maximum size ngram

    Returns a list, in input order, of the dictionaries produced by
    make_ngram_dict. The list is meant to be passed to scikit-learn's
    DictVectorizer.
    """
    return [make_ngram_dict(text, n_min, n_max) for text in list_of_strings]