#!/usr/bin/python
import re
import os, os.path, sys
import gzip
from math import log10, sqrt
from porter import PorterStemmer
class Corpus(object):
    """A class to represent a corpus of text.
    Initialised with no parameters.
    Class constants:
    WORD_PATT -- determines what sort of terms the corpus retains
    STOPFILE  -- the filename of the file containing the stop words
    STOPWORDS -- the set of stop words extracted from the file
    """
    WORD_PATT = re.compile(r'[A-Za-z]+')  # runs of letters; case is handled by lower()
    STOPFILE = 'english.stop'
    STOPWORDS = set()
    def __init__(self):
        """Create a new, empty corpus of text."""
        self.doc_count = 0
        self.words = WordList()
        self.removed = []
        self._populate_stopw()
    def _populate_stopw(self):
        """A helper method to extract the stop words from the stop words file."""
        f_handle = open(self.STOPFILE, 'rU')
        for word in f_handle.read().splitlines():
            self.STOPWORDS.add(word)
        f_handle.close()
    def get_stopwords(self):
        """Return the set of stop words used, as a Python list."""
        return list(self.STOPWORDS)
    def get_removed(self):
        """Return the list of words removed during stop word removal."""
        return self.removed
    def get_words(self):
        """Get all the Word objects for this corpus."""
        return self.words.get_words()
    def get_word(self, word):
        """Return the Word object corresponding to the specified word."""
        return self.words.get_word(word)
    def get_df(self, word):
        """Get the document frequency (DF) for a particular word: the number
        of documents in the collection in which the word appears.
        If the word is not found, return 0."""
        word_obj = self.words.get_word(word)
        if word_obj:
            return word_obj.get_df()
        else:
            return 0
    def add(self, text, fname, stem=False):
        """Add a string of text to the corpus by first splitting it into
        features defined by WORD_PATT, then removing stop words and
        (optionally) stemming what remains.
        Arguments:
        text  -- the string of text to add
        fname -- the name of the file the text came from
        stem  -- if True, stem each word with the Porter stemmer
        """
        stemmer = PorterStemmer()
        for match in re.finditer(self.WORD_PATT, text):
            word = match.group(0).lower()
            if word in self.STOPWORDS:
                # Stop words are recorded for inspection but not indexed.
                self.removed.append(word)
                continue
            if stem:
                word = stemmer.stem(word, 0, len(word) - 1)
            self.words.add_word(word, fname)
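
# For example (assuming 'the' and 'over' appear in english.stop), adding the
# text 'The cat jumped over the wall' with stem=True records the Porter stems
# cat/jump/wall against fname, and appends 'the', 'over' and 'the' to the
# removed list.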
class Word(object):
    def __init__(self, word, fname):
        """Create a new word object, for use in the Corpus.
        Arguments:
        word  -- the actual word that this object holds
        fname -- the name of the file that this word first appeared in
        """
        self.freqs = {}
        self.word = word
        self.freqs[fname] = 1
        self.cls_count = {'spam': 0, 'nonspam': 0}
        self.update_clscount(fname)
    def update_freq(self, fname):
        """Update the frequency of this word.
        If the word has already been seen in the file, increment the counter.
        If not, add a new entry to record that this word has been seen in the
        file called fname."""
        if fname in self.freqs:
            self.freqs[fname] += 1
        else:
            self.freqs[fname] = 1
    def update_clscount(self, fname):
        """Update the frequency of this word in either the spam or nonspam
        category, for the purposes of implementing categorical proportional
        difference (CPD) feature selection. Spam files are identified by
        'spmsg' in their filename."""
        if 'spmsg' in fname:
            self.cls_count['spam'] += 1
        else:
            self.cls_count['nonspam'] += 1
    def get_clscount(self, cls):
        """Get the count of this word in the category cls ('spam' or 'nonspam')."""
        return self.cls_count[cls]
    def get_df(self):
        """Get the number of documents in the corpus that the word appears in."""
        return len(self.freqs)
    def get_cpd(self):
        """Return the categorical proportional difference (CPD) of this word
        in the collection.
        CPD(w) = max_i{CPD(w, c_i)}
        CPD(w, c) = (A - B) / (A + B)
        where A is the number of times w and c occur together, and
              B is the number of times w occurs without category c.
        Each per-category score lies in (-1, 1]. With the two categories used
        here, the returned maximum lies in [0, 1]: a value near 0 means the
        word occurs about equally often in both categories, while 1 means it
        appears in only one category."""
        highest = -1.0
        for category in self.cls_count:
            A = self.get_clscount(category)
            if category == 'spam':
                B = self.get_clscount('nonspam')
            else:
                B = self.get_clscount('spam')
            cpd = float(A - B) / (A + B)
            if cpd > highest:
                highest = cpd
        return highest
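
    # A worked CPD example (illustrative figures, not from any real corpus):
    # suppose a word has cls_count = {'spam': 8, 'nonspam': 2}. Then
    #   CPD(w, spam)    = (8 - 2) / (8 + 2) = 0.6
    #   CPD(w, nonspam) = (2 - 8) / (2 + 8) = -0.6
    # so get_cpd() returns max(0.6, -0.6) = 0.6, marking the word as a
    # fairly strong spam indicator.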
    def get_freq(self, fname):
        """Get the number of occurrences of this word in a particular file."""
        if fname in self.freqs:
            return self.freqs[fname]
        else:
            return 0
    def get_text(self):
        """Get the actual word that this object holds, as a string."""
        return self.word
    def get_filenames(self):
        """Return a list of the filenames that this word appears in."""
        return self.freqs.keys()
    def __repr__(self):
        return self.word
    def __str__(self):
        return self.word
    def __eq__(self, other):
        return self.word == other.get_text()
    def __ne__(self, other):
        return self.word != other.get_text()
class WordList(object):
    def __init__(self):
        """Create a new, empty WordList to store the words of the corpus."""
        self.words = {}
    def add_word(self, word, fname):
        """Add a word to this list.
        If the word has already been seen, update its frequencies and class
        count. If it hasn't, create a new entry for it; Word.__init__ records
        the first occurrence, so the class count must not be updated again
        here, which would double-count it.
        Arguments:
        word  -- the string containing the word text
        fname -- the filename that the word appears in"""
        if word in self.words:
            word_obj = self.words[word]
            word_obj.update_freq(fname)
            word_obj.update_clscount(fname)
        else:
            word_obj = Word(word, fname)
            self.words[word] = word_obj
    def get_words(self):
        """Get all the words in this list, as a Python list of Word objects."""
        return self.words.values()
    def get_word(self, text):
        """Get the Word object with the given text.
        Returns None if the word isn't found."""
        if text in self.words:
            return self.words[text]
        return None
def preprocess(source_dir, stem=False):
    """Preprocess the source files in the directory source_dir.
    If stem=True (default False), the words are stemmed using the Porter
    stemmer.
    Returns the subject and body corpora as a 2-tuple."""
    subj_corp = Corpus()
    body_corp = Corpus()
    src_files = os.listdir(source_dir)
    for f in src_files:
        path = os.path.join(source_dir, f)
        src_f = gzip.open(path)
        content = src_f.read()
        # The first line is always the subject; the rest is the body.
        subj, body = content.split('\n', 1)
        subj, body = subj.strip(), body.strip()
        # Strip the 'Subject:' prefix. (str.lstrip would be wrong here: it
        # strips a *set of characters*, not a prefix, so it would also eat
        # leading letters of the subject itself.)
        if subj.startswith('Subject:'):
            subj = subj[len('Subject:'):].strip()
        # Add the contents of the file just read to the corpora.
        subj_corp.add(subj, f, stem)
        body_corp.add(body, f, stem)
        src_f.close()
    return subj_corp, body_corp
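
# Each gzipped source file is assumed to look like this (hypothetical
# content, following the format the code above expects):
#   Subject: cheap watches available now
#   ...message body on the remaining lines...
# Spam messages are identified by 'spmsg' appearing in the filename.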
def feature_select(corpus, method, max_no):
    """Select the top max_no features from the given corpus, ranked by the
    specified method.
    method can be:
    'df'  -- document frequency (DF) selection
    'cpd' -- categorical proportional difference (CPD) selection
    Returns a list of Word objects."""
    vals = []
    for word in corpus.get_words():
        if method == 'df':
            vals.append((word.get_df(), word))
        elif method == 'cpd':
            vals.append((word.get_cpd(), word))
    # Sort on the score alone (not the Word object), highest score first.
    vals.sort(key=lambda x: x[0], reverse=True)
    feat_list = [w[1] for w in vals[:max_no]]
    return feat_list
def write_dataset(feat_list, src_dir, sav_name):
    """Write the dataset for the given list of features as a CSV file.
    Arguments:
    feat_list -- the list of features selected by feature_select()
    src_dir   -- the source directory of files to calculate weights for
    sav_name  -- the name of the output file"""
    nfeatures = len(feat_list)
    header_list = ['f%d' % x for x in xrange(1, nfeatures + 1)]
    header_list.append('class')
    header_str = ','.join(header_list) + '\n'
    src_files = sorted(os.listdir(src_dir))
    collection_size = len(src_files)
    weights = []
    for f in src_files:
        row = []
        for word_obj in feat_list:
            row.append(str(cos_norm(word_obj, f, feat_list, collection_size)))
        # The class label is the last column of each row.
        row.append('spam' if 'spmsg' in f else 'nonspam')
        row_str = ','.join(row)
        weights.append(row_str)
    weight_str = '\n'.join(weights)
    out_f = open(sav_name, 'w')
    out_f.write(header_str + weight_str)
    out_f.close()
def tf_idf(word, fname, collection_size):
    """Compute the tf-idf score for a word in the file called fname, in a
    collection of size collection_size.
    tfidf(t_k, d_j) = #(t_k, d_j) * log(|Tr| / #Tr(t_k))"""
    score = word.get_freq(fname) * log10(float(collection_size) / word.get_df())
    return score
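
# A worked tf-idf example (illustrative figures): if a word occurs 3 times in
# a file, and appears in 10 of the 100 documents in the collection, then
#   tf_idf = 3 * log10(100 / 10) = 3 * 1.0 = 3.0
# A word that appears in every document scores 0, since log10(1) = 0.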
def cos_norm(word, fname, words, collection_size):
    """Return a cosine-normalised tf-idf score between 0 and 1.
    If all the tf-idf scores in the denominator are 0, return 0 regardless of
    the numerator."""
    numerator = tf_idf(word, fname, collection_size)
    denom = 0.0
    for w in words:  # a distinct name, so the word argument isn't rebound
        denom += tf_idf(w, fname, collection_size) ** 2
    if denom == 0:
        weight = 0
    else:
        weight = numerator / sqrt(denom)
    return weight
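
if __name__ == '__main__':
    # A minimal usage sketch of the full pipeline. The directory name
    # 'corpus/' and the output filename are hypothetical; point them at a
    # directory of gzipped messages whose first line is the Subject header.
    subj_corp, body_corp = preprocess('corpus/', stem=True)
    # Keep the 50 body features with the highest categorical proportional
    # difference, then write their cosine-normalised tf-idf weights (plus a
    # spam/nonspam class label per file) as a CSV dataset.
    features = feature_select(body_corp, 'cpd', 50)
    write_dataset(features, 'corpus/', 'body_dataset.csv')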