-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodel.py
626 lines (526 loc) · 20.2 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
import numbers
import numpy as np
from numpy.lib.function_base import iterable
from sklearn.base import ClassifierMixin
from sklearn.utils.multiclass import type_of_target
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils import check_random_state
from groot.model import BaseGrootTree, CompiledTree, Node, NumericalNode
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import maximum_bipartite_matching
# Much of this code is adapted from the GROOT code at:
# https://github.com/tudelft-cda-lab/GROOT/blob/main/groot/model.py
_TREE_LEAF = -1
_TREE_UNDEFINED = -2
LEFT = 0
LEFT_INTERSECT = 1
RIGHT_INTERSECT = 2
RIGHT = 3
NOGIL = True
def _attack_model_to_tuples(attack_model, n_features):
if isinstance(attack_model, numbers.Number):
return [(attack_model, attack_model) for _ in range(n_features)]
elif iterable(attack_model):
new_attack_model = []
for attack_mode in attack_model:
if attack_mode == "":
new_attack_model.append((0, 0))
elif attack_mode == ">":
new_attack_model.append((0, 10e9))
elif attack_mode == "<":
new_attack_model.append((10e9, 0))
elif attack_mode == "<>":
new_attack_model.append((10e9, 10e9))
elif isinstance(attack_mode, numbers.Number):
new_attack_model.append((attack_mode, attack_mode))
elif isinstance(attack_mode, tuple) and len(attack_mode) == 2:
new_attack_model.append(attack_mode)
else:
raise Exception("Unknown attack model spec:", attack_mode)
return new_attack_model
else:
raise Exception(
"Unknown attack model spec, needs to be perturbation radius or perturbation"
" per feature:",
attack_model,
)
def _scan_numerical_feature_matching(
samples,
y,
dec,
inc,
left_bound,
right_bound,
samples_reachable,
leaves_reachable,
original_leaf,
new_leaf,
i_0,
i_1,
):
sort_order = samples.argsort()
sorted_labels = y[sort_order]
sample_queue = samples[sort_order]
dec_queue = sample_queue - dec
inc_queue = sample_queue + inc
# Initialize queue values and indices
dec_i = inc_i = 0
dec_val = dec_queue[0]
inc_val = inc_queue[0]
best_score = 10e9
best_split = None
score = None
while True:
# Find the current point and label from the queue with smallest value.
if dec_val < inc_val:
point = dec_val
label = sorted_labels[dec_i]
# If the sample could reach the original leaf before,
# then now it can also reach the new leaf
orig_sample_i = sort_order[dec_i]
leaves_reachable[orig_sample_i, new_leaf] = leaves_reachable[
orig_sample_i, original_leaf
]
# Update dec_val and i to the values belonging to the next
# sample in queue. If we reached the end of the queue then store
# a high number to make sure the dec_queue does not get picked
if dec_i < dec_queue.shape[0] - 1:
dec_i += 1
dec_val = dec_queue[dec_i]
else:
dec_val = 10e9
else:
point = inc_val
label = sorted_labels[inc_i]
# If the sample cannot reach the original leaf anymore are it
# is too far from the decision boundary
orig_sample_i = sort_order[inc_i]
leaves_reachable[orig_sample_i, original_leaf] = False
# Update inc_val and i to the values belonging to the next
# sample in queue. If we reached the end of the queue then store
# a high number to make sure the inc_queue does not get picked
if inc_i < inc_queue.shape[0] - 1:
inc_i += 1
inc_val = inc_queue[inc_i]
else:
inc_val = 10e9
if label == 0:
# If the label of this sample is 0 then update its reachability
# to all samples of label 1
i_curr_sample = np.searchsorted(i_0, orig_sample_i)
# assert orig_sample_i in i_0
curr_sample_reachable_leaves = leaves_reachable[orig_sample_i, :]
for j, original_index_1 in enumerate(i_1):
if np.any(
curr_sample_reachable_leaves & leaves_reachable[original_index_1, :]
):
samples_reachable[i_curr_sample, j] = 1
else:
samples_reachable[i_curr_sample, j] = 0
else:
# If the label of this sample is 1 then update its reachability
# to all samples of label 0
i_curr_sample = np.searchsorted(i_1, orig_sample_i)
# assert orig_sample_i in i_1
curr_sample_reachable_leaves = leaves_reachable[orig_sample_i, :]
for i, original_index_0 in enumerate(i_0):
if np.any(
curr_sample_reachable_leaves & leaves_reachable[original_index_0, :]
):
samples_reachable[i, i_curr_sample] = 1
else:
samples_reachable[i, i_curr_sample] = 0
# print(list(leaves_reachable[:, 0]), list(leaves_reachable[:, 1]))
# print(samples_reachable.todense())
if point >= right_bound:
break
if point < left_bound:
continue
# If the next point is not the same as this one
next_point = min(dec_val, inc_val)
if next_point != point:
maximum_matching = maximum_bipartite_matching(csr_matrix(samples_reachable))
score = np.sum(maximum_matching != -1)
# print(point, score)
# Maximize the margin of the split
split = (point + next_point) * 0.5
if score is not None and score < best_score:
best_score = score
best_split = split
return best_score, best_split
class RelabelingCriterionTreeClassifier(BaseGrootTree, ClassifierMixin):
"""
A robust decision tree for binary classification.
"""
def __init__(
self,
max_depth=5,
min_samples_split=2,
min_samples_leaf=1,
max_features=None,
robust_weight=1.0,
attack_model=None,
one_adversarial_class=False,
chen_heuristic=False,
compile=True,
random_state=None,
):
"""
Parameters
----------
max_depth : int, optional
The maximum depth for the decision tree once fitted.
min_samples_split : int, optional
The minimum number of samples required to split a node.
min_samples_leaf : int, optional
The minimum number of samples required to make a leaf.
max_features : int or {"sqrt", "log2"}, optional
The number of features to consider while making each split, if None then all features are considered.
robust_weight : float, optional
The ratio of samples that are actually moved by an adversary.
attack_model : array-like of shape (n_features,), optional
Attacker capabilities for perturbing X. By default, all features are considered not perturbable.
one_adversarial_class : bool, optional
Whether one class (malicious, 1) perturbs their samples or if both classes (benign and malicious, 0 and 1) do so.
chen_heuristic : bool, optional
Whether to use the heuristic for the adversarial Gini impurity from Chen et al. (2019) instead of GROOT's adversarial Gini impurity.
compile : bool, optional
Whether to compile the tree for faster predictions.
random_state : int, optional
Controls the sampling of the features to consider when looking for the best split at each node.
Attributes
----------
classes_ : ndarray of shape (n_classes,)
The class labels.
max_features_ : int
The inferred value of max_features.
n_samples_ : int
The number of samples when `fit` is performed.
n_features_ : int
The number of features when `fit` is performed.
root_ : Node
The root node of the tree after fitting.
compiled_root_ : CompiledTree
The compiled root node of the tree after fitting.
"""
self.max_depth = max_depth
self.min_samples_split = min_samples_split
self.min_samples_leaf = min_samples_leaf
self.max_features = max_features
self.robust_weight = robust_weight
self.attack_model = attack_model
self.one_adversarial_class = one_adversarial_class
self.chen_heuristic = chen_heuristic
self.compile = compile
self.random_state = random_state
def _check_target(self, y):
target_type = type_of_target(y)
if target_type != "binary":
raise ValueError(
"Unknown label type: classifier only supports binary labels but found"
f" {target_type}"
)
self.classes_, y = np.unique(y, return_inverse=True)
self.n_classes_ = len(self.classes_)
return y
def _create_leaf(self, y):
"""
Create a leaf object that predicts according to the ratio of benign
and malicious labels in the array y.
"""
# Count the number of points that fall into this leaf including
# adversarially moved points
label_counts = np.bincount(y, minlength=2)
# Set the leaf's prediction value to the weighted average of the
# prediction with and without moving points
value = label_counts / np.sum(label_counts)
return Node(_TREE_UNDEFINED, _TREE_LEAF, _TREE_LEAF, value)
def fit(self, X, y, check_input=True):
"""
Build a robust and fair binary decision tree from the training set
(X, y) using greedy splitting according to the weighted adversarial
Gini impurity and fairness impurity.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The training samples.
y : array-like of shape (n_samples,)
The class labels as integers 0 (benign) or 1 (malicious)
Returns
-------
self : object
Fitted estimator.
"""
if check_input:
X, y = check_X_y(X, y)
y = self._check_target(y)
self.n_samples_, self.n_features_in_ = X.shape
if self.attack_model is None:
attack_model = [""] * X.shape[1]
else:
attack_model = self.attack_model
# Turn numerical features in attack model into tuples to make fitting
# code simpler
self.attack_model_ = np.array(
_attack_model_to_tuples(attack_model, X.shape[1]), dtype=X.dtype
)
self.random_state_ = check_random_state(self.random_state)
if self.max_features == "sqrt":
self.max_features_ = int(np.sqrt(self.n_features_in_))
elif self.max_features == "log2":
self.max_features_ = int(np.log2(self.n_features_in_))
elif self.max_features is None:
self.max_features_ = self.n_features_in_
else:
self.max_features_ = self.max_features
if self.max_features_ == 0:
self.max_features_ = 1
# Keep track of the minimum and maximum split value for each feature
constraints = np.concatenate(
(np.min(X, axis=0).reshape(-1, 1), np.max(X, axis=0).reshape(-1, 1)), axis=1
)
# samples_reachable = lil_array(np.ones(tuple(np.bincount(y)), dtype=bool))
samples_reachable = np.ones(tuple(np.bincount(y)), dtype=bool)
leaves_reachable = np.zeros(
shape=(self.n_samples_, 2**self.max_depth), dtype=np.bool
)
leaves_reachable[:, -1] = True
i_0 = np.where(y == 0)[0]
i_1 = np.where(y == 1)[0]
self.score_ = np.min(np.bincount(y))
self.root_ = self.__fit_recursive(
X, y, constraints, samples_reachable, leaves_reachable, i_0, i_1
)
# Compile the tree into a representation that is faster when predicting
if self.compile:
self.compiled_root_ = CompiledTree(self.root_)
return self
def __fit_recursive(
self,
X,
y,
constraints,
samples_reachable,
leaves_reachable,
i_0,
i_1,
depth=0,
node_id=1,
):
"""
Recursively fit the decision tree on the training dataset (X, y).
The constraints make sure that leaves are well formed, e.g. don't
cross an earlier split. Stop when the depth has reached self.max_depth,
when a leaf is pure or when the leaf contains too few samples.
"""
if (
(self.max_depth is not None and depth == self.max_depth)
or len(y) < self.min_samples_split
or np.all(y == y[0])
):
return self._create_leaf(y)
original_leaf = node_id
# Follow all the right decisions for nodes that do not exist yet,
# all the way to the leaves
for _ in range(self.max_depth - depth):
original_leaf = original_leaf * 2 + 1
original_leaf -= 2**self.max_depth
new_leaf = 2 * node_id
for _ in range(self.max_depth - depth - 1):
new_leaf = new_leaf * 2 + 1
new_leaf -= 2**self.max_depth
# print(depth, self.max_depth, original_leaf, new_leaf)
rule, feature, split_score = self.__best_adversarial_decision(
X,
y,
constraints,
samples_reachable,
leaves_reachable,
original_leaf,
new_leaf,
i_0,
i_1,
)
score_gain = self.score_ - split_score
# print(f"Score gain: {score_gain} ({split_score})")
if rule is None or score_gain <= 0.00:
return self._create_leaf(y)
self.score_ = split_score
# Assert that the split obeys constraints made by previous splits
assert rule >= constraints[feature][0]
assert rule < constraints[feature][1]
self.__update_reachability(
X,
samples_reachable,
leaves_reachable,
original_leaf,
new_leaf,
i_0,
i_1,
rule,
feature,
)
# Set the right bound and store old one for after recursion
old_right_bound = constraints[feature][1]
constraints[feature][1] = rule
left_node = self.__fit_recursive(
X,
y,
constraints,
samples_reachable,
leaves_reachable,
i_0,
i_1,
depth + 1,
node_id * 2,
)
# Reset right bound, set left bound, store old one for after recursion
constraints[feature][1] = old_right_bound
old_left_bound = constraints[feature][0]
constraints[feature][0] = rule
right_node = self.__fit_recursive(
X,
y,
constraints,
samples_reachable,
leaves_reachable,
i_0,
i_1,
depth + 1,
node_id * 2 + 1,
)
# Reset the left bound
constraints[feature][0] = old_left_bound
node = NumericalNode(feature, rule, left_node, right_node, _TREE_UNDEFINED)
return node
def __best_adversarial_decision(
self,
X,
y,
constraints,
samples_reachable,
leaves_reachable,
original_leaf,
new_leaf,
i_0,
i_1,
):
"""
Find the best split by iterating through each feature and scanning
it for that feature's optimal split.
"""
best_score = 10e9
best_rule = None
best_feature = None
# If there is a limit on features to consider in a split then choose
# that number of random features.
all_features = np.arange(self.n_features_in_)
features = self.random_state_.choice(
all_features, size=self.max_features_, replace=False
)
for feature in features:
# score, decision_rule = self._scan_feature(X, y, feature, constraints)
samples = X[:, feature]
attack_mode = self.attack_model_[feature]
constraint = constraints[feature]
score, decision_rule = _scan_numerical_feature_matching(
samples,
y,
*attack_mode,
*constraint,
samples_reachable.copy(),
leaves_reachable.copy(),
original_leaf,
new_leaf,
i_0,
i_1,
)
# print(feature, score, decision_rule)
if decision_rule is not None and score < best_score:
best_score = score
best_rule = decision_rule
best_feature = feature
return best_rule, best_feature, best_score
def __update_reachability(
self,
X,
samples_reachable,
leaves_reachable,
original_leaf,
new_leaf,
i_0,
i_1,
rule,
feature,
):
dec, inc = self.attack_model_[feature]
for i, feature_value in enumerate(X[:, feature]):
if leaves_reachable[i, original_leaf]:
if feature_value - dec <= rule:
leaves_reachable[i, new_leaf] = True
if feature_value + inc <= rule:
leaves_reachable[i, original_leaf] = False
for i, original_index_0 in enumerate(i_0):
for j, original_index_1 in enumerate(i_1):
if np.any(
leaves_reachable[original_index_0, :]
& leaves_reachable[original_index_1, :]
):
samples_reachable[i, j] = 1
else:
samples_reachable[i, j] = 0
def predict_proba(self, X):
"""
Predict class probabilities of the input samples X.
The class probability is the fraction of samples of the same class in
the leaf.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The input samples to predict.
Returns
-------
proba : array of shape (n_samples,)
The probability for each input sample of being malicious.
"""
check_is_fitted(self, "root_")
X = check_array(X)
if X.shape[1] != self.n_features_in_:
raise ValueError(
"Received different number of features during predict than during fit"
)
# If model has been compiled, use compiled predict_proba
if self.compile:
return self.compiled_root_.predict_classification_proba(X)
predictions = []
for sample in X:
predictions.append(self.root_.predict(sample))
predictions = np.array(predictions)
predictions /= np.sum(predictions, axis=1)[:, np.newaxis]
return predictions
def predict(self, X):
"""
Predict the classes of the input samples X.
The predicted class is the most frequently occuring class label in a
leaf.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The input samples to predict.
Returns
-------
y : array-like of shape (n_samples,)
The predicted class labels
"""
# If model has been compiled, use compiled predict
if self.compile:
check_is_fitted(self, "root_")
X = check_array(X)
if X.shape[1] != self.n_features_in_:
raise ValueError(
"Received different number of features during predict than"
" during fit"
)
return self.classes_.take(self.compiled_root_.predict_classification(X))
y_pred_proba = self.predict_proba(X)
return self.classes_.take(np.argmax(y_pred_proba, axis=1))