DataPreparation.m
classdef DataPreparation
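% DataPreparation  Static helpers for cleaning, encoding, splitting,
% normalizing, and dimensionality-reducing a table-based dataset.
%
% The sketch below is a hypothetical usage example, not part of this
% class: it assumes a Utils class exposing categoricalFeatures,
% numericalFeatures, and targetFeature, and a CSV file whose name is a
% placeholder.
%
%   dataset = readtable('data.csv');
%   dataset = DataPreparation.dataCleaning(dataset);
%   dataset = DataPreparation.featureEngineering(dataset);
%   [trainingSet, testSet] = DataPreparation.trainTestSplit(dataset);
%   [xTrain, yTrain] = DataPreparation.featureSelection(trainingSet);
%   [xTest, yTest] = DataPreparation.featureSelection(testSet);
%   [xTrain, xTest] = DataPreparation.principalComponentAnalysis(xTrain, xTest);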
methods (Static)
function dataset = dataCleaning(dataset)
% Data Cleaning
% remove rows with 10 or more missing values
threshold = 10;
missingValuesPerRow = sum(ismissing(dataset), 2); % sum along columns (dim 2)
rowsToRemove = missingValuesPerRow >= threshold;
dataset = dataset(~rowsToRemove, :);
% fill rows with missing data for categorical features
for i = 1 : numel(Utils.categoricalFeatures)
currentFeature = Utils.categoricalFeatures(i);
fillingValue = mode(dataset.(currentFeature));
dataset(:, currentFeature) = fillmissing(dataset(:, currentFeature), 'constant', fillingValue);
end
% fill rows with missing data for numerical features
for i = 1 : numel(Utils.numericalFeatures)
currentFeature = Utils.numericalFeatures(i);
fillingValue = mean(dataset.(currentFeature), 'omitnan');
dataset(:, currentFeature) = fillmissing(dataset(:, currentFeature), 'constant', fillingValue);
end
% outlier removal
for i = 1 : numel(Utils.numericalFeatures)
currentFeature = Utils.numericalFeatures(i);
% compute iqr for current column
q75 = prctile(dataset.(currentFeature), 75, 'all');
q25 = prctile(dataset.(currentFeature), 25, 'all');
iqrValues = q75 - q25;
% values farther than threshold * IQR from the median are outliers
threshold = 3;
% indexes of outliers
outliersIndices = abs(dataset.(currentFeature) - median(dataset.(currentFeature))) > threshold * iqrValues;
% outlier removal
dataset.(currentFeature)(outliersIndices) = NaN;
dataset = rmmissing(dataset);
end
% preview the final dataset
disp(head(dataset));
summary(dataset);
end
function dataset = featureEngineering(dataset)
% convert categorical features to numerical
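% grp2idx maps each category to a positive integer index; missing or
% undefined categories map to NaN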
for i = 1 : numel(Utils.categoricalFeatures)
featureName = Utils.categoricalFeatures(i);
dataset.(featureName) = grp2idx(dataset.(featureName));
end
end
function [trainingSet, testSet] = zscoreNormalization(trainingSet, testSet)
% z-score normalization
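% statistics are computed on the training set only, so no test-set
% information leaks into the scaling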
meanTrain = mean(trainingSet{:, Utils.numericalFeatures});
stdTrain = std(trainingSet{:, Utils.numericalFeatures});
trainingSet{:, Utils.numericalFeatures} = (trainingSet{:, Utils.numericalFeatures} - meanTrain) ./ stdTrain;
testSet{:, Utils.numericalFeatures} = (testSet{:, Utils.numericalFeatures} - meanTrain) ./ stdTrain;
end
function [trainingSet, testSet] = trainTestSplit(dataset)
% hold-out train-test split: 80% training, 20% test
cv = cvpartition(size(dataset, 1), 'HoldOut', 0.2);
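% note: cvpartition assigns rows at random; call rng(seed) beforehand
% for a reproducible split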
trainingSet = dataset(training(cv), :);
testSet = dataset(test(cv), :);
% z-score normalization
[trainingSet, testSet] = DataPreparation.zscoreNormalization(trainingSet, testSet);
end
function [x, y] = featureSelection(dataset)
% feature selection
allFeatures = dataset.Properties.VariableNames;
% select all features except the target ('stable' keeps the original column order)
includedFeatures = setdiff(allFeatures, Utils.targetFeature, 'stable');
% predictor variables
x = table2array(dataset(:, includedFeatures));
% target variable
y = table2array(dataset(:, Utils.targetFeature));
end
function [xTrainReduced, xTestReduced] = principalComponentAnalysis(xTrain, xTest)
% PCA
[coeff, ~, ~, ~, explained, mu] = pca(xTrain);
% choose the number of principal components that retains 95% of variance
desiredVariance = 95;
numComponents = find(cumsum(explained) >= desiredVariance, 1);
% project both sets onto the selected components; pca centers its input,
% so subtract the training mean before projecting
xTrainReduced = (xTrain - mu) * coeff(:, 1:numComponents);
xTestReduced = (xTest - mu) * coeff(:, 1:numComponents);
end
end
end