-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreparation.py
76 lines (60 loc) · 2.48 KB
/
preparation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import os
try:
import pandas as pd
except ImportError:
print("You need to install pandas")
exit()
try:
import seaborn as sns
except ImportError:
print("You need to install seaborn")
exit()
try:
import matplotlib.pyplot as plt
except ImportError:
print("You need to install matplotlib")
exit()
try:
import numpy as np
except ImportError:
print("You need to install numpy")
exit()
try:
from imblearn.under_sampling import RandomUnderSampler
except ImportError:
print("You need to install imbalanced-learn")
exit()
def load_data(features_path, labels_path):
    """Read the feature table and the label table from two CSV files.

    Each path is made absolute with ``os.path.abspath`` before it is handed
    to ``pandas.read_csv``. The features file is read first, then the labels.

    Args:
        features_path: Path to the CSV file holding the features.
        labels_path: Path to the CSV file holding the labels.

    Returns:
        A ``(features, labels)`` tuple of DataFrames.
    """
    features_frame, labels_frame = (
        pd.read_csv(os.path.abspath(csv_path))
        for csv_path in (features_path, labels_path)
    )
    return features_frame, labels_frame
def save_correlation_matrix(df, labels, filename):
    """Draw the correlation matrix of features plus label and save it as an image.

    A copy of ``df`` receives an extra ``'Label'`` column filled from
    ``labels``; the correlation matrix of that combined frame is rendered as
    a seaborn heatmap and written to ``filename``. ``df`` itself is not
    modified because all work happens on the copy.

    Args:
        df: Feature table to correlate.
        labels: Values stored in the ``'Label'`` column of the copy.
        filename: Output path handed to ``plt.savefig``.
    """
    features_with_labels = df.copy()
    features_with_labels['Label'] = labels
    corr = features_with_labels.corr()

    fig = plt.figure(figsize=(20, 20))
    try:
        sns.heatmap(corr, annot=False, cmap='coolwarm')
        plt.savefig(filename)
    finally:
        # pyplot keeps a figure registered until it is closed explicitly.
        # This function is called repeatedly, so close the figure even when
        # drawing or saving fails to avoid accumulating large open figures.
        plt.close(fig)
    print(f"Correlation matrix saved to {filename}")
def resample_data(features, labels):
    """Undersample the data set with imbalanced-learn's ``RandomUnderSampler``.

    The sampler is created with ``sampling_strategy='majority'`` and a fixed
    ``random_state`` of 17, so repeated runs on the same input use the same
    seed.

    Args:
        features: Feature table passed to ``fit_resample``.
        labels: Labels passed to ``fit_resample``.

    Returns:
        A ``(features_resampled, labels_resampled)`` tuple as produced by
        ``fit_resample``.
    """
    sampler = RandomUnderSampler(sampling_strategy='majority', random_state=17)
    balanced_features, balanced_labels = sampler.fit_resample(features, labels)
    return balanced_features, balanced_labels
def clean_correlated_features(df, threshold=0.9):
    """Drop every column that is strongly correlated with an earlier column.

    Works on the absolute correlation matrix of ``df``. Only the entries
    strictly above the diagonal are inspected, so each pair of columns is
    looked at once and the later column of a correlated pair is the one
    that gets removed.

    Args:
        df: Table whose columns are compared pairwise.
        threshold: A column is dropped when its absolute correlation with any
            preceding column is strictly greater than this value.

    Returns:
        A new DataFrame without the dropped columns; ``df`` is not modified.
    """
    abs_corr = df.corr().abs()

    # Boolean mask that is True strictly above the diagonal; everything else
    # (diagonal and lower triangle) becomes NaN after ``where``.
    above_diagonal = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
    upper_only = abs_corr.where(above_diagonal)

    # NaN never compares greater than the threshold, so masked cells are ignored.
    redundant = []
    for name in upper_only.columns:
        if (upper_only[name] > threshold).any():
            redundant.append(name)

    return df.drop(columns=redundant)
def save_data(features, labels, features_path, labels_path):
    """Write the feature and label tables to CSV files without the index.

    The features are written first, then the labels.

    Args:
        features: Table written to ``features_path``.
        labels: Table written to ``labels_path``.
        features_path: Destination of the features CSV.
        labels_path: Destination of the labels CSV.
    """
    outputs = ((features, features_path), (labels, labels_path))
    for table, destination in outputs:
        table.to_csv(destination, index=False)
if __name__ == "__main__":
    # Pipeline: load the raw tables, plot their correlations, undersample,
    # plot again, prune highly correlated features, plot once more, then
    # persist the prepared data set.
    features, labels = load_data('data/features.csv', 'data/labels.csv')
    save_correlation_matrix(features, labels, 'data/corr_matrix.png')

    features_resampled, labels_resampled = resample_data(features, labels)
    save_correlation_matrix(features_resampled, labels_resampled, 'data/corr_matrix_resampled.png')

    features = clean_correlated_features(features)
    save_correlation_matrix(features, labels, 'data/corr_matrix_cleaned.png')

    # Keep only the columns that survived the correlation pruning, so that
    # 'features_cleaned.csv' really holds the decorrelated feature set and
    # not the full, unpruned resampled table.
    features_resampled = features_resampled[features.columns]
    save_data(features_resampled, labels_resampled, 'data/features_cleaned.csv', 'data/labels_cleaned.csv')