-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean.py
69 lines (53 loc) · 2.44 KB
/
clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import os
from download import download_dataset
try:
import pandas as pd
except ImportError:
print("You need to install pandas")
exit()
try:
import numpy as np
except ImportError:
print("You need to install numpy")
exit()
# Similarity cut-off, in percent. A column is dropped by load_and_clean_data()
# when its score from get_percentage_columns_similarity() is strictly greater
# than this value; the 'Label' column is never dropped.
# TODO: the original author suggested also trying 90.
COLUMN_SIMILARITY_THRESHOLD = 95
def get_percentage_columns_similarity(df):
    """Score each column of *df* by how repetitive its values are.

    The score is ``(1 - distinct / rows) * 100`` where ``distinct`` is the
    per-column result of ``DataFrame.nunique()`` and ``rows`` is ``len(df)``.
    A column in which every row holds a different value scores 0; the more
    values repeat, the closer the score gets to 100.

    Args:
        df: The pandas DataFrame to analyse.

    Returns:
        A pandas Series indexed by column name holding the score in percent.
    """
    row_count = len(df)
    distinct_share = df.nunique() / row_count
    return 100 * (1 - distinct_share)
def load_and_clean_data(data_path=None):
    """Load every CSV found under *data_path* and return the cleaned rows.

    Cleaning steps, in order:
      1. strip surrounding whitespace from the column names of each file;
      2. concatenate all files into one frame;
      3. drop every column (except 'Label') whose similarity score is above
         COLUMN_SIMILARITY_THRESHOLD;
      4. turn +/-inf into NaN and drop the rows that contain NaN;
      5. drop duplicated rows;
      6. keep only rows whose 'Label' contains 'DoS' or 'BENIGN'
         (substring match, so any label containing 'DoS' is kept).

    Args:
        data_path: Directory walked recursively for ``.csv`` files. When
            ``None`` (the default) the value returned by
            ``download_dataset()`` is used. The call is made here rather
            than in the signature so that importing this module does not
            trigger it.

    Returns:
        pandas.DataFrame: the cleaned data, still containing the textual
        'Label' column.

    Raises:
        FileNotFoundError: if no ``.csv`` file exists under *data_path*.
    """
    if data_path is None:
        data_path = download_dataset()

    # Collect the frames first and concatenate once: concatenating inside the
    # loop re-copies the accumulated data for every file.
    frames = []
    for root, _dirs, files in os.walk(data_path):
        for file in files:
            if file.endswith(".csv"):
                frame = pd.read_csv(os.path.join(root, file))
                # Normalise headers per file so that files differing only by
                # header whitespace line up on the same columns.
                frame.columns = frame.columns.str.strip()
                frames.append(frame)

    if not frames:
        raise FileNotFoundError(f"No .csv file found under {data_path!r}")

    global_df = pd.concat(frames, ignore_index=True)

    # Drop columns according to similarity threshold
    similarity = get_percentage_columns_similarity(global_df)
    columns_to_drop = similarity[similarity > COLUMN_SIMILARITY_THRESHOLD].index
    # Don't drop the label column. errors='ignore' because 'Label' is only in
    # the candidate list when its own score exceeds the threshold.
    columns_to_drop = columns_to_drop.drop('Label', errors='ignore')
    global_df = global_df.drop(columns=columns_to_drop)

    global_df = global_df.replace([float('-inf'), float('inf')], float('nan')).dropna()

    # drop duplicates
    global_df = global_df.drop_duplicates()

    # Drop all lines where the label is not BENIGN or does not contain "DoS"
    label = global_df['Label']
    keep = label.str.contains('DoS') | label.str.contains('BENIGN')
    return global_df[keep]
def save_features_and_labels(global_df, features_path='data/features.csv', labels_path='data/labels.csv'):
    """Split *global_df* into features and binary labels and write both as CSV.

    Labels are encoded as 0 when 'Label' equals 'BENIGN' and 1 otherwise.
    The input frame is left untouched, so the function can be called several
    times on the same frame and always produces the same files.

    Args:
        global_df: DataFrame containing a 'Label' column plus feature columns.
        features_path: Destination CSV for every column except 'Label'.
        labels_path: Destination CSV for the encoded 'Label' column.

    Returns:
        None. Two CSV files are written (without the index) and a summary is
        printed.
    """
    # Build the encoded labels as a new Series instead of overwriting the
    # caller's column: overwriting made a second call label every row as 1.
    labels = (~global_df['Label'].isin(['BENIGN'])).astype(int)
    print("Labels count : ", labels.value_counts())

    # Separate label column from features
    features = global_df.drop(columns=['Label'])

    # Create the parent directories if needed. A bare filename has an empty
    # dirname, which os.makedirs rejects, so it is skipped.
    for path in (features_path, labels_path):
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)

    # Save to csv
    features.to_csv(features_path, index=False)
    labels.to_csv(labels_path, index=False)

    print(f"Features and labels saved to {features_path} and {labels_path}")
if __name__ == "__main__":
    # Script entry point: load and clean the dataset, then write the
    # features/labels CSV files to their default locations.
    save_features_and_labels(load_and_clean_data())