-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsc_1_compute_mean_std_from_csv.py
187 lines (134 loc) · 6.74 KB
/
sc_1_compute_mean_std_from_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
"""
Writes to csv files the mean and std values of the merged
dataframes generated from the different experimental trials.

Usage: sc_1_compute_mean_std_from_csv.py [-h] --input INPUT --output OUTPUT

Script that merges the input .csv files.

options:
  -h, --help            show this help message and exit
  --input INPUT, -i INPUT
                        path to the input directory where the csv files are stored.
  --output OUTPUT, -o OUTPUT
                        path to the output directory where the generated csv files will be stored.

Author:
    A.J. Sanchez-Fernandez - 07/07/2023
"""
import pandas as pd
import numpy as np
import argparse
import sys
import os
def get_args(argv=None) -> argparse.Namespace:
    """
    Parse and retrieve command-line arguments.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. When None (the default), 'sys.argv[1:]'
            is parsed, which is what the script entry point relies on.

    Returns:
        An 'argparse.Namespace' object containing the parsed arguments
        ('input' and 'output', both required).
    """
    parser = argparse.ArgumentParser(
        description='Script that merges the .csv files.'
    )
    # Both options are directories: main() lists the content of 'input'
    # and writes the generated files inside 'output'.
    parser.add_argument(
        '--input', '-i', required=True,
        help='path to the input directory where the csv files are stored.'
    )
    parser.add_argument(
        '--output', '-o', required=True,
        help='path to the output directory where the generated csv files '
             'will be stored.'
    )
    return parser.parse_args(sys.argv[1:] if argv is None else argv)
def _group_files_by_prefix(files):
    """Group file names by the part of the name that precedes '_s='."""
    groups = {}
    # Keying on the prefix (instead of relying on adjacency after sorting)
    # guarantees that every file of a configuration ends up in one group.
    for file in sorted(files):
        groups.setdefault(file.split('_s=')[0], []).append(file)
    return list(groups.values())


def _per_class_metric_for(task):
    """Return the per-class metric column suffix for 'task', or None if unknown."""
    return {
        'multiclass': 'f1_per_class',
        'multilabel': 'rmse_per_class',
    }.get(task)


def _expand_per_class_columns(df, per_class_metric):
    """Replace the 'val_'/'test_' per-class list columns by one float column per value."""
    for split in ['val_', 'test_']:
        column = split + per_class_metric
        # Each cell is a stringified list such as '[0.1, 0.2]'; turn it into
        # a float array so it can be spread over numeric columns.
        values = df[column].apply(
            lambda x: np.array(
                [float(i) for i in x.strip('[]').split(',')],
                dtype=float
            )
        )
        expanded = pd.DataFrame(values.tolist(), index=df.index)
        expanded.columns = [
            f'{column}_{i}' for i in range(expanded.shape[1])
        ]
        # Swap the original list column for the expanded numeric columns.
        df = pd.concat([df.drop(columns=column), expanded], axis=1)
    return df


def main(args):
    """
    Compute per-row mean and std across the trial csv files of each group.

    Files in 'args.input' whose name contains '_s=' are grouped by the part
    of the name that precedes '_s='. For every group, the csv files are
    stacked and grouped by row index, and the mean and std of each column
    are written to 'pp_mean_<prefix>.csv' and 'pp_std_<prefix>.csv' inside
    'args.output'.

    The part of the file name that precedes '_tr=' selects the per-class
    metric ('multiclass' -> 'f1_per_class', 'multilabel' ->
    'rmse_per_class'). For those tasks the 'val_' and 'test_' per-class
    columns are expanded into one numeric column per value. For any other
    task no column is expanded.

    Args:
        args: Object exposing the 'input' and 'output' directory paths
            (e.g. the namespace returned by 'get_args').

    Returns:
        0, used as the process exit status.
    """
    # Print target folder.
    print(f'Target folder: {args.input}')

    # Keep only the files that follow the expected naming pattern and
    # arrange them in one group per prefix.
    filtered_files = [f for f in os.listdir(args.input) if '_s=' in f]
    arranged_files = _group_files_by_prefix(filtered_files)

    # Print the arranged files.
    for i, group in enumerate(arranged_files):
        print()
        for file in group:
            print(f'{i} --> {file}')

    # Make sure the destination exists before writing any result. An empty
    # path means the current directory, which needs no creation.
    if args.output:
        os.makedirs(args.output, exist_ok=True)

    # Iterate over the groups of files.
    for group in arranged_files:
        print()

        # Get task from first item.
        task = group[0].split('_tr=')[0]
        print(f"{'Task:'.ljust(8)}{task}")

        # Target metric (None when the task has no known per-class metric).
        per_class_metric = _per_class_metric_for(task)

        # Get prefix from first item.
        prefix = group[0].split('_s=')[0]
        print(f"{'Prefix:'.ljust(8)}{prefix}")

        # Load every csv file of the group.
        dataframes = []
        for file in group:
            print(f"{'File:'.ljust(8)}{file}")
            df = pd.read_csv(os.path.join(args.input, file))
            # Without a known per-class metric there is nothing to expand.
            if per_class_metric is not None:
                df = _expand_per_class_columns(df, per_class_metric)
            dataframes.append(df)
        print(f'\nDATAFRAMES:\n{dataframes}')

        # Stack the trials by rows and aggregate the rows sharing an index.
        df_concat = pd.concat(dataframes, axis=0)
        print(f'\nCONCAT:\n{df_concat}')
        by_row_index = df_concat.groupby(df_concat.index)
        df_means = by_row_index.mean()
        df_stds = by_row_index.std()

        # The row index is stored as the (integer) epoch column.
        df_means['epoch'] = df_means.index.astype('int')
        df_stds['epoch'] = df_stds.index.astype('int')
        print(f'\nMEANs:\n{df_means}')
        print(f'\nSTDs:\n{df_stds}')

        # Save the mean and standard deviation dataframes to CSV files.
        df_means.to_csv(os.path.join(args.output, f'pp_mean_{prefix}.csv'), index=False)
        df_stds.to_csv(os.path.join(args.output, f'pp_std_{prefix}.csv'), index=False)

    return 0
if __name__ == '__main__':
    # Script entry point: parse the command line, run main() and use its
    # return value as the process exit status.
    sys.exit(main(get_args()))