#!/usr/bin/python
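"""Converts raw .h5 image files into compressed .npz datasets.

For each input file, the inner-tracker (pile-up corrected), ECAL and HCAL images
are stacked into 3-channel (phi, eta) images, down-sampled by 5x, optionally
normalized and/or summed over channels, and saved together with class labels,
mass labels and track counts. Optionally performs a train/validation split.
"""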
import os
import gc
import argparse
import multiprocessing as mp

import h5py
import numpy as np
import tensorflow as tf

from sklearn.model_selection import train_test_split


def makedir(*args: str) -> str:
    """Creates a directory"""
    path = os.path.join(*args)
    os.makedirs(path, exist_ok=True)
    return path
def get_files(src: str, dst: str, should_split=False) -> list:
    """Lists the files in `src` and prepares the destination folder(s)"""
    files = os.listdir(src)

    if should_split:
        makedir(os.path.join(dst, 'train'))
        makedir(os.path.join(dst, 'valid'))
    else:
        makedir(dst)

    return files
def from_h5_to_npz(args: tuple):
    """Converts one .h5 file into one (or two, if split) compressed .npz file(s)"""
    id, file_name, src, dst, split, normalize, should_sum = args
    dtype = np.float32
    should_split = split > 0.0

    path = os.path.join(src, file_name)
    print(f'[{id}] processing "{file_name}"..')

    # determine the class label and the mass label from the file name
    if 'h125' in file_name:
        mass = 1
        label = 1
    elif 'h200' in file_name:
        mass = 2
        label = 1
    elif 'h300' in file_name:
        mass = 3
        label = 1
    elif 'h400' in file_name:
        mass = 4
        label = 1
    elif 'h700' in file_name:
        mass = 5
        label = 1
    elif 'h1000' in file_name:
        mass = 6
        label = 1
    elif 'svj' in file_name:
        label = 2
        # file_name has the format "svj_[mass]_[id]_[n].h5"
        part = file_name.split('_')[1]
        # TODO: should change mass label for SVJ; these are still unique if paired with class label
        mass = {2100: 5, 3100: 6, 4100: 7}[int(part)]
    else:
        mass = 0
        label = 0
    # each file contains N 286x360 images of the (eta, phi) plane
    with h5py.File(path, 'r') as file:
        # inner-tracker image (with pile-up correction)
        image_trk = np.array(file.get('ImageTrk_PUcorr'), dtype=dtype)
        # ECAL image
        image_ecal = np.array(file.get('ImageECAL'), dtype=dtype)
        # HCAL image
        image_hcal = np.array(file.get('ImageHCAL'), dtype=dtype)
        # number of tracks
        tracks = np.array(file.get('NTrk_PUcorr'), dtype=dtype)

    # stack the three images to form 3-channel images; shape: (N, 286, 360, 3)
    images = np.stack([image_trk, image_ecal, image_hcal], axis=-1).astype(dtype)

    # transpose to have (phi, eta) instead of (eta, phi); shape: (N, 360, 286, 3)
    images = np.transpose(images, axes=[0, 2, 1, 3])

    labels = np.ones(len(images), dtype=dtype) * float(label)
    masses = np.ones(len(images), dtype=dtype) * float(mass)
    # pre-processing: down-sample (by 5x) and normalize images (to sum 1).
    # The depthwise conv with a 5x5 filter of ones and stride 5 sums each
    # non-overlapping 5x5 patch, independently per channel.
    with tf.device('cpu'):
        x = tf.nn.depthwise_conv2d(images, filter=tf.ones((5, 5, 3, 1)),
                                   strides=[1, 5, 5, 1], padding='SAME')
        if should_sum:
            # collapse the three channels into one
            x = tf.reduce_sum(x, axis=-1, keepdims=True)

        if normalize:
            # make each image sum to one
            x /= tf.reduce_sum(x, axis=[1, 2, 3], keepdims=True)

        images = np.array(x, dtype=dtype)
    # training-validation split
    if should_split:
        x_train, x_valid, \
        y_train, y_valid, \
        t_train, t_valid, \
        m_train, m_valid = train_test_split(images, labels, tracks, masses,
                                            test_size=split, random_state=42)
        # save
        save_path_train = os.path.join(dst, 'train', file_name)
        save_path_valid = os.path.join(dst, 'valid', file_name)

        save_path_train, _ = os.path.splitext(save_path_train)  # remove .h5 extension
        save_path_valid, _ = os.path.splitext(save_path_valid)

        np.savez_compressed(save_path_train, images=x_train, labels=y_train,
                            masses=m_train, n_tracks=t_train)
        np.savez_compressed(save_path_valid, images=x_valid, labels=y_valid,
                            masses=m_valid, n_tracks=t_valid)

        print(f' [{id}] saved at "{save_path_train}.npz"')
        print(f' [{id}] saved at "{save_path_valid}.npz"')

        del x_train, x_valid, y_train, y_valid
        gc.collect()
    else:
        save_path = os.path.join(dst, file_name)
        save_path, _ = os.path.splitext(save_path)  # remove .h5 extension

        np.savez_compressed(save_path, images=images, labels=labels, masses=masses,
                            n_tracks=tracks)
        print(f' [{id}] saved at "{save_path}.npz"')

    # cleanup
    del file, image_trk, image_ecal, image_hcal, images, labels
    gc.collect()
if __name__ == '__main__':
    # NOTE: the QCD "valid" folder is manually copied into "test"
    # Call as follows:
    #   QCD:  src='../data/n_tracks/qcd',  dst='../data/n_tracks-3c',      split=0.4
    #   SUEP: src='../data/n_tracks/suep', dst='../data/n_tracks-3c/test'
    #   SVJ:  src='../data/n_tracks/svj',  dst='../data/n_tracks-3c/test'
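    # For example, the QCD case above would (illustratively) be invoked as:
    #   python build_dataset_script.py --src ../data/n_tracks/qcd \
    #       --dst ../data/n_tracks-3c --split 0.4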
    parser = argparse.ArgumentParser()
    # required arguments
    parser.add_argument('--src', required=True, type=str, help='source folder')
    parser.add_argument('--dst', required=True, type=str, help='destination folder')
    # optional arguments
    parser.add_argument('-s', '--split', default=0.0, type=float,
                        help='fraction of data reserved for the validation split')
    parser.add_argument('--normalize', action='store_true',
                        help='whether or not to normalize each image to sum 1')
    parser.add_argument('--sum', action='store_true',
                        help='whether or not to sum over the three channels')

    args = parser.parse_args()

    # processing
    files = get_files(src=args.src, dst=args.dst, should_split=args.split > 0.0)

    # prepare arguments for the worker processes
    arguments = [(f'{i + 1}/{len(files)}', file, args.src, args.dst, args.split,
                  args.normalize, args.sum)
                 for i, file in enumerate(files)]

    with mp.Pool(processes=mp.cpu_count()) as pool:
        pool.map(from_h5_to_npz, arguments)
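
# Example (illustrative) of loading one of the produced .npz files; the keys match
# the arrays saved by `from_h5_to_npz` above, and the path below is hypothetical:
#   data = np.load('../data/n_tracks-3c/train/some_file.npz')
#   images, labels = data['images'], data['labels']
#   masses, n_tracks = data['masses'], data['n_tracks']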