"""Different utilities to help with training and manage the experiments"""
import logging
import os
import stat
import subprocess
import sys
from io import StringIO
from pathlib import Path
from typing import Dict, List
import numpy as np
import pandas as pd
import SimpleITK as sitk
import tensorflow as tf
# configure logger
logger = logging.getLogger(__name__)
def get_gpu(memory_limit=4000, silent=False) -> str:
"""Get the name of the GPU with the most free memory as required by tensorflow
Parameters
----------
memory_limit : int, optional
The minimum free memory in MB, by default 4000
silent : bool, optional
If True, do not print the name of the selected card, by default False
Returns
-------
str
The GPU with the most free memory
Raises
------
SystemError
If no free GPU is available
"""
output = (
subprocess.check_output(
"nvidia-smi --query-gpu=name,memory.total,memory.free,memory.used --format=csv,nounits",
shell=True,
)
.decode(sys.stdout.encoding)
.strip()
)
gpus_tf = tf.config.list_physical_devices("GPU")
gpus_nvidia_smi = pd.read_csv(StringIO(output))
gpu_devices = {
tf.config.experimental.get_device_details(g)["device_name"]: g.name for g in gpus_tf
}
if not "CUDA_VISIBLE_DEVICES" in os.environ:
cuda_num = {
tf.config.experimental.get_device_details(g)["device_name"]: str(i)
for i, g in enumerate(gpus_tf)
}
gpus_nvidia_smi["cuda_num"] = gpus_nvidia_smi["name"].replace(cuda_num)
else:
to_drop = [n not in gpu_devices for n in gpus_nvidia_smi["name"]]
gpus_nvidia_smi.drop(index=gpus_nvidia_smi.index[to_drop], inplace=True)
gpus_nvidia_smi["tf_name"] = gpus_nvidia_smi["name"].replace(gpu_devices)
if "preferred_gpu" in os.environ:
preferred_gpu = gpus_nvidia_smi.loc[int(os.environ["preferred_gpu"])]
else:
# get the GPU with the most free memory
preferred_gpu = gpus_nvidia_smi.sort_values(" memory.free [MiB]").iloc[-1]
free = preferred_gpu[" memory.free [MiB]"]
if free > memory_limit:
if not silent:
print(f"Using {preferred_gpu['name']}")
logger.info("Using %s", preferred_gpu["name"])
selected_gpu = preferred_gpu.tf_name.partition("physical_device:")[-1]
if not "CUDA_VISIBLE_DEVICES" in os.environ:
os.environ["CUDA_VISIBLE_DEVICES"] = preferred_gpu.cuda_num
return selected_gpu
else:
raise SystemError("No free GPU available")
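# Illustrative usage sketch (not part of the original module): select the GPU with the
# most free memory before building a model. The 6000 MB threshold and the tiny Keras
# model are arbitrary placeholders.
#
#   gpu_name = get_gpu(memory_limit=6000)
#   with tf.device(gpu_name):
#       model = tf.keras.Sequential([tf.keras.layers.Dense(1)])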
def output_to_image(
output: np.ndarray,
task: str,
processed_image: sitk.Image,
original_image: sitk.Image,
) -> sitk.Image:
"""Convert the network output to an image. For classification and segmentation,
argmax is applied first to the last dimension. Then, the output is converted to
an image with the same physical dimensions as the processed image. For segmentation,
it is then also resampled to the original image.
Parameters
----------
output : np.ndarray
The output to process
task : str
The name of the task, it should be "segmentation", "classification", "regression" or "autoencoder".
processed_image : sitk.Image
The processed image used for the prediction
original_image : sitk.Image
The original image, only needed for segmentation
Returns
-------
sitk.Image
The resulting Image
"""
if output.ndim > 4:
raise ValueError("Result should have at most 4 dimensions")
# make sure that the output has the right number of dimensions
if task == "segmentation" and output.ndim != 4:
raise ValueError("For segmentation, a 4D Result is expected.")
# for classification, add dimensions until there are 4
if task == "classification":
if output.ndim < 4:
output = np.expand_dims(
output, axis=tuple(2 - i for i in range(4 - output.ndim))
)
# a regression task should have just 3 dimensions
elif task == "regression":
if output.ndim < 3:
output = np.expand_dims(
output, axis=tuple(i + 1 for i in range(3 - output.ndim))
)
elif output.ndim == 4:
raise ValueError("For regression, there should only be 3 dimensions.")
# do the prediction for classification tasks
if task in ("segmentation", "classification"):
output = np.argmax(output, axis=-1)
# remove unneeded dimensions for autoencoder
if task == "autoencoder" and output.ndim == 4:
if output.shape[3] == 1:
output = output[:, :, :, 0]
# turn the output into an image
pred_img = sitk.GetImageFromArray(output.astype(np.float32))
# cast to the right type
if task in ("regression", "autoencoder") and output.ndim < 4:
pred_img = sitk.Cast(pred_img, sitk.sitkFloat32)
elif task in ("regression", "autoencoder") and output.ndim == 4:
pred_img = sitk.Cast(pred_img, sitk.sitkVectorFloat32)
else:
pred_img = sitk.Cast(pred_img, sitk.sitkUInt8)
image_size = np.array(processed_image.GetSize()[:3]) # image could be 4D
zoom_factor = image_size / pred_img.GetSize()
# set the image information, the extent should be constant
pred_img.SetDirection(processed_image.GetDirection())
pred_img.SetSpacing(processed_image.GetSpacing() * zoom_factor)
# in each direction, the origin is shifted by half the zoom factor, but there
# is a shift by 1, because the origin is at the center of the first voxel
new_origin_idx = (zoom_factor - 1) / 2
pred_img.SetOrigin(
processed_image.TransformContinuousIndexToPhysicalPoint(new_origin_idx)
)
if task == "segmentation":
pred_img = sitk.Resample(
image1=pred_img,
referenceImage=original_image,
interpolator=sitk.sitkNearestNeighbor,
outputPixelType=sitk.sitkUInt8,
)
return pred_img
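# Illustrative usage sketch (not part of the original module): convert a softmax output
# into a label image aligned with the original image. The file names and the model call
# are hypothetical placeholders.
#
#   processed = sitk.ReadImage("processed_image.nii.gz")
#   original = sitk.ReadImage("original_image.nii.gz")
#   probabilities = model.predict(batch)  # 4D array (z, y, x, n_classes)
#   label_img = output_to_image(probabilities, "segmentation", processed, original)
#   sitk.WriteImage(label_img, "prediction.nii.gz")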
def export_npz(
output: List[np.ndarray],
tasks: List[str],
task_names: List[str],
file_path: Path,
write_class_probabilities=True,
):
"""Export the output of the network as npz file
Parameters
----------
output : List[np.ndarray]
The output of the network
tasks : List[str]
The list of tasks that were performed
task_names : List[str]
The task names to be used as keys in the file
file_path : Path
The path where the file should be saved
write_class_probabilities : bool, optional
If the class probabilities should be written for classification tasks, by default True
"""
assert len(task_names) == len(output)
output_dict: Dict[str, np.ndarray] = {}
for out, tsk, name in zip(output, tasks, task_names):
# for regression, just save the whole thing, it is not that big
if tsk == "regression":
output_dict[name] = out.astype(np.float16)
output_dict[name + "_std"] = out.std(axis=tuple(range(out.ndim - 1)))
output_dict[name + "_median"] = np.median(out, axis=tuple(range(out.ndim - 1)))
# average over the output
elif tsk == "classification":
output_dict[name + "_mean"] = out.mean(axis=tuple(range(out.ndim - 1)))
output_dict[name + "_std"] = out.std(axis=tuple(range(out.ndim - 1)))
output_dict[name + "_median"] = np.median(out, axis=tuple(range(out.ndim - 1)))
if write_class_probabilities:
output_dict[name] = out.astype(np.float16)
# for now, just don't export data for segmentation
elif tsk == "segmentation":
output_dict[name] = np.array([])
np.savez_compressed(file_path, **output_dict)
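# Illustrative usage sketch (not part of the original module): export the raw outputs of
# a model with a segmentation and a classification head. Array names, task names and the
# output path are hypothetical placeholders.
#
#   export_npz(
#       output=[seg_output, class_output],
#       tasks=["segmentation", "classification"],
#       task_names=["seg", "grade"],
#       file_path=Path("predictions/sample_001.npz"),
#   )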
### experiment running utils
def configure_logging(tf_logger: logging.Logger) -> logging.Logger:
"""Configure the logger, the handlers of the tf_logger are removed and both
loggers are set to
Parameters
----------
tf_logger : logging.Logger
The tensorflow logger, must be assigned before importing tensorflow
Returns
-------
logging.Logger
The base logger
"""
# configure loggers
logger_config = logging.getLogger()
logger_config.setLevel(logging.DEBUG)
tf_logger.setLevel(logging.DEBUG)
# there is too much output otherwise
for handler in tf_logger.handlers:
tf_logger.removeHandler(handler)
return logger_config
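# Illustrative usage sketch (not part of the original module): following the docstring
# above, the tensorflow logger is assigned before tensorflow is imported in the calling
# script.
#
#   tf_logger = logging.getLogger("tensorflow")
#   base_logger = configure_logging(tf_logger)
#   import tensorflow as tf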
def generate_res_path(version: str, external: bool, postprocessed: bool, task: str):
"""For a given path, generate the relative path to the result file"""
if postprocessed:
version += "-postprocessed"
if external:
folder_name = f"results_external_testset_{version}_{task}"
else:
folder_name = f"results_test_{version}_{task}"
res_path = Path(folder_name) / "evaluation-all-files.h5"
return res_path
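# Example derived from the logic above: for version "best" with postprocessing on the
# internal test set of a segmentation task, the generated relative path is
# results_test_best-postprocessed_segmentation/evaluation-all-files.h5
#
#   res_path = generate_res_path(
#       version="best", external=False, postprocessed=True, task="segmentation"
#   )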
def export_hyperparameters(
experiments, target_dir, additional_info=None, keep_existing=True
):
"""
Export a summary of the experiments, compare the hyperparameters of all experiments
and collect the ones that were changed.
"""
if additional_info is None:
additional_info = [{}] * len(experiments)
if len(experiments) != len(additional_info):
raise ValueError("Experiments and additional_info should have the same length.")
# export the hyperparameters
experiment_dir = Path(os.environ["experiment_dir"])
experiments_file = target_dir / "experiments.json"
hyperparameter_changed_file = target_dir / "hyperparameters_changed.json"
# collect all results
hparams = []
for exp, add_inf in zip(experiments, additional_info):
# and parameters
hparams.append(
{
**exp.hyper_parameters["network_parameters"],
**exp.hyper_parameters["train_parameters"],
"normalizing_method": exp.hyper_parameters["preprocessing_parameters"][
"normalizing_method"
],
"loss": exp.hyper_parameters["loss"],
"architecture": exp.hyper_parameters["architecture"].__name__,
"dimensions": exp.hyper_parameters["dimensions"],
"path": str(exp.output_path_rel),
"exp_group_name": str(exp.output_path_rel.parent.name),
"versions": exp.versions,
"external": exp.external_test_set is not None,
"priority": exp.priority,
}
| add_inf
)
# convert to data frames
hparams = pd.DataFrame(hparams)
if keep_existing and experiments_file.exists():
hparams_old = pd.read_json(experiments_file)
params_present = hparams_old.path.apply(
lambda x: (experiment_dir / x / "parameters.yaml").exists()
)
hparams_old = hparams_old[params_present]
missing = hparams_old.path.apply(
lambda x: np.all(str(x) != hparams.path.astype(str))
)
hparams_old = hparams_old[missing]
hparams = pd.concat([hparams_old, hparams])
hparams = hparams.reset_index().drop(columns="index")
hparams.sort_values("priority", ascending=False, na_position="last", inplace=True)
# find changed parameters
changed_params = []
# drop the results file when analyzing the changed hyperparameters
for col in hparams:
if hparams[col].astype(str).unique().size > 1:
changed_params.append(col)
# have at least one changed parameter (for the plots)
if len(changed_params) == 0:
changed_params = ["architecture"]
hparams_changed = hparams[changed_params].copy()
# if n_filters was changed, use only the first entry
if "n_filters" in hparams_changed:
hparams_changed.loc[:, "n_filters"] = (
hparams_changed["n_filters"].dropna().apply(lambda x: x[0])
)
if "normalizing_method" in hparams_changed:
def get_norm_name(obj):
if isinstance(obj, dict):
return obj["name"]
else:
return obj.name
n_name = hparams_changed["normalizing_method"].apply(get_norm_name)
hparams_changed.loc[:, "normalizing_method"] = n_name
# ignore the batch size (it correlates with the dimension)
if "batch_size" in hparams_changed:
hparams_changed.drop(columns="batch_size", inplace=True)
# ignore do_bias (it is set to the opposite of do_batch_normalization)
if "do_bias" in hparams_changed and "do_batch_normalization" in hparams_changed:
hparams_changed.drop(columns="do_bias", inplace=True)
# drop column specifying the files
if "path" in hparams_changed:
hparams_changed.drop(columns="path", inplace=True)
# drop columns only related to architecture
if "architecture" in hparams_changed:
arch_groups = hparams_changed.astype(str).groupby("architecture")
# there should be at least one other column
if arch_groups.ngroups > 1 and hparams_changed.shape[1] > 1:
arch_params = arch_groups.nunique(dropna=False)
for col in arch_params:
if np.all(arch_params[col] == 1):
hparams_changed.drop(columns=col, inplace=True)
# drop the priority
if "priority" in hparams_changed:
hparams_changed.drop(columns="priority", inplace=True)
hparams.to_csv(experiments_file.with_suffix(".csv"), sep=";")
hparams.to_json(experiments_file, indent=4)
hparams_changed.to_csv(hyperparameter_changed_file.with_suffix(".csv"), sep=";")
hparams_changed.to_json(hyperparameter_changed_file, indent=4)
def gather_results(
experiment_dir: Path,
task: str,
external=False,
postprocessed=False,
combined=False,
version="best",
) -> pd.DataFrame:
"""Collect all result files from all experiments. Only experiments that are
already finished will be included in the analysis.
Parameters
----------
experiment_dir : Pathlike
The path where the experiments are located
task : str
The task to analyze, choices are segmentation, classification and regression
external : bool, optional
If the external testset should be evaluated, by default False
postprocessed : bool, optional
If the postprocessed data should be evaluated, by default False
combined : bool, optional
If there is a combined model that should be analyzed, by default False
version : str, optional
Which version of the model should be used, by default best
Returns
-------
pd.DataFrame
The results with all metrics for all files
"""
experiments_file = experiment_dir / "experiments.json"
if external:
file_field = "results_file_external_testset"
else:
file_field = "results_file"
if postprocessed:
file_field += "_postprocessed"
res_path = generate_res_path(version, external, postprocessed, task)
hparams = pd.read_json(experiments_file)
# type is incorrectly detected
# pylint: disable=no-member
# add combined model if present
if combined:
c_path = Path(hparams.iloc[0]["path"]).parent / "combined_models"
loc = hparams.shape[0]
hparams.loc[loc] = "Combined"
hparams.loc[loc, "path"] = c_path
# ignore some fields
ignore = ["tasks", "label_shapes", "path"]
total_experiments = hparams.shape[0]
results_all_list = []
for _, row in hparams.iterrows():
exp_tasks = list(set(row.tasks.values()))
if task not in exp_tasks:
total_experiments -= 1
continue
results_file = experiment_dir.parent / row["path"] / res_path
if results_file.exists():
results = pd.read_hdf(results_file).reset_index()
# set the model
results["name"] = Path(row["path"]).name
results["task"] = task
# set the other parameters
for name, val in row.items():
if name in ignore:
continue
results[name] = [val] * results.shape[0]
# save results
results_all_list.append(results)
else:
name = Path(results_file).parent.parent.name
path_name = Path(results_file).parent.parent.parent.name
print(f"Could not find the evaluation file for {path_name} {name}")
if len(results_all_list) == 0:
print("No files found")
return None
else:
results_all = pd.concat(results_all_list)
dtypes_list = pd.DataFrame([r.dtypes for r in results_all_list]).apply(
lambda x: x.dropna().unique()
)
if isinstance(dtypes_list, pd.DataFrame):
assert dtypes_list.shape[0] == 1
dtypes_list = dtypes_list.iloc[0]
for col, dtypes in zip(results_all, dtypes_list):
if not isinstance(dtypes, List):
dtype = dtypes
elif len(dtypes) == 1:
dtype = dtypes[0]
elif np.all([pd.api.types.is_numeric_dtype(d) for d in dtypes]):
dtype = float
else:
raise TypeError(f"Multiple dtypes found for {col}")
is_int = pd.api.types.is_integer_dtype(dtype)
if ("prediction" in col or "ground_truth" in col) and is_int:
results_all.loc[results_all[col].isna(), col] = -1
results_all[col] = results_all[col].astype(dtype)
complete_percent = int(np.round(len(results_all_list) / total_experiments * 100))
print(f"{complete_percent:3d} % of experiments completed.")
results_all = results_all.copy()
results_all["fold"] = pd.Categorical(results_all["fold"])
results_all["name"] = pd.Categorical(results_all["name"])
results_all["version"] = version
results_all.index = pd.RangeIndex(results_all.shape[0])
results_all.sort_values("File Number", inplace=True)
return results_all
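# Illustrative usage sketch (not part of the original module): collect the postprocessed
# segmentation results of the internal test set. The experiment directory and the "Dice"
# metric column are assumptions, not guaranteed by this module.
#
#   results = gather_results(
#       experiment_dir=Path("Experiments"),
#       task="segmentation",
#       postprocessed=True,
#       version="best",
#   )
#   if results is not None:
#       print(results.groupby("name")["Dice"].mean())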
def gather_training_data(
experiment_dir: Path,
) -> pd.DataFrame:
"""Collect all training data from all experiments.
Parameters
----------
experiment_dir : Pathlike
The path where the experiments are located
Returns
-------
pd.DataFrame
The results with all metrics for all files
"""
experiments_file = experiment_dir / "experiments.json"
hparams = pd.read_json(experiments_file)
# type is incorrectly detected
# pylint: disable=no-member
# ignore some fields
ignore = ["tasks", "label_shapes", "path"]
training_data_all_list = []
for _, row in hparams.iterrows():
model_dir = experiment_dir.parent / row["path"]
for fold_dir in model_dir.glob("fold-*"):
training_data_file = fold_dir / "training.csv"
if training_data_file.exists():
training_data = pd.read_csv(training_data_file, sep=";")
# set the model
training_data["name"] = Path(row["path"]).name
training_data["fold"] = fold_dir.name
# set the other parameters
for name, val in row.items():
if name in ignore:
continue
training_data[name] = [val] * training_data.shape[0]
# save training_data
training_data_all_list.append(training_data)
if len(training_data_all_list) == 0:
print("No files found")
return None
else:
training_data_all = pd.concat(training_data_all_list)
# drop first column (which is just the old index)
training_data_all["fold"] = pd.Categorical(training_data_all["fold"])
training_data_all.index = pd.RangeIndex(training_data_all.shape[0])
return training_data_all
def export_slurm_job(
filename,
command,
job_name=None,
workingdir=None,
venv_dir="venv",
job_type="CPU",
cpus=1,
hours=0,
minutes=30,
log_dir=None,
log_file=None,
error_file=None,
array_job=False,
array_range="0-4",
singleton=False,
variables=None,
):
"""Generates a slurm file to run jobs on the cluster
Parameters
----------
filename : Path or str
Where the slurm file should be saved
command : str
The command to run (can also be multiple commands separated by line breaks)
job_name : str, optional
The name displayed in squeue and used for log_name, by default None
workingdir : str, optional
The directory in Segmentation_Experiment, if None, basedir is used, by default None
venv_dir : str, optional
The directory of the virtual environment, by default venv
job_type : str, optional
type of job, CPU, GPU or GPU_no_K80, by default 'CPU'
cpus : int, optional
number of CPUs, by default 1
hours : int, optional
Time the job should run in hours, by default 0
minutes : int, optional
Time the job should run in minutes, by default 30
log_dir : str, optional
dir where the logs should be saved, if None, logs/{job_name}/ is used, by default None
log_file : str, optional
name of the log file, if None job_name_job_id_log.txt, by default None
error_file : str, optional
name of the errors file, if None job_name_job_id_log_errors.txt, by default None
array_job : bool, optional
If set to true, array_range should be set, by default False
array_range : str, optional
array_range as str (comma separated or start-stop (ends included)), by default '0-4'
singleton : bool, optional
if only one job with that name and user should be running, by default False
variables : dict, optional
environment variables to write as {name : value}, $EXPDIR can be used, by default {}
"""
if variables is None:
variables = {}
# these new nodes do not work
exclude_nodes = ["h08c0301", "h08c0401", "h08c0501"]
if job_type == "GPU_no_K80":
exclude_nodes += [
"h05c0101",
"h05c0201",
"h05c0301",
"h05c0401",
"h05c0501",
"h06c0301",
"h05c0601",
"h05c0701",
"h05c0801",
"h05c0901",
"h06c0101",
"h06c0201",
"h06c0401",
"h06c0501",
"h06c0601",
"h06c0701",
"h06c0801",
"h06c0901",
]
if job_type == "CPU":
assert hours == 0
assert minutes <= 30
else:
assert minutes < 60
assert hours <= 48
if log_dir is None:
log_dir = Path("logs/{job_name}/")
else:
log_dir = Path(log_dir)
if log_file is None:
if array_job:
log_file = log_dir / f"{job_name}_%a_%A_log.txt"
else:
log_file = log_dir / f"{job_name}_%j_log.txt"
else:
log_file = log_dir / log_file
if error_file is None:
if array_job:
error_file = log_dir / f"{job_name}_%a_%A_errors.txt"
else:
error_file = log_dir / f"{job_name}_%j_errors.txt"
else:
error_file = log_dir / error_file
filename = Path(filename)
slurm_file = "#!/bin/bash\n\n"
if job_name is not None:
slurm_file += f"#SBATCH --job-name={job_name}\n"
slurm_file += f"#SBATCH --cpus-per-task={cpus}\n"
slurm_file += "#SBATCH --ntasks-per-node=1\n"
slurm_file += f"#SBATCH --time={hours:02d}:{minutes:02d}:00\n"
slurm_file += "#SBATCH --mem=32gb\n"
if job_type in ("GPU", "GPU_no_K80"):
slurm_file += "\n#SBATCH --partition=gpu-single\n"
slurm_file += "#SBATCH --gres=gpu:1\n"
if len(exclude_nodes) > 0:
slurm_file += "#SBATCH --exclude=" + ",".join(exclude_nodes) + "\n"
if array_job:
slurm_file += f"\n#SBATCH --array={array_range}\n"
# add logging
slurm_file += f"\n#SBATCH --output={str(log_file)}\n"
slurm_file += f"#SBATCH --error={str(error_file)}\n"
if singleton:
slurm_file += "\n#SBATCH --dependency=singleton\n"
# define workdir, add diagnostic info
slurm_file += """
echo "Set Workdir"
WSDIR=/gpfs/bwfor/work/ws/hd_mo173-myws
echo $WSDIR
EXPDIR=$WSDIR\n"""
# print task ID depending on type
if array_job:
slurm_file += '\necho "My SLURM_ARRAY_TASK_ID: " $SLURM_ARRAY_TASK_ID\n'
else:
slurm_file += '\necho "My SLURM_JOB_ID: " $SLURM_JOB_ID\n'
slurm_file += """\necho "job started on Node: $HOSTNAME"
echo "Load modules"
module load devel/python_intel/3.7
"""
# add environmental variables
if len(variables) > 0:
slurm_file += "\n"
for key, val in variables.items():
slurm_file += f'export {key}="{val}"\n'
if "GPU" in job_type:
slurm_file += """module load devel/cuda/10.1
module load lib/cudnn/7.6.5-cuda-10.1
echo "Get GPU info"
nvidia-smi
"""
slurm_file += '\necho "Go to workingdir"\n'
if workingdir is None:
slurm_file += "cd $EXPDIR/nnUNet\n"
else:
slurm_file += f"cd {Path(workingdir).resolve()}\n"
# activate virtual environment
slurm_file += '\necho "Activate virtual environment"\n'
slurm_file += f"source {Path(venv_dir).resolve()}/bin/activate\n"
# run the real command
slurm_file += '\necho "Start calculation"\n\n'
slurm_file += command
slurm_file += '\n\necho "Finished"'
if not filename.parent.exists():
filename.parent.mkdir(parents=True)
# write to file
with open(filename, "w+", encoding="utf8") as f:
f.write(slurm_file)
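# Illustrative usage sketch (not part of the original module): write a GPU array job that
# trains all five folds of one experiment. The file names, the command and the variables
# are hypothetical placeholders.
#
#   export_slurm_job(
#       filename="jobs/train_unet.sh",
#       command="python run_single_experiment.py -f $SLURM_ARRAY_TASK_ID -e $EXPDIR/unet",
#       job_name="train_unet",
#       job_type="GPU",
#       hours=24,
#       minutes=0,
#       array_job=True,
#       array_range="0-4",
#       variables={"experiment_dir": "$EXPDIR"},
#   )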
def export_batch_file(filename, commands, file_start=""):
"""Exports a list of commands (one per line) as batch script
Parameters
----------
filename : str or Path
The new file
commands : [str]
List of commands (as strings)
file_start : str, optional
Is added to the start of each file, by default empty
"""
filename = Path(filename)
batch_file = file_start + "#!/bin/bash"
for com in commands:
batch_file += f"\n\n{com}"
if not filename.parent.exists():
filename.parent.mkdir(parents=True)
# write to file
with open(filename, "w+", encoding="utf8") as f:
f.write(batch_file)
# set permission
os.chmod(filename, stat.S_IRWXU)
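# Illustrative usage sketch (not part of the original module): submit the generated slurm
# files with a single batch script. The file names are hypothetical placeholders.
#
#   export_batch_file(
#       filename="jobs/start_all_jobs.sh",
#       commands=["sbatch jobs/train_unet.sh", "sbatch jobs/train_densenet.sh"],
#   )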
def export_powershell_scripts(script_dir: Path, experiments: list, file_start=""):
"""Export power shell script to start the different folds and to start tensorboard.
Parameters
----------
script_dir : Path
The directory where the scripts should be placed
experiments : List[Experiment]
The experiments to export
file_start : str, optional
Is added to the start of each file, by default empty
"""
# set the environment (might be changed for each machine)
first_exp = experiments[0]
experiment_dir = first_exp.experiment_dir
assert isinstance(experiment_dir, Path)
data_dir = Path(os.environ["data_dir"])
ps_script_set_env = experiment_dir / "set_env.ps1"
python_script_dir = Path(sys.argv[0]).resolve().parent
command = f'$env:script_dir="{python_script_dir}"\n'
command += "$env:script_dir=$env:script_dir -replace ' ', '` '\n"
command += f'$env:data_dir="{data_dir}"\n'
command += f'$env:experiment_dir="{experiment_dir}"\n'
if "preferred_gpu" in os.environ:
pref_gpu = os.environ["preferred_gpu"]
command += f"$env:preferred_gpu={pref_gpu}\n"
# create env file
if not ps_script_set_env.exists():
with open(ps_script_set_env, "w+", encoding="utf8") as powershell_file_tb:
powershell_file_tb.write(command)
ps_script = script_dir / "start.ps1"
ps_script_single = script_dir / "start_individual_jobs.ps1"
ps_script_tb = script_dir / "start_tensorboard.ps1"
# make a powershell command, add env
command = file_start
if script_dir.resolve() == experiment_dir.resolve():
command += '$set_env=".\\set_env.ps1"\n'
else:
command += "$script_parent = (get-item $PSScriptRoot ).parent.FullName\n"
command += '$set_env="${script_parent}\\set_env.ps1"\n'
command += "$set_env=$set_env -replace ' ', '` '\n"
command += "Invoke-Expression ${set_env}\n"
command += 'Write-Output "Data dir: $env:data_dir"\n'
command += 'Write-Output "Experiment dir: $env:experiment_dir"\n'
command += 'Write-Output "Script dir: $env:script_dir"\n'
# activate
command += 'Write-Output "Activate Virtual Environment"\n'
command += '$activate=${env:script_dir} + "\\venv\\Scripts\\activate.ps1"\n'
command += "Invoke-Expression ${activate}\n"
# tensorboard command (up to here, it is the same)
command_tb = command
command_tb += "$start='tensorboard --logdir=\"' + "
if script_dir.resolve() == experiment_dir.resolve():
rel_dir = ""
else:
rel_dir = str(script_dir.relative_to(experiment_dir))
command_tb += f"${{env:experiment_dir}} + '\\{rel_dir}\"'\n"
command_tb += "Write-Output $start\n"
command_tb += "Invoke-Expression ${start}\n"
# add GPU selection
command += '\n$print_gpus="python " + ${env:script_dir} +'
command += ' "\\SegClassRegBasis\\print_tf_gpus.py"\n'
command += "if ($null -eq $env:CUDA_VISIBLE_DEVICES) {\n"
command += " Invoke-Expression ${print_gpus}\n"
command += ' $env:CUDA_VISIBLE_DEVICES=Read-Host -Prompt "Use GPU number"\n'
command += "} else {\n"
command += ' Write-Output "Using the following GPU:"\n'
command += " Invoke-Expression ${print_gpus}\n"
command += "}\n\n"
command_all = command
command_all += '$script=${env:script_dir} + "\\run_all_experiments.py"\n'
command_all += '$command="python " + "${script}"\n'
command_all += "Write-Output $command\n"
command_all += "Invoke-Expression ${command}\n"
# add the experiments
command += '$script_run=${env:script_dir} + "\\run_single_experiment.py"\n'
for exp in experiments:
command_path = (
f'\n\n$output_path=${{env:experiment_dir}} + "\\{exp.output_path_rel}"\n'
)
command += command_path
for fold_num in range(exp.folds):
fold_task_name = f"{exp.output_path_rel.parent.name}-{exp.name} Fold {fold_num}"
command += f'Write-Output "starting with {fold_task_name}"\n'
command += f'$command="python " + ${{script_run}} + " -f {fold_num} -e " + \'${{output_path}}\'\n'
command += "Invoke-Expression ${command}\n\n"
with open(ps_script, "w+", encoding="utf8") as powershell_file:
powershell_file.write(command_all)
with open(ps_script_single, "w+", encoding="utf8") as powershell_file:
powershell_file.write(command)
# create tensorboard file
with open(ps_script_tb, "w+", encoding="utf8") as powershell_file_tb:
powershell_file_tb.write(command_tb)
def export_experiments_run_files(
script_dir: Path, experiments: list, file_start="", additional_info: List[dict] = None
):
"""Export the files to run the experiments. These are first the hyperparameter
comparison files and then depending on the environment (Windows or Linux cluster),
either bash script to submit slurm jobs or powershell scripts to start the
experiments are written.
Parameters
----------
script_dir : Path
The directory where the scripts should be placed
experiments : List[Experiment]
The experiments to export
file_start : str, optional
Is added to the start of each file, by default empty
additional_info : List[dict], optional
Additional info to add to the experiments file for each experiment, the keys
will become columns and the values the entries.
"""
# export all hyperparameters
export_hyperparameters(experiments, script_dir, additional_info)
# if on cluster, export slurm files
if "CLUSTER" in os.environ:
slurm_files = []
working_dir = Path("").resolve()
if not working_dir.exists():
working_dir.mkdir()
for exp in experiments:
slurm_files.append(exp.export_slurm_file(working_dir))
start_all_batch = script_dir / "start_all_jobs.sh"
export_batch_file(
filename=start_all_batch,
commands=[f"sbatch {f}" for f in slurm_files],
file_start=file_start,
)
# and create some needed directories (without their log dirs, jobs don't start)
plot_dir_slurm = working_dir / "plots" / "slurm"
if not plot_dir_slurm.exists():
plot_dir_slurm.mkdir(parents=True)
combined_dir_slurm = working_dir / "combined_models" / "slurm"
if not combined_dir_slurm.exists():
combined_dir_slurm.mkdir(parents=True)
print(f"To start the training, execute {start_all_batch}")
# if on local computer, export powershell start file
else:
export_powershell_scripts(script_dir, experiments, file_start)