Skip to content

Commit

Permalink
safer walltime limits in trainings
Browse files Browse the repository at this point in the history
  • Loading branch information
svandenhaute committed Dec 12, 2023
1 parent e0f62a6 commit 6c0a996
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 2 deletions.
3 changes: 2 additions & 1 deletion psiflow/models/_mace.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ def train(
) -> str:
import yaml

actual_walltime = int(0.9 * walltime) # reserve 10 % for safe shutdown
mace_config["train_file"] = inputs[1].filepath
mace_config["valid_file"] = inputs[2].filepath
config_str = yaml.dump(dict(mace_config))
Expand All @@ -172,7 +173,7 @@ def train(
command_tmp,
command_cd,
command_write,
"timeout -s 15 {}s psiflow-train-mace".format(max(walltime - 15, 0)),
"timeout -s 15 {}s psiflow-train-mace".format(actual_walltime),
"--config config.yaml",
"--model {} || true;".format(inputs[0].filepath),
"ls *;",
Expand Down
3 changes: 2 additions & 1 deletion psiflow/models/_nequip.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,7 @@ def train(
) -> str:
import yaml

actual_walltime = int(0.9 * walltime) # reserve 10 % for safe shutdown
nequip_config["dataset_file_name"] = inputs[1].filepath
nequip_config["validation_dataset"] = "ase"
nequip_config["validation_dataset_file_name"] = inputs[2].filepath
Expand All @@ -262,7 +263,7 @@ def train(
command_cd,
command_env,
command_write,
"timeout -s 15 {}s".format(max(walltime - 15, 0)), # 15 s slack
"timeout -s 15 {}s".format(actual_walltime), # 90 % of walltime; remaining 10 % reserved for safe shutdown
"psiflow-train-nequip",
"--config config.yaml",
"--model {} || true;".format(inputs[0].filepath),
Expand Down

0 comments on commit 6c0a996

Please sign in to comment.