From ac7d574efd60366bd1b9e7815482fe66ad2882ae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michael=20J=C3=A4hn?=
Date: Thu, 1 Feb 2024 17:55:10 +0100
Subject: [PATCH] Update 'how to run'

---
 docs/howtorun.rst | 54 ++++++++++++++++++++++++++--------------------
 run_chain.py      | 55 +++++++++++++++++++++--------------------------
 2 files changed, 55 insertions(+), 54 deletions(-)

diff --git a/docs/howtorun.rst b/docs/howtorun.rst
index d9f83220..ea58e0a7 100644
--- a/docs/howtorun.rst
+++ b/docs/howtorun.rst
@@ -23,51 +23,50 @@ contain additional runscripts to be submitted via ``sbatch``.
 
 .. hint::
     Technically, you can run several cases (instead of a single case) in one command,
-    which is useful for nested run, for example. This can be achieved by running
+    which is useful for nested runs, for example. This can be achieved by running
     ``./run_chain.py <case1> <case2>``. With that, the full chain is executed for
     ``case1`` first, and afterwards for ``case2``.
 
-Without specifiying a job list, the default joblist defined in
-``config/models.yaml`` will be executed.
-
 There are several optional arguments available to change the behavior of the chain:
 
+    $ ./run_chain.py -h
+
 * ``-h``, ``--help``
-    Show a help message and exit.
+    Show this help message and exit.
 * ``-j [JOB_LIST ...]``, ``--jobs [JOB_LIST ...]``
     List of job names to be executed.
-    A job is a .py-file in jobs/ with a ``main()`` function, which
+    A job is a ``.py`` file in ``jobs/`` with a ``main()`` function, which
     handles one aspect of the Processing Chain, for example copying
    ``meteo`` input data or launching a job for ``int2lm``.
     Jobs are executed in the order in which they are given here. If no jobs are
-    given, default jobs will be executedas defined
-    in config/models.yaml.
+    given, default jobs will be executed as defined
+    in ``config/models.yaml``.
 * ``-f``, ``--force``
-    Force the processing chain to redo all specified
+    Force the Processing Chain to redo all specified
     jobs, even if they have been started already or were finished
     previously. WARNING: Only logfiles get deleted,
     other effects of a given job (copied files etc.) are simply overwritten. This
-    may cause errors.
-* ``-t NTRY``, ``--try NTRY``
-    Amount of time the cosmo job is re-tried before crashing. Default is 1.
+    may cause errors or unexpected behavior.
 * ``-r``, ``--resume``
-    Resume the processing chain by restarting the
+    Resume the Processing Chain by restarting the
     last unfinished job. WARNING: Only the logfile gets deleted,
     other effects of a given job (copied files etc.) are simply overwritten. This
-    may cause errors.
+    may cause errors or unexpected behavior.
 
 What it Does
 ------------
-The script ``run_chain.py`` reads the command line arguments and the config file.
+The script ``run_chain.py`` reads the command line arguments and the config file
+from the specified case.
 It then calls the function :func:`run_chain.restart_runs`, which divides the
 simulation time according to the specified restart steps. Then it calls
-:func:`run_chain.run_chain` for each sub-run. This function sets up the directory
-structure of the chain and then starts the specified :ref:`jobs`
-sequentially.
+:func:`run_chain.run_chunk` for each part (chunk) of the simulation workflow.
+This function sets up the directory structure of the chain and then submits the
+specified :ref:`jobs` via ``sbatch`` to the Slurm workload manager,
+taking job dependencies into account.
 Test Cases
 ----------
 
@@ -89,6 +88,10 @@ the script::
 This will run all the individual scripts in ``jenkins/scripts/``, which
 can also be launched separately if desired.
 
+These cases undergo regular testing to ensure that the Processing Chain runs
+correctly. A corresponding Jenkins plan is launched on a weekly basis and
+when triggered within a GitHub pull request.
+
 Directory Structure
 -------------------
 
@@ -108,6 +111,11 @@ run looks like this::
     ├── cfg.int2lm_input/
     ├── cfg.int2lm_work/
     └── cfg.int2lm_output/
+
+As one can see, it creates working directories for both the ``int2lm`` preprocessor
+and ``cosmo``. Additionally (and this is always the case), the ``checkpoints``
+directory holds all the job logfiles. Whenever a job has successfully finished,
+the logfile is copied from the ``working`` to the ``finished`` sub-directory.
 
 Running the ``cosmo-ghg-test`` case therefore produces the following
 directories and files (showing four levels of directories deep)::
@@ -124,7 +132,7 @@ directories and files (showing four levels of directories deep)::
     │   │   │   ├── online_vprm
     │   │   │   ├── post_cosmo
     │   │   │   ├── post_int2lm
-    │   │   │   └── prepare_data
+    │   │   │   └── prepare_cosmo
     │   │   └── working/
     │   │       ├── biofluxes
     │   │       ├── cosmo
@@ -134,7 +142,7 @@ directories and files (showing four levels of directories deep)::
     │   │       ├── online_vprm
     │   │       ├── post_cosmo
     │   │       ├── post_int2lm
-    │   │       └── prepare_data
+    │   │       └── prepare_cosmo
     │   ├── cosmo/
     │   │   ├── input/
     │   │   │   ├── oem/
@@ -177,7 +185,7 @@ directories and files (showing four levels of directories deep)::
     │   │   ├── online_vprm
     │   │   ├── post_cosmo
     │   │   ├── post_int2lm
-    │   │   └── prepare_data
+    │   │   └── prepare_cosmo
     │   └── working/
     │       ├── biofluxes
     │       ├── cosmo
@@ -187,7 +195,7 @@ directories and files (showing four levels of directories deep)::
     │       ├── online_vprm
     │       ├── post_cosmo
     │       ├── post_int2lm
-    │       └── prepare_data
+    │       └── prepare_cosmo
     ├── cosmo/
     │   ├── input/
     │   │   ├── oem
@@ -222,7 +230,7 @@ directories and files (showing four levels of directories deep)::
 
 -------------------------------------------
 
-.. autofunction:: run_chain.run_chain
+.. autofunction:: run_chain.run_chunk
 
 -------------------------------------------
 
diff --git a/run_chain.py b/run_chain.py
index 743b54c2..7e9e5b42 100755
--- a/run_chain.py
+++ b/run_chain.py
@@ -16,18 +16,18 @@
 
 
 def parse_arguments():
-    """Parse command line arguments for the processing chain script.
+    """Parse command line arguments for the Processing Chain script.
 
     Parses and retrieves command line arguments, allowing users to specify
     run identifiers, jobs to execute, and various options to control the
-    execution of the processing chain.
+    execution of the Processing Chain.
 
     Returns
     -------
     argparse.Namespace
         A namespace object containing parsed command line arguments.
     """
-    parser = argparse.ArgumentParser(description="Run the processing chain.")
+    parser = argparse.ArgumentParser(description="Run the Processing Chain.")
 
     parser.add_argument("casenames",
                         nargs='+',
@@ -36,9 +36,9 @@ def parse_arguments():
                         "to be in cases/<casename>/. The runs are executed "
                         "sequentially in the order they're given here.")
 
-    jobs_help = ("List of job-names to be executed. A job is a .py-"
+    jobs_help = ("List of job names to be executed. A job is a .py "
                  "file in jobs/ with a main()-function which "
-                 "handles one aspect of the processing chain, for "
+                 "handles one aspect of the Processing Chain, for "
                  "example copying meteo-input data or launching a "
                  "job for int2lm. "
" "Jobs are executed in the order in which they are " @@ -52,27 +52,20 @@ def parse_arguments(): help=jobs_help, default=None) - force_help = ("Force the processing chain to redo all specified jobs," + force_help = ("Force the Processing Chain to redo all specified jobs," " even if they have been started already or were finished" " previously. WARNING: Only logfiles get deleted," " other effects of a given job (copied files etc.)" - " are simply overwritten. This may cause errors.") + " are simply overwritten. This may cause errors" + " or unexpected behavior.") parser.add_argument("-f", "--force", action='store_true', help=force_help) - tries_help = ("Amount of time the cosmo job is re-tried before crashing." - " Default is 1.") - parser.add_argument("-t", - "--try", - help=tries_help, - dest="ntry", - type=int, - default=1) - resume_help = ( - "Resume the processing chain by restarting the last unfinished job." + "Resume the Processing Chain by restarting the last unfinished job." " WARNING: Only the logfile gets deleted," " other effects of a given job (copied files etc.)" - " are simply overwritten. This may cause errors.") + " are simply overwritten. This may cause errors." + " or unexpected behavior.") parser.add_argument("-r", "--resume", help=resume_help, @@ -90,7 +83,7 @@ def __init__(self, casename): """Initialize an instance of the Config class. Initializes an instance of the Config class with user-specific - and default attributes. The class represents a processing chain for a + and default attributes. The class represents a Processing Chain for a particular case, and its attributes are populated based on the provided `casename`. @@ -98,7 +91,7 @@ def __init__(self, casename): ---------- casename : str The identifier for the case, typically specifying the configuration - and settings to be used in the processing chain. + and settings to be used in the Processing Chain. Attributes ---------- @@ -107,13 +100,13 @@ def __init__(self, casename): email : str The user's email address, initially set to None and updated using the `set_email` method. casename : str - The specified case name for the processing chain. + The specified case name for the Processing Chain. chain_src_dir : str - The source directory for the processing chain, typically the current working directory. + The source directory for the Processing Chain, typically the current working directory. case_path : str The path to the case directory under 'cases/' for the specified `casename`. work_root : str - The root directory for processing chain execution, typically located under the source directory. + The root directory for Processing Chain execution, typically located under the source directory. Notes ----- @@ -414,15 +407,15 @@ def create_vars_from_dicts(self): def run_chain(work_root, model_cfg, cfg, startdate_sim, enddate_sim, job_names, force, resume): - """Run the processing chain, managing job execution and logging. + """Run the Processing Chain, managing job execution and logging. - This function sets up and manages the execution of a processing chain, handling + This function sets up and manages the execution of a Processing Chain, handling job execution, logging, and various configuration settings. Parameters ---------- work_root : str - The path to the directory where the processing chain writes files during execution. + The path to the directory where the Processing Chain writes files during execution. model_cfg : dict Configuration settings for the modeling framework. 
     cfg : Config
@@ -623,7 +616,7 @@ def run_chain(work_root, model_cfg, cfg, startdate_sim, enddate_sim, job_names,
 def restart_runs(work_root, model_cfg, cfg, job_names, force, resume):
     """Start subchains in specified intervals and manage restarts.
 
-    This function slices the total runtime of the processing chain according to the
+    This function slices the total runtime of the Processing Chain according to the
     `cfg.restart_step_hours` configuration. It calls `run_chain()` for each
     specified interval.
 
@@ -676,7 +669,7 @@ def restart_runs(work_root, model_cfg, cfg, job_names, force, resume):
 def restart_runs_spinup(work_root, model_cfg, cfg, job_names, force, resume):
     """Start subchains in specified intervals and manage restarts with spin-up.
 
-    This function slices the total runtime of the processing chain according to the
+    This function slices the total runtime of the Processing Chain according to the
     `cfg.restart_step_hours` configuration. It calls `run_chain()` for each
     specified interval, managing restarts with spin-up.
 
@@ -758,9 +751,9 @@ def load_model_config_yaml(yamlfile):
 
 
 if __name__ == '__main__':
-    """Main script for running a processing chain.
+    """Main script for running a Processing Chain.
 
-    This script handles the execution of a processing chain for one or more specified cases. It loads model configurations, prepares the environment, and starts the chain based on the provided settings.
+    This script handles the execution of a Processing Chain for one or more specified cases. It loads model configurations, prepares the environment, and starts the chain based on the provided settings.
 
     Parameters
     ----------
@@ -827,4 +820,4 @@ def load_model_config_yaml(yamlfile):
                              force=args.force,
                              resume=args.resume)
 
-    print('>>> Finished the processing chain successfully <<<')
+    print('>>> Finished the Processing Chain successfully <<<')
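
A note on the chunking described in the updated "What it Does" section: the time
slicing performed by ``restart_runs`` can be pictured with a short sketch. This is
illustrative only and not the code from ``run_chain.py``; the function name
``chunk_bounds`` and the example dates are made up for the demonstration::

    # Sketch: slice a simulation period into restart chunks, in the spirit of
    # restart_runs() dividing [startdate, enddate] by cfg.restart_step_hours.
    from datetime import datetime, timedelta

    def chunk_bounds(startdate, enddate, restart_step_hours):
        """Yield (chunk_start, chunk_end) pairs covering the full period."""
        step = timedelta(hours=restart_step_hours)
        chunk_start = startdate
        while chunk_start < enddate:
            # The last chunk may be shorter than a full restart step.
            chunk_end = min(chunk_start + step, enddate)
            yield chunk_start, chunk_end
            chunk_start = chunk_end

    # A 24-hour simulation with 12-hourly restarts yields two chunks.
    for start, end in chunk_bounds(datetime(2015, 1, 1), datetime(2015, 1, 2), 12):
        print(f"chunk: {start:%Y-%m-%d %H:%M} -> {end:%Y-%m-%d %H:%M}")

Each such chunk then gets its own directory structure and job submissions, as the
documentation above describes.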
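
Likewise, the dependency handling mentioned in the updated docs ("submits the
specified jobs via ``sbatch`` ... taking job dependencies into account") follows
the usual Slurm pattern of chaining job IDs. The helper ``submit`` and the
runscript names below are hypothetical; only ``--parsable`` and
``--dependency=afterok:...`` are standard ``sbatch`` options::

    # Sketch: submit runscripts so that each job starts only after its
    # dependencies have completed successfully (Slurm 'afterok' dependency).
    import subprocess

    def submit(runscript, dep_ids=None):
        """Submit a runscript via sbatch and return its Slurm job id."""
        cmd = ["sbatch", "--parsable"]  # --parsable prints just the job id
        if dep_ids:
            cmd.append("--dependency=afterok:" + ":".join(dep_ids))
        cmd.append(runscript)
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
        return result.stdout.strip()

    # Hypothetical chain: prepare -> int2lm -> cosmo
    prep_id = submit("prepare_cosmo.job")
    int2lm_id = submit("int2lm.job", dep_ids=[prep_id])
    cosmo_id = submit("cosmo.job", dep_ids=[int2lm_id])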