Commit
Merge branch 'NOAA-GFDL:main' into nish-pod
nishsilva authored Jan 27, 2025
2 parents 254895d + 26dee6d commit abfdff9
Showing 11 changed files with 108 additions and 60 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/mdtf_tests.yml
@@ -90,9 +90,9 @@ jobs:
pip install mdtf-test-data
mkdir mdtf_test_data ; cd mdtf_test_data
# generate the data and run unit tests
mdtf_synthetic.py -c GFDL --startyear 1 --nyears 10 --unittest
mdtf_synthetic.py -c GFDL --startyear 1 --nyears 10 --freq day --unittest
mdtf_synthetic.py -c NCAR --startyear 1975 --nyears 7
mdtf_synthetic.py -c CMIP --startyear 1990 --nyears 20
mdtf_synthetic.py -c CMIP --startyear 1990 --nyears 20 --freq day mon
cd ../
mkdir wkdir
## make input data directories
4 changes: 2 additions & 2 deletions README.md
@@ -224,8 +224,8 @@ mamba env create --force -q -f ./src/conda/_env_synthetic_data.yml
conda activate _MDTF_synthetic_data
pip install mdtf-test-data
mkdir mdtf_test_data && cd mdtf_test_data
mdtf_synthetic.py -c CMIP --startyear 1980 --nyears 5
mdtf_synthetic.py -c CMIP --startyear 1985 --nyears 5
mdtf_synthetic.py -c CMIP --startyear 1980 --nyears 5 --freq day
mdtf_synthetic.py -c CMIP --startyear 1985 --nyears 5 --freq day
```
Then, modify the ``path`` entries in diagnostic/example_multicase/esm_catalog_CMIP_synthetic_r1i1p1f1_gr1.csv, and
the `"catalog_file":` path in diagnostic/example_multicase/esm_catalog_CMIP_synthetic_r1i1p1f1_gr1.json to include the
101 changes: 62 additions & 39 deletions doc/sphinx/ref_cli.rst
@@ -2,8 +2,8 @@
:language: console
:class: highlight
.. _ref-cli:
Command-line options
====================
Runtime configuration options
=============================

Running the package
-------------------
@@ -35,83 +35,106 @@ To get a list of topics recognized by the command, run :console:`% mdtf info`.


.. _ref-cli-options:

General options
+++++++++++++++

-h, --help Show a help message, potentially more up-to-date than this page, along with your site's default values
for these options.

-f Path to a user configuration file that sets options listed here. This can be a JSON file of the form given in
`templates/runtime_config.jsonc <https://github.com/NOAA-GFDL/MDTF-diagnostics/blob/main/templates/runtime_config.jsonc>`__
(which is intended to be copied and used as a template)

Runtime configuration file settings
-----------------------------------

Path settings
+++++++++++++

Locations of input and output data. All the paths in this section must be on a locally mounted filesystem. Environment variables in paths (e.g., ``$HOME``) are resolved at runtime according to the shell context the package is called from. Relative paths are resolved relative to the code directory.

--OBS-DATA-ROOT <OBS_DATA_ROOT> Required setting if running PODs that require observational data. Directory containing
-OBS_DATA_ROOT <str> Required setting if running PODs that require observational data. Directory containing
observational and supporting data required by individual PODs. Currently, this must be downloaded manually as part
of the framework installation. See :numref:`ref-download` of the :doc:`installation guide<start_install>` for instructions.
--WORK-DIR <WORKING_DIR> Working directory. This will be used as scratch storage by the framework and the PODs.
Optional; defaults to <*OUTPUT_DIR*> if not specified.
-o, --OUTPUT-DIR <OUTPUT_DIR> Required setting. Destination for output files.
of the framework installation. See :numref:`ref-download` of the :doc:`installation guide<start_install>` for
instructions.

-WORK_DIR <str> Optional. Working directory. This will be used as scratch storage by the framework and the PODs.
Defaults to <*OUTPUT_DIR*> if not specified.

-OUTPUT_DIR <str> Required setting. Destination for output files.

Data options
++++++++++++

Options that describe the input model data and how it should be obtained.
Options that describe the input model data and how it should be obtained. The settings are defined
in the example configuration files in the `templates/ <https://github.com/NOAA-GFDL/MDTF-diagnostics/tree/main/templates>`__
directory.

--convention <naming_convention> | The convention for variable names and units used in the input model data. Defaults
to ``CMIP``, for data produced as part of CMIP6 data request, or compatible with it.
|
| See the :doc:`ref_conventions` for documentation on the recognized values for this option.
-convention <str; CMIP | GFDL | CESM> The convention for variable names and units used in the input model data.

--large_file | Set this flag when running the package on a large volume of input model data: specifically, if the full
time series for any requested variable is over 4gb. This may impact performance for variables less than 4gb but
otherwise has no effect.
|
| When set, this causes the framework and PODs to use the netCDF-4 format (CDF-5 standard, using the HDF5 API;
| see the `netCDF FAQ <https://www.unidata.ucar.edu/software/netcdf/docs/faq.html#How-many-netCDF-formats-are-there-and-what-are-the-differences-among-them>`__) for all intermediate data files generated during the package run. If the flag is not set (default), the netCDF4 Classic format is used instead. Regardless of this setting, the package can read input model data in any netCDF4 format.
-large_file <bool> Set this flag when running the package on a large volume of input model data: specifically, if the
full time series for any requested variable is over 4 GB. This may impact performance for variables less than 4 GB but
otherwise has no effect.
When set, this causes the framework and PODs to use the netCDF-4 format (CDF-5 standard, using the HDF5 API;
see the `netCDF FAQ <https://www.unidata.ucar.edu/software/netcdf/docs/faq.html#How-many-netCDF-formats-are-there-and-what-are-the-differences-among-them>`__)
for all intermediate data files generated during the package run. If the flag is not set (default), the netCDF4
Classic format is used instead. Regardless of this setting, the package can read input model data in any
netCDF4 format.

--disable-preprocessor If set, this flag disables preprocessing of input model data done by the framework before the PODs are run. Specifically, this skips validation of ``standard_name`` and ``units`` CF attributes in file metadata, and skips unit conversion and level extraction functions. This is only provided as a workaround for input data which is known to have incorrect metadata: using this flag means that the user assumes responsibility for verifying that the input data has the units requested by all PODs being run.
Conda/micromamba settings
+++++++++++++++++++++++++
--conda_root path to anaconda, miniconda, or micromamba installation
--conda_env_root path to directory with conda enviroments
--micromamba_exe path to the micromamba executable. REQUIRED if using micromamba

-conda_root <str> path to anaconda, miniconda, or micromamba installation

-conda_env_root <str> path to directory with conda environments

-micromamba_exe <str> path to the micromamba executable. REQUIRED if using micromamba

Analysis settings
+++++++++++++++++

Settings determining what analyses the package performs.

CASENAME <name> Required setting. Identifier used to label this run of the package. Can be set to any string.
startdate <yyyymmdd> or <yyyymmddHHmmss> Required setting. Starting year of analysis period.
enddate <yyyymmdd> or <yyyymmddHHmmss> Required setting. Ending year of analysis period. The analysis period is taken
to be a **closed interval**
pod_list <list of POD identifiers> Specification for which diagnostics (PODs) the package should run on the model
data, given as a list separated by spaces. Optional; default behavior is to attempt to run all PODs.
-pod_list <list of POD identifiers> Specification for which diagnostics (PODs) the package should run on the model
data, given as a list separated by spaces.
Valid identifiers for PODs are the name(s) of the diagnostic(s) as given in the
`diagnostics/ <https://github.com/NOAA-GFDL/MDTF-diagnostics/tree/main/diagnostics>`__ directory.

-startdate <yyyymmdd> or <yyyymmddHHmmss> Required setting. Start date of the analysis period.

Valid identifiers for PODs are:
-enddate <yyyymmdd> or <yyyymmddHHmmss> Required setting. End date of the analysis period. The analysis period is taken
to be a **closed interval** (both endpoints are included).

- The name of the diagnostic as given in the
`diagnostics/ <https://github.com/NOAA/MDTF-diagnostics/tree/main/diagnostics>`__ directory.
-model <str> Optional. Name of model, mainly for user reference.

-realm <str> Optional. Dataset realm. May be used to refine query search. If not defined, the query uses the POD realm.

Runtime options
+++++++++++++++

Options that control how the package is deployed (how code dependencies are managed) and how the diagnostics are run.

-user_pp_scripts <list of strings> Optional. List of custom preprocessing script(s) to run on the data.
Place these scripts in the user_scripts directory of your copy of the MDTF-diagnostics repository. Note that
the framework will automatically run any scripts defined in the list.

Output options
++++++++++++++

Options determining what files are output by the package.

save-ps Set flag to have PODs save postscript figures in addition to bitmaps.
save-nc Set flag to have PODs save netCDF files of processed data.
save-non-nc Set flag to have PODs save all intermediate data **except** netCDF files.
make-variab-tar Set flag to save package output in a single .tar file. This will only contain HTML and bitmap plots,
regardless of whether the flags above are used.
overwrite If this flag is set, new runs of the package will overwrite any pre-existing results in <*OUTPUT_DIR*>.
-run_pp <bool> Set to true to run the preprocessor; default true.

-translate_data <bool> Set to true to perform data translation; default true.

-save_ps <bool> Set to true to have PODs save postscript figures in addition to bitmaps; default false.

-save_pp_data <bool> Set to true to have PODs save netCDF files of processed data; default true.

-make_variab_tar <bool> Set to true to save package output in a single .tar file. This will only contain HTML
and bitmap plots regardless of whether the flags above are used. Default false.

-overwrite <bool> Set to true to have new runs of the package overwrite any pre-existing results in <*OUTPUT_DIR*>;
default false.

-make_multicase_figure <bool> Generate html output for multiple figures per case. Default false.
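The ``startdate``/``enddate`` formats described above (``yyyymmdd`` or ``yyyymmddHHmmss``) can be validated with a small standalone sketch; ``parse_case_date`` is a hypothetical helper for illustration, not the framework's own ``check_date_format``:

```python
from datetime import datetime

def parse_case_date(date_string: str) -> datetime:
    """Parse the two date formats accepted for startdate/enddate."""
    for fmt in ("%Y%m%d%H%M%S", "%Y%m%d"):
        try:
            return datetime.strptime(date_string, fmt)
        except ValueError:
            continue
    raise ValueError(f"unrecognized date format: {date_string!r}")

# Both date styles used in the template case lists parse cleanly.
print(parse_case_date("19800101120000"))  # 1980-01-01 12:00:00
print(parse_case_date("19841231"))        # 1984-12-31 00:00:00
```

Trying each format in order and falling through on ``ValueError`` mirrors how a permissive parser can accept either precision without separate code paths.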
4 changes: 2 additions & 2 deletions doc/sphinx/ref_container.rst
@@ -58,8 +58,8 @@ We generate our synthetic data by running:
micromamba activate _MDTF_synthetic_data
pip install mdtf-test-data
mkdir mdtf_test_data && cd mdtf_test_data
mdtf_synthetic.py -c CMIP --startyear 1980 --nyears 5
mdtf_synthetic.py -c CMIP --startyear 1985 --nyears 5
mdtf_synthetic.py -c CMIP --startyear 1980 --nyears 5 --freq day
mdtf_synthetic.py -c CMIP --startyear 1985 --nyears 5 --freq day
Now would be a good time to generate a catalog for the synthetic data, but, for the sake
of testing, we provide a catalog for the files needed to run the example POD.
4 changes: 2 additions & 2 deletions doc/sphinx/start_install.rst
@@ -276,8 +276,8 @@ using the configuration in the
% conda activate _MDTF_synthetic_data
% pip install mdtf-test-data
% mkdir mdtf_test_data && cd mdtf_test_data
% mdtf_synthetic.py -c CMIP --startyear 1980 --nyears 5
% mdtf_synthetic.py -c CMIP --startyear 1985 --nyears 5
% mdtf_synthetic.py -c CMIP --startyear 1980 --nyears 5 --freq day
% mdtf_synthetic.py -c CMIP --startyear 1985 --nyears 5 --freq day
Obtaining supporting data for 3rd-generation and older single-run PODs
-------------------------------------------------------------------------
12 changes: 9 additions & 3 deletions src/cli.py
@@ -124,11 +124,12 @@ def check_date_format(date_string: str):

def verify_case_atts(case_list: util.NameSpace):
# required case attributes
case_attrs = ['convention', 'startdate', 'enddate']
conventions = ['cmip', 'gfdl', 'cesm']
required_case_attrs = {'convention', 'startdate', 'enddate'}
optional_case_attrs = {'realm', 'model'}
conventions = {'cmip', 'gfdl', 'cesm'}
for name, att_dict in case_list.items():
try:
all(att in att_dict.keys() for att in case_attrs)
all(att in att_dict.keys() for att in required_case_attrs)
except KeyError:
raise util.exceptions.MDTFBaseException(
f"Missing or incorrect convention, startdate, or enddate for case {name}"
@@ -139,6 +140,11 @@ def verify_case_atts(case_list: util.NameSpace):
raise util.exceptions.MDTFBaseException(
f"Convention {att_dict['convention']} not supported"
)
try:
{att for att in att_dict.keys()}.issubset(required_case_attrs.union(optional_case_attrs))
except KeyError:
raise util.exceptions.MDTFBaseException(f"Runtime case attribute is not a required or optional attribute. Check runtime config file for typo or unsupported entry.")

st = check_date_format(att_dict.startdate)
en = check_date_format(att_dict.enddate)

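Note that in the hunk above, ``all(...)`` and ``set.issubset(...)`` return booleans rather than raising ``KeyError``, so the ``try/except`` blocks never fire. A standalone sketch of the intended validation with explicit raises might look like this (``verify_case_attrs_sketch`` is hypothetical, and plain ``ValueError`` stands in for ``util.exceptions.MDTFBaseException``):

```python
def verify_case_attrs_sketch(case_list: dict) -> None:
    """Validate per-case attributes against required/optional sets."""
    required = {"convention", "startdate", "enddate"}
    optional = {"realm", "model"}
    conventions = {"cmip", "gfdl", "cesm"}
    for name, atts in case_list.items():
        # Explicitly compute what is missing instead of relying on an exception.
        missing = required - atts.keys()
        if missing:
            raise ValueError(f"case {name} is missing attributes: {sorted(missing)}")
        if atts["convention"].lower() not in conventions:
            raise ValueError(f"convention {atts['convention']} not supported")
        # Reject any attribute that is neither required nor optional.
        unknown = atts.keys() - (required | optional)
        if unknown:
            raise ValueError(f"case {name} has unsupported attributes: {sorted(unknown)}")

# Passes silently: all required attributes present, convention recognized.
verify_case_attrs_sketch(
    {"c1": {"convention": "CMIP", "startdate": "19800101", "enddate": "19841231"}}
)
```

Computing ``missing`` and ``unknown`` as set differences gives actionable error messages, whereas a boolean check inside ``try/except`` silently accepts bad input.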
9 changes: 7 additions & 2 deletions src/data_sources.py
@@ -49,6 +49,11 @@ def __init__(self, case_name: str,
k: case_dict[k] for k in ("startdate", "enddate", "convention")
})
self.env_vars.update({"CASENAME": case_name})
optional_case_attrs = {'realm'}
for att in optional_case_attrs:
if case_dict.get(att, None) is not None:
self.query[att] = case_dict[att]


@property
def _children(self):
@@ -66,10 +71,11 @@ def set_date_range(self, startdate: str, enddate: str):
self.date_range = util.DateRange(start=startdate, end=enddate)

def set_query(self, var: varlist_util.VarlistEntry, path_regex: str):
realm_regex = var.realm + '*'
date_range = var.T.range
var_id = var.name
standard_name = var.standard_name
if self.query['realm'] == '':
self.query['realm'] = var.realm
if var.translation.convention is not None:
var_id = var.translation.name
standard_name = var.translation.standard_name
@@ -90,7 +96,6 @@ def set_query(self, var: varlist_util.VarlistEntry, path_regex: str):
# the variable is translated
self.query['frequency'] = freq
self.query['path'] = path_regex
self.query['realm'] = realm_regex
self.query['standard_name'] = standard_name
self.query['variable_id'] = var_id

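The data_sources.py change above lets an optional case-level ``realm`` refine the catalog query, falling back to the POD's realm when the case does not set one. A minimal standalone sketch of that logic (``refine_query`` is a hypothetical helper, not the framework API):

```python
def refine_query(query: dict, case_dict: dict, pod_realm: str) -> dict:
    """Copy optional case attributes into the query; fall back to the POD realm."""
    refined = dict(query)
    for att in ("realm",):  # optional case attributes recognized by the query
        if case_dict.get(att) is not None:
            refined[att] = case_dict[att]
    # If the case supplied no realm, use the realm declared by the POD.
    if not refined.get("realm"):
        refined["realm"] = pod_realm
    return refined

print(refine_query({"variable_id": "tas"}, {"realm": "atmos"}, "aerosol"))
# {'variable_id': 'tas', 'realm': 'atmos'}
```

The case-level value takes precedence, which matches the commit's intent of using ``realm`` only to refine the query when the dataset's realm definition differs from the POD's.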
12 changes: 7 additions & 5 deletions templates/module_config.jsonc
@@ -15,8 +15,8 @@
// > pip install mdtf-test-data
// > cd <root directory>/mdtf
// > mkdir mdtf_test_data && cd mdtf_test_data
// > mdtf_synthetic.py -c CMIP --startyear 1980 --nyears 5
// > mdtf_synthetic.py -c CMIP --startyear 1985 --nyears 5
// > mdtf_synthetic.py -c CMIP --startyear 1980 --nyears 5 --freq day
// > mdtf_synthetic.py -c CMIP --startyear 1985 --nyears 5 --freq day
// Note that MODEL_DATA_ROOT assumes that mdtf_test_data is one directory above MDTF-diagnostics
// in this sample config file
{
@@ -51,15 +51,17 @@
"model": "test",
"convention": "CMIP",
"startdate": "19800101120000",
"enddate": "19841231000000"
"enddate": "19841231000000",
"realm": "atmos"
}
,
"CMIP_Synthetic_r1i1p1f1_gr1_19850101-19891231":
{
"model": "test",
"convention": "CMIP",
"startdate": "19850101",
"enddate": "19891231"
"enddate": "19891231",
"realm": "atmos"
}
},
// PATHS ---------------------------------------------------------------------
@@ -100,7 +102,7 @@
// Settings affecting what output is generated:
// Set to true to run the preprocessor; default true:
"run_pp": true,
// Set to true to perform data translation; default false:
// Set to true to perform data translation; default true:
"translate_data": true,
// Set to true to have PODs save postscript figures in addition to bitmaps.
"save_ps": false,
4 changes: 4 additions & 0 deletions templates/module_config.yml
@@ -17,18 +17,22 @@ module_list:
"--out": "../wkdir/tempestextremes.dat"

# Case list entries (must be unique IDs for each simulation)
# convention: cmip, cesm, or gfdl
# optional: model, realm (used to refine query if dataset realm differs from POD realm)
case_list:
"CMIP_Synthetic_r1i1p1f1_gr1_19800101-19841231" :
model: "test"
convention: "CMIP"
startdate: "19800101120000"
enddate: "19841231000000"
realm: "atmos"

"CMIP_Synthetic_r1i1p1f1_gr1_19850101-19891231" :
model: "test"
convention: "CMIP"
startdate: "19850101000000"
enddate: "19891231000000"
realm: "atmos"

### Data location settings ###
# Required: full or relative path to ESM-intake catalog header file
Expand Down
8 changes: 6 additions & 2 deletions templates/runtime_config.jsonc
@@ -15,8 +15,8 @@
// > pip install mdtf-test-data
// > cd <root directory>/mdtf
// > mkdir mdtf_test_data && cd mdtf_test_data
// > mdtf_synthetic.py -c CMIP --startyear 1980 --nyears 5
// > mdtf_synthetic.py -c CMIP --startyear 1985 --nyears 5
// > mdtf_synthetic.py -c CMIP --startyear 1980 --nyears 5 --freq day
// > mdtf_synthetic.py -c CMIP --startyear 1985 --nyears 5 --freq day
// Note that MODEL_DATA_ROOT assumes that mdtf_test_data is one directory above MDTF-diagnostics
// in this sample config file
{
@@ -27,13 +27,16 @@
"example_multicase"
],
// Each case corresponds to a different simulation/output dataset
// convention: cmip, cesm, or gfdl
// startdate, enddate: either YYYY-MM-DD, YYYYMMDD:HHMMSS, or YYYY-MM-DD:HHMMSS
// optional: model, realm
"case_list":
{
"CMIP_Synthetic_r1i1p1f1_gr1_19800101-19841231":
{
"model": "test",
"convention": "CMIP",
"realm": "atmos", // can be used to refine query if dataset realm definition differs from POD realm
"startdate": "19800101120000",
"enddate": "19841231000000"
}
@@ -42,6 +45,7 @@
{
"model": "test",
"convention": "CMIP",
"realm": "atmos",
"startdate": "19850101",
"enddate": "19891231"
}
6 changes: 5 additions & 1 deletion templates/runtime_config.yml
@@ -7,19 +7,23 @@ pod_list:
- "example_multicase"

# Case list entries (must be unique IDs for each simulation)
# convention: cmip, cesm, or gfdl
# required: convention, startdate, enddate
# optional: model, realm (used to refine query if dataset realm differs from POD realm)
case_list:
"CMIP_Synthetic_r1i1p1f1_gr1_19800101-19841231" :
model: "test"
convention: "CMIP"
startdate: "19800101120000"
enddate: "19841231000000"
realm: "atmos"

"CMIP_Synthetic_r1i1p1f1_gr1_19850101-19891231" :
model: "test"
convention: "CMIP"
startdate: "19850101000000"
enddate: "19891231000000"
realm: "atmos"

### Data location settings ###
# Required: full or relative path to ESM-intake catalog header file
DATA_CATALOG: "./diagnostics/example_multicase/esm_catalog_CMIP_synthetic_r1i1p1f1_gr1.json"
