From 95e53e7a5b92a0eb5a7fe4354faa47e2aacd0e63 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Thu, 12 Sep 2024 10:05:25 +0800 Subject: [PATCH 01/23] Updated .gitignore to include some vs code settings --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index da4e917f..b18b515f 100644 --- a/.gitignore +++ b/.gitignore @@ -131,3 +131,6 @@ dmypy.json sandpit.ipynb *.DS_Store bin/build_all.sh.o* + +# Vs Code +.vscode/ \ No newline at end of file From 13b38a90f1a14fee2b79ed5962181005b88c6464 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Thu, 12 Sep 2024 10:08:36 +0800 Subject: [PATCH 02/23] * Added type hints * Replaced a couple of try/excepts with .get in `src/access_nri_intake/catalog/translators.py` * Updated a misleading docstring --- mypy.ini | 3 ++ src/access_nri_intake/catalog/manager.py | 24 +++++------ src/access_nri_intake/catalog/translators.py | 42 +++++++++---------- src/access_nri_intake/source/builders.py | 43 +++++++++++--------- src/access_nri_intake/source/utils.py | 16 ++++---- src/access_nri_intake/utils.py | 12 ++++-- 6 files changed, 76 insertions(+), 64 deletions(-) create mode 100644 mypy.ini diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 00000000..a47639ed --- /dev/null +++ b/mypy.ini @@ -0,0 +1,3 @@ +[mypy] +python_version = 3.12 +ignore_missing_imports = True \ No newline at end of file diff --git a/src/access_nri_intake/catalog/manager.py b/src/access_nri_intake/catalog/manager.py index f3d03243..ed954e36 100644 --- a/src/access_nri_intake/catalog/manager.py +++ b/src/access_nri_intake/catalog/manager.py @@ -30,7 +30,7 @@ class CatalogManager: Add/update intake sources in an intake-dataframe-catalog like the ACCESS-NRI catalog """ - def __init__(self, path): + def __init__(self, path : str): """ Initialise a CatalogManager instance to add/update intake sources in a intake-dataframe-catalog like the ACCESS-NRI catalog @@ -58,14 +58,14 @@ def __init__(self, path): def build_esm( 
self, - name, - description, + name : str, + description : str, builder, - path, + path : list[str] | str, translator=DefaultTranslator, - metadata=None, - directory=None, - overwrite=False, + metadata : dict | None = None, + directory : str | None = None, + overwrite : bool =False, **kwargs, ): """ @@ -124,12 +124,12 @@ def build_esm( def load( self, - name, - description, - path, - driver="esm_datastore", + name : str, + description : str, + path : str, + driver : str ="esm_datastore", translator=DefaultTranslator, - metadata=None, + metadata : dict | None =None, **kwargs, ): """ diff --git a/src/access_nri_intake/catalog/translators.py b/src/access_nri_intake/catalog/translators.py index 509fe362..18c4fb8c 100644 --- a/src/access_nri_intake/catalog/translators.py +++ b/src/access_nri_intake/catalog/translators.py @@ -6,13 +6,18 @@ like the ACCESS-NRI catalog """ +from __future__ import annotations from functools import partial +from typing import Callable, TYPE_CHECKING +from intake import DataSource import pandas as pd import tlz from . import COLUMNS_WITH_ITERABLES +if TYPE_CHECKING: + from intake import DataSource class TranslatorError(Exception): "Generic Exception for the Translator classes" @@ -25,7 +30,7 @@ class DefaultTranslator: of metadata for use in an intake-dataframe-catalog. """ - def __init__(self, source, columns): + def __init__(self, source : DataSource, columns : list[str]): """ Initialise a DefaultTranslator. This Translator works as follows: @@ -45,12 +50,12 @@ def __init__(self, source, columns): self.source = source self.columns = columns - self._dispatch = { + self._dispatch : dict[str, Callable[[],pd.Series]] = { column: partial(self._default_translator, column=column) for column in columns } - def _default_translator(self, column): + def _default_translator(self, column: str) -> pd.Series: """ Try to translate a column from a source using the default translator. 
This translator works as follows: - If the input source is an intake-esm datastore, the translator will first look for the column in the @@ -96,7 +101,7 @@ def _default_translator(self, column): return pd.Series([val] * len_df) - def translate(self, groupby=None): + def translate(self, groupby : list[str] | None = None) -> pd.DataFrame: """ Return the translated :py:class:`~pandas.DataFrame` of metadata and merge into set of set of rows with unique values of the columns specified. @@ -149,7 +154,7 @@ class Cmip6Translator(DefaultTranslator): CMIP6 Translator for translating metadata from the NCI CMIP6 intake datastores. """ - def __init__(self, source, columns): + def __init__(self, source : DataSource, columns :list[str]): """ Initialise a Cmip6Translator @@ -197,7 +202,7 @@ class Cmip5Translator(DefaultTranslator): CMIP5 Translator for translating metadata from the NCI CMIP5 intake datastores. """ - def __init__(self, source, columns): + def __init__(self, source : DataSource, columns : list[str]): """ Initialise a Cmip5Translator @@ -245,7 +250,7 @@ class EraiTranslator(DefaultTranslator): ERAI Translator for translating metadata from the NCI ERA-Interim intake datastore. """ - def __init__(self, source, columns): + def __init__(self, source : DataSource, columns : list[str]): """ Initialise a EraiTranslator @@ -267,7 +272,7 @@ def _variable_translator(self): return _to_tuple(self.source.df["variable"]) -def _cmip_frequency_translator(series): +def _cmip_frequency_translator(series : pd.Series) -> pd.Series: """ Return frequency from CMIP frequency metadata """ @@ -288,21 +293,19 @@ def _translate(string): "yrPt": "1yr", } - try: - return translations[string] - except KeyError: - return string + return translations.get(string, string) return series.apply(lambda string: _translate(string)) -def _cmip_realm_translator(series): +def _cmip_realm_translator(series) -> pd.Series: """ - Return realm from CMIP realm metadata, fixing some issues. 
This function returns - a tuple as there are sometimes multiple realms per cmip asset + Return realm from CMIP realm metadata, fixing some issues. This function takes + a series of strings and returns a series of tuples as there are sometimes multiple + realms per cmip asset """ - def _translate(string): + def _translate(string : str) -> tuple[str, ...]: translations = { "na": "none", "landonly": "land", @@ -313,10 +316,7 @@ def _translate(string): raw_realms = string.split(" ") realms = [] for realm in raw_realms: - try: - realm = translations[realm] - except KeyError: - pass + realm = translations.get(realm, realm) if realm not in realms: realms.append(realm) return tuple(realms) @@ -324,7 +324,7 @@ def _translate(string): return series.apply(lambda string: _translate(string)) -def _to_tuple(series): +def _to_tuple(series : pd.Series) -> pd.Series: """ Make each entry in the provided series a tuple diff --git a/src/access_nri_intake/source/builders.py b/src/access_nri_intake/source/builders.py index 13aa5249..cee340c6 100644 --- a/src/access_nri_intake/source/builders.py +++ b/src/access_nri_intake/source/builders.py @@ -16,7 +16,7 @@ from .utils import EmptyFileError, get_timeinfo # Frequency translations -FREQUENCIES = { +FREQUENCIES : dict[str, tuple[int, str]] = { "daily": (1, "day"), "_dai$": (1, "day"), "month": (1, "mon"), @@ -47,19 +47,19 @@ class BaseBuilder(Builder): """ # Base class carries an empty set - PATTERNS = [] + PATTERNS : list = [] def __init__( self, - path, - depth=0, - exclude_patterns=None, - include_patterns=None, - data_format="netcdf", - groupby_attrs=None, - aggregations=None, - storage_options=None, - joblib_parallel_kwargs={"n_jobs": multiprocessing.cpu_count()}, + path : str | list[str], + depth : int = 0, + exclude_patterns : list[str] | None = None, + include_patterns : list[str] | None = None, + data_format : str ="netcdf", + groupby_attrs : list[str] | None = None, + aggregations : list[dict] | None = None, + storage_options : 
dict | None = None, + joblib_parallel_kwargs : dict ={"n_jobs": multiprocessing.cpu_count()}, ): """ This method should be overwritten. The expection is that some of these arguments @@ -113,7 +113,7 @@ def parse(self): self._parse() return self - def _save(self, name, description, directory): + def _save(self, name : str, description : str, directory : str | None): super().save( name=name, path_column_name=PATH_COLUMN, @@ -128,7 +128,7 @@ def _save(self, name, description, directory): to_csv_kwargs={"compression": "gzip"}, ) - def save(self, name, description, directory=None): + def save(self, name : str, description : str , directory : str | None = None) -> None: """ Save datastore contents to a file. @@ -210,8 +210,8 @@ def parser(file): @classmethod def parse_access_filename( - cls, filename, patterns=None, frequencies=FREQUENCIES, redaction_fill: str = "X" - ): + cls, filename : str, patterns=None, frequencies=FREQUENCIES, redaction_fill: str = "X" + ) -> tuple[str, str | None, str | None]: """ Parse an ACCESS model filename and return a file id and any time information @@ -260,7 +260,7 @@ def parse_access_filename( return file_id, timestamp, frequency @classmethod - def parse_access_ncfile(cls, file, time_dim="time"): + def parse_access_ncfile(cls, file : str , time_dim : str ="time"): """ Get Intake-ESM datastore entry info from an ACCESS netcdf file @@ -273,13 +273,18 @@ def parse_access_ncfile(cls, file, time_dim="time"): Returns ------- + outputs: tuple + + Raises + ------ + EmptyFileError: If the file contains no variables """ - file = Path(file) - filename = file.name + file_path = Path(file) + filename = file_path.name file_id, filename_timestamp, filename_frequency = cls.parse_access_filename( - file.stem + file_path.stem ) with xr.open_dataset( diff --git a/src/access_nri_intake/source/utils.py b/src/access_nri_intake/source/utils.py index a3a8cfe9..8af4cacd 100644 --- a/src/access_nri_intake/source/utils.py +++ b/src/access_nri_intake/source/utils.py 
@@ -13,7 +13,7 @@ class EmptyFileError(Exception): pass -def _add_month_start(time, n): +def _add_month_start(time, n : int): """Add months to cftime datetime and truncate to start""" year = time.year + ((time.month + n - 1) // 12) month = (time.month + n - 1) % 12 + 1 @@ -22,7 +22,7 @@ def _add_month_start(time, n): ) -def _add_year_start(time, n): +def _add_year_start(time, n : int): """Add years to cftime datetime and truncate to start""" return time.replace( year=time.year + n, month=1, day=1, hour=0, minute=0, second=0, microsecond=0 @@ -59,7 +59,7 @@ def _guess_start_end_dates(ts, te, frequency): return ts, te -def get_timeinfo(ds, filename_frequency, time_dim): +def get_timeinfo(ds, filename_frequency, time_dim : str ) -> tuple[str, str, str]: """ Get start date, end date and frequency of a xarray dataset. Stolen and adapted from the cosima cookbook, see @@ -109,17 +109,17 @@ def _todate(t): # TODO: This is not a very good way to get the frequency if dt.days >= 365: years = round(dt.days / 365) - frequency = (years, "yr") + frequency = (years, "yr") # type: ignore elif dt.days >= 28: months = round(dt.days / 30) - frequency = (months, "mon") + frequency = (months, "mon") # type: ignore elif dt.days >= 1: - frequency = (dt.days, "day") + frequency = (dt.days, "day") # type: ignore elif dt.seconds >= 3600: hours = round(dt.seconds / 3600) - frequency = (hours, "hr") + frequency = (hours, "hr") # type: ignore else: - frequency = (None, "subhr") + frequency = (None, "subhr") # type: ignore if filename_frequency: if filename_frequency != frequency: diff --git a/src/access_nri_intake/utils.py b/src/access_nri_intake/utils.py index af2e6a61..bfb5a35c 100644 --- a/src/access_nri_intake/utils.py +++ b/src/access_nri_intake/utils.py @@ -11,7 +11,7 @@ import yaml -def get_jsonschema(url, known_hash, required): +def get_jsonschema(url : str , known_hash : str , required : list) -> tuple[dict, dict]: """ Download a jsonschema from a url. 
Returns the unaltered jsonschema and a version with the "required" key matching the properties provided. @@ -46,8 +46,7 @@ def get_jsonschema(url, known_hash, required): return schema, schema_required - -def load_metadata_yaml(path, jsonschema): +def load_metadata_yaml(path : str, jsonschema : dict) -> dict: """ Load a metadata.yaml file, leaving dates as strings, and validate against a jsonschema, allowing for tuples as arrays @@ -84,7 +83,7 @@ def remove_implicit_resolver(cls, tag_to_remove): return metadata -def validate_against_schema(instance, schema): +def validate_against_schema(instance : dict, schema : dict) -> None: """ Validate a dictionary against a jsonschema, allowing for tuples as arrays @@ -94,6 +93,11 @@ def validate_against_schema(instance, schema): The instance to validate schema: dict The jsonschema + + Raises + ------ + jsonschema.exceptions.ValidationError + If the instance does not match the schema """ Validator = jsonschema.validators.validator_for(schema) From f89ece163ffecb0102346a35eccf673c81f02269 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Fri, 13 Sep 2024 13:17:00 +0800 Subject: [PATCH 03/23] More type hints --- src/access_nri_intake/source/builders.py | 19 +++++++++++------ src/access_nri_intake/source/utils.py | 26 +++++++++++++++++++++++- 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/src/access_nri_intake/source/builders.py b/src/access_nri_intake/source/builders.py index cee340c6..7fac34f4 100644 --- a/src/access_nri_intake/source/builders.py +++ b/src/access_nri_intake/source/builders.py @@ -210,7 +210,8 @@ def parser(file): @classmethod def parse_access_filename( - cls, filename : str, patterns=None, frequencies=FREQUENCIES, redaction_fill: str = "X" + cls, filename : str, patterns : list[str] | None =None, + frequencies : dict = FREQUENCIES, redaction_fill: str = "X" ) -> tuple[str, str | None, str | None]: """ Parse an ACCESS model filename and return a file id and any time information @@ -219,16 
+220,22 @@ def parse_access_filename( ---------- filename: str The filename to parse with the extension removed + patterns: list of str, optional + A list of regex patterns to match against the filename. If None, use the class PATTERNS + frequencies: dict, optional + A dictionary of regex patterns to match against the filename to determine the frequency + redaction_fill: str, optional + The character to replace time information with. Defaults to "X" Returns ------- file_id: str The file id constructed by redacting time information and replacing non-python characters with underscores - timestamp: str - A string of the redacted time information (e.g. "1990-01") - frequency: str - The frequency of the file if available in the filename + timestamp: str | None + A string of the redacted time information (e.g. "1990-01") if available, otherwise None + frequency: str | None + The frequency of the file if available in the filename, otherwise None """ if patterns is None: patterns = cls.PATTERNS @@ -260,7 +267,7 @@ def parse_access_filename( return file_id, timestamp, frequency @classmethod - def parse_access_ncfile(cls, file : str , time_dim : str ="time"): + def parse_access_ncfile(cls, file : str , time_dim : str = "time") -> tuple: """ Get Intake-ESM datastore entry info from an ACCESS netcdf file diff --git a/src/access_nri_intake/source/utils.py b/src/access_nri_intake/source/utils.py index 8af4cacd..c51a5fde 100644 --- a/src/access_nri_intake/source/utils.py +++ b/src/access_nri_intake/source/utils.py @@ -5,9 +5,13 @@ import warnings from datetime import timedelta +from typing import TYPE_CHECKING import cftime +if TYPE_CHECKING: + import xarray as xr + class EmptyFileError(Exception): pass @@ -59,7 +63,11 @@ def _guess_start_end_dates(ts, te, frequency): return ts, te -def get_timeinfo(ds, filename_frequency, time_dim : str ) -> tuple[str, str, str]: +def get_timeinfo( + ds : xr.Dataset, + filename_frequency : str | None, + time_dim : str, + ) -> tuple[str, str, 
str]: """ Get start date, end date and frequency of a xarray dataset. Stolen and adapted from the cosima cookbook, see @@ -69,8 +77,24 @@ def get_timeinfo(ds, filename_frequency, time_dim : str ) -> tuple[str, str, str ---------- ds: :py:class:`xarray.Dataset` The dataset to parse the time info from + filename_frequency: str + Frequency as determined from the filename time_dim: str The name of the time dimension + + Returns + ------- + start_date: str + The start date of the dataset + end_date: str + The end date of the dataset + frequency: str + The frequency of the dataset + + Raises + ------ + EmptyFileError + If the dataset has a valid unlimited dimension, but no data """ def _todate(t): From aeaa9701e780867667227e99993a19e840915fab Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Wed, 18 Sep 2024 14:10:36 +0800 Subject: [PATCH 04/23] Type hint updates --- src/access_nri_intake/source/utils.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/access_nri_intake/source/utils.py b/src/access_nri_intake/source/utils.py index c51a5fde..3d4b7728 100644 --- a/src/access_nri_intake/source/utils.py +++ b/src/access_nri_intake/source/utils.py @@ -8,9 +8,7 @@ from typing import TYPE_CHECKING import cftime - -if TYPE_CHECKING: - import xarray as xr +import xarray as xr class EmptyFileError(Exception): @@ -103,7 +101,7 @@ def _todate(t): time_format = "%Y-%m-%d, %H:%M:%S" ts = None te = None - frequency = "fx" + frequency : str | tuple[int | None, str] = "fx" has_time = time_dim in ds if has_time: @@ -133,17 +131,17 @@ def _todate(t): # TODO: This is not a very good way to get the frequency if dt.days >= 365: years = round(dt.days / 365) - frequency = (years, "yr") # type: ignore + frequency = (years, "yr") elif dt.days >= 28: months = round(dt.days / 30) - frequency = (months, "mon") # type: ignore + frequency = (months, "mon") elif dt.days >= 1: - frequency = (dt.days, "day") # type: ignore + frequency = (dt.days, "day") elif 
dt.seconds >= 3600: hours = round(dt.seconds / 3600) - frequency = (hours, "hr") # type: ignore + frequency = (hours, "hr") else: - frequency = (None, "subhr") # type: ignore + frequency = (None, "subhr") if filename_frequency: if filename_frequency != frequency: From 6eb659ec30e3a25e71ab9920c00ebf4ba6f2e88f Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Mon, 23 Sep 2024 15:51:00 +1000 Subject: [PATCH 05/23] Updated data_vars => variables in parse_access_ncfile to allow coordinate variable searching & indexing --- src/access_nri_intake/source/builders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/access_nri_intake/source/builders.py b/src/access_nri_intake/source/builders.py index 13aa5249..b7c8c1b7 100644 --- a/src/access_nri_intake/source/builders.py +++ b/src/access_nri_intake/source/builders.py @@ -294,7 +294,7 @@ def parse_access_ncfile(cls, file, time_dim="time"): variable_standard_name_list = [] variable_cell_methods_list = [] variable_units_list = [] - for var in ds.data_vars: + for var in ds.variables: attrs = ds[var].attrs if "long_name" in attrs: variable_list.append(var) From ca1743e556afce7247ae7c23c886be902508e91f Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Tue, 24 Sep 2024 12:45:59 +1000 Subject: [PATCH 06/23] Updated all the builders to use a dataclass rather than a tuple: likely to be necessary for passing around coordinates as well as data variables as we begin to make coordinates indexable - I think we'll begin to get confused about what belongs where. 
--- src/access_nri_intake/catalog/manager.py | 24 +-- src/access_nri_intake/catalog/translators.py | 25 ++- src/access_nri_intake/source/builders.py | 215 ++++++------------- src/access_nri_intake/source/utils.py | 70 +++++- src/access_nri_intake/utils.py | 9 +- tests/test_builders.py | 39 ++-- 6 files changed, 180 insertions(+), 202 deletions(-) diff --git a/src/access_nri_intake/catalog/manager.py b/src/access_nri_intake/catalog/manager.py index ed954e36..f52e2126 100644 --- a/src/access_nri_intake/catalog/manager.py +++ b/src/access_nri_intake/catalog/manager.py @@ -30,7 +30,7 @@ class CatalogManager: Add/update intake sources in an intake-dataframe-catalog like the ACCESS-NRI catalog """ - def __init__(self, path : str): + def __init__(self, path: str): """ Initialise a CatalogManager instance to add/update intake sources in a intake-dataframe-catalog like the ACCESS-NRI catalog @@ -58,14 +58,14 @@ def __init__(self, path : str): def build_esm( self, - name : str, - description : str, + name: str, + description: str, builder, - path : list[str] | str, + path: list[str] | str, translator=DefaultTranslator, - metadata : dict | None = None, - directory : str | None = None, - overwrite : bool =False, + metadata: dict | None = None, + directory: str | None = None, + overwrite: bool = False, **kwargs, ): """ @@ -124,12 +124,12 @@ def build_esm( def load( self, - name : str, - description : str, - path : str, - driver : str ="esm_datastore", + name: str, + description: str, + path: str, + driver: str = "esm_datastore", translator=DefaultTranslator, - metadata : dict | None =None, + metadata: dict | None = None, **kwargs, ): """ diff --git a/src/access_nri_intake/catalog/translators.py b/src/access_nri_intake/catalog/translators.py index 18c4fb8c..8f8bad96 100644 --- a/src/access_nri_intake/catalog/translators.py +++ b/src/access_nri_intake/catalog/translators.py @@ -7,17 +7,16 @@ """ from __future__ import annotations + from functools import partial -from typing 
import Callable, TYPE_CHECKING -from intake import DataSource +from typing import Callable import pandas as pd import tlz +from intake import DataSource from . import COLUMNS_WITH_ITERABLES -if TYPE_CHECKING: - from intake import DataSource class TranslatorError(Exception): "Generic Exception for the Translator classes" @@ -30,7 +29,7 @@ class DefaultTranslator: of metadata for use in an intake-dataframe-catalog. """ - def __init__(self, source : DataSource, columns : list[str]): + def __init__(self, source: DataSource, columns: list[str]): """ Initialise a DefaultTranslator. This Translator works as follows: @@ -50,7 +49,7 @@ def __init__(self, source : DataSource, columns : list[str]): self.source = source self.columns = columns - self._dispatch : dict[str, Callable[[],pd.Series]] = { + self._dispatch: dict[str, Callable[[], pd.Series]] = { column: partial(self._default_translator, column=column) for column in columns } @@ -101,7 +100,7 @@ def _default_translator(self, column: str) -> pd.Series: return pd.Series([val] * len_df) - def translate(self, groupby : list[str] | None = None) -> pd.DataFrame: + def translate(self, groupby: list[str] | None = None) -> pd.DataFrame: """ Return the translated :py:class:`~pandas.DataFrame` of metadata and merge into set of set of rows with unique values of the columns specified. @@ -154,7 +153,7 @@ class Cmip6Translator(DefaultTranslator): CMIP6 Translator for translating metadata from the NCI CMIP6 intake datastores. """ - def __init__(self, source : DataSource, columns :list[str]): + def __init__(self, source: DataSource, columns: list[str]): """ Initialise a Cmip6Translator @@ -202,7 +201,7 @@ class Cmip5Translator(DefaultTranslator): CMIP5 Translator for translating metadata from the NCI CMIP5 intake datastores. 
""" - def __init__(self, source : DataSource, columns : list[str]): + def __init__(self, source: DataSource, columns: list[str]): """ Initialise a Cmip5Translator @@ -250,7 +249,7 @@ class EraiTranslator(DefaultTranslator): ERAI Translator for translating metadata from the NCI ERA-Interim intake datastore. """ - def __init__(self, source : DataSource, columns : list[str]): + def __init__(self, source: DataSource, columns: list[str]): """ Initialise a EraiTranslator @@ -272,7 +271,7 @@ def _variable_translator(self): return _to_tuple(self.source.df["variable"]) -def _cmip_frequency_translator(series : pd.Series) -> pd.Series: +def _cmip_frequency_translator(series: pd.Series) -> pd.Series: """ Return frequency from CMIP frequency metadata """ @@ -305,7 +304,7 @@ def _cmip_realm_translator(series) -> pd.Series: realms per cmip asset """ - def _translate(string : str) -> tuple[str, ...]: + def _translate(string: str) -> tuple[str, ...]: translations = { "na": "none", "landonly": "land", @@ -324,7 +323,7 @@ def _translate(string : str) -> tuple[str, ...]: return series.apply(lambda string: _translate(string)) -def _to_tuple(series : pd.Series) -> pd.Series: +def _to_tuple(series: pd.Series) -> pd.Series: """ Make each entry in the provided series a tuple diff --git a/src/access_nri_intake/source/builders.py b/src/access_nri_intake/source/builders.py index 28246f67..c43c9741 100644 --- a/src/access_nri_intake/source/builders.py +++ b/src/access_nri_intake/source/builders.py @@ -13,10 +13,10 @@ from ..utils import validate_against_schema from . 
import ESM_JSONSCHEMA, PATH_COLUMN, VARIABLE_COLUMN -from .utils import EmptyFileError, get_timeinfo +from .utils import AccessNCFileInfo, EmptyFileError, get_timeinfo # Frequency translations -FREQUENCIES : dict[str, tuple[int, str]] = { +FREQUENCIES: dict[str, tuple[int, str]] = { "daily": (1, "day"), "_dai$": (1, "day"), "month": (1, "mon"), @@ -47,19 +47,19 @@ class BaseBuilder(Builder): """ # Base class carries an empty set - PATTERNS : list = [] + PATTERNS: list = [] def __init__( self, - path : str | list[str], - depth : int = 0, - exclude_patterns : list[str] | None = None, - include_patterns : list[str] | None = None, - data_format : str ="netcdf", - groupby_attrs : list[str] | None = None, - aggregations : list[dict] | None = None, - storage_options : dict | None = None, - joblib_parallel_kwargs : dict ={"n_jobs": multiprocessing.cpu_count()}, + path: str | list[str], + depth: int = 0, + exclude_patterns: list[str] | None = None, + include_patterns: list[str] | None = None, + data_format: str = "netcdf", + groupby_attrs: list[str] | None = None, + aggregations: list[dict] | None = None, + storage_options: dict | None = None, + joblib_parallel_kwargs: dict = {"n_jobs": multiprocessing.cpu_count()}, ): """ This method should be overwritten. The expection is that some of these arguments @@ -113,7 +113,7 @@ def parse(self): self._parse() return self - def _save(self, name : str, description : str, directory : str | None): + def _save(self, name: str, description: str, directory: str | None): super().save( name=name, path_column_name=PATH_COLUMN, @@ -128,7 +128,7 @@ def _save(self, name : str, description : str, directory : str | None): to_csv_kwargs={"compression": "gzip"}, ) - def save(self, name : str, description : str , directory : str | None = None) -> None: + def save(self, name: str, description: str, directory: str | None = None) -> None: """ Save datastore contents to a file. 
@@ -210,8 +210,11 @@ def parser(file): @classmethod def parse_access_filename( - cls, filename : str, patterns : list[str] | None =None, - frequencies : dict = FREQUENCIES, redaction_fill: str = "X" + cls, + filename: str, + patterns: list[str] | None = None, + frequencies: dict = FREQUENCIES, + redaction_fill: str = "X", ) -> tuple[str, str | None, str | None]: """ Parse an ACCESS model filename and return a file id and any time information @@ -267,7 +270,7 @@ def parse_access_filename( return file_id, timestamp, frequency @classmethod - def parse_access_ncfile(cls, file : str , time_dim : str = "time") -> tuple: + def parse_access_ncfile(cls, file: str, time_dim: str = "time") -> AccessNCFileInfo: """ Get Intake-ESM datastore entry info from an ACCESS netcdf file @@ -280,14 +283,15 @@ def parse_access_ncfile(cls, file : str , time_dim : str = "time") -> tuple: Returns ------- - outputs: tuple + output_nc_info: AccessNCFileInfo + A dataclass containing the information parsed from the file Raises ------ EmptyFileError: If the file contains no variables """ - file_path = Path(file) + file_path = Path(file) filename = file_path.name file_id, filename_timestamp, filename_frequency = cls.parse_access_filename( @@ -306,23 +310,14 @@ def parse_access_ncfile(cls, file : str , time_dim : str = "time") -> tuple: variable_standard_name_list = [] variable_cell_methods_list = [] variable_units_list = [] - for var in ds.variables: + for var in ds.data_vars: attrs = ds[var].attrs if "long_name" in attrs: variable_list.append(var) variable_long_name_list.append(attrs["long_name"]) - if "standard_name" in attrs: - variable_standard_name_list.append(attrs["standard_name"]) - else: - variable_standard_name_list.append("") - if "cell_methods" in attrs: - variable_cell_methods_list.append(attrs["cell_methods"]) - else: - variable_cell_methods_list.append("") - if "units" in attrs: - variable_units_list.append(attrs["units"]) - else: - variable_units_list.append("") + 
variable_standard_name_list.append(attrs.get("standard_name", "")) + variable_cell_methods_list.append(attrs.get("cell_methods", "")) + variable_units_list.append(attrs.get("units", "")) start_date, end_date, frequency = get_timeinfo( ds, filename_frequency, time_dim @@ -331,21 +326,21 @@ def parse_access_ncfile(cls, file : str , time_dim : str = "time") -> tuple: if not variable_list: raise EmptyFileError("This file contains no variables") - outputs = ( - filename, - file_id, - filename_timestamp, - frequency, - start_date, - end_date, - variable_list, - variable_long_name_list, - variable_standard_name_list, - variable_cell_methods_list, - variable_units_list, + output_ncfile = AccessNCFileInfo( + filename=filename, + file_id=file_id, + filename_timestamp=filename_timestamp, + frequency=frequency, + start_date=start_date, + end_date=end_date, + variable=variable_list, # type: ignore + variable_long_name=variable_long_name_list, + variable_standard_name=variable_standard_name_list, + variable_cell_methods=variable_cell_methods_list, + variable_units=variable_units_list, ) - return outputs + return output_ncfile class AccessOm2Builder(BaseBuilder): @@ -390,44 +385,23 @@ def __init__(self, path): super().__init__(**kwargs) @classmethod - def parser(cls, file): + def parser(cls, file) -> dict: try: - match_groups = re.match(r".*/output\d+/([^/]*)/.*\.nc", file).groups() + # Need to check, but I think that the .groups() method that mypy is + # getting upset about is what the try/catch is for here - if the regex + # doesn't match, then it will throw an exception. 
+ match_groups = re.match(r".*/output\d+/([^/]*)/.*\.nc", file).groups() # type: ignore realm = match_groups[0] if realm == "ice": realm = "seaIce" - ( - filename, - file_id, - _, - frequency, - start_date, - end_date, - variable_list, - variable_long_name_list, - variable_standard_name_list, - variable_cell_methods_list, - variable_units_list, - ) = cls.parse_access_ncfile(file) - - info = { - "path": str(file), - "realm": realm, - "variable": variable_list, - "frequency": frequency, - "start_date": start_date, - "end_date": end_date, - "variable_long_name": variable_long_name_list, - "variable_standard_name": variable_standard_name_list, - "variable_cell_methods": variable_cell_methods_list, - "variable_units": variable_units_list, - "filename": filename, - "file_id": file_id, - } - - return info + nc_info = cls.parse_access_ncfile(file) + ncinfo_dict = nc_info.to_dict() + + ncinfo_dict["realm"] = realm + + return ncinfo_dict except Exception: return {INVALID_ASSET: file, TRACEBACK: traceback.format_exc()} @@ -478,47 +452,22 @@ def __init__(self, path): super().__init__(**kwargs) @classmethod - def parser(cls, file): + def parser(cls, file) -> dict: try: - ( - filename, - file_id, - _, - frequency, - start_date, - end_date, - variable_list, - variable_long_name_list, - variable_standard_name_list, - variable_cell_methods_list, - variable_units_list, - ) = cls.parse_access_ncfile(file) - - if "mom6" in filename: + output_nc_info = cls.parse_access_ncfile(file) + ncinfo_dict = output_nc_info.to_dict() + + if "mom6" in ncinfo_dict["filename"]: realm = "ocean" - elif "ww3" in filename: + elif "ww3" in ncinfo_dict["filename"]: realm = "wave" - elif "cice" in filename: + elif "cice" in ncinfo_dict["filename"]: realm = "seaIce" else: raise ParserError(f"Cannot determine realm for file {file}") + ncinfo_dict["realm"] = realm - info = { - "path": str(file), - "realm": realm, - "variable": variable_list, - "frequency": frequency, - "start_date": start_date, - "end_date": 
end_date, - "variable_long_name": variable_long_name_list, - "variable_standard_name": variable_standard_name_list, - "variable_cell_methods": variable_cell_methods_list, - "variable_units": variable_units_list, - "filename": filename, - "file_id": file_id, - } - - return info + return ncinfo_dict except Exception: return {INVALID_ASSET: file, TRACEBACK: traceback.format_exc()} @@ -582,42 +531,18 @@ def parser(cls, file): realm = match_groups[1] realm_mapping = {"atm": "atmos", "ocn": "ocean", "ice": "seaIce"} - realm = realm_mapping[realm] - - ( - filename, - file_id, - _, - frequency, - start_date, - end_date, - variable_list, - variable_long_name_list, - variable_standard_name_list, - variable_cell_methods_list, - variable_units_list, - ) = cls.parse_access_ncfile(file) + + nc_info = cls.parse_access_ncfile(file) + ncinfo_dict = nc_info.to_dict() # Remove exp_id from file id so that members can be part of the same dataset - file_id = re.sub(exp_id, "", file_id).strip("_") - - info = { - "path": str(file), - "realm": realm, - "variable": variable_list, - "frequency": frequency, - "start_date": start_date, - "end_date": end_date, - "member": exp_id, - "variable_long_name": variable_long_name_list, - "variable_standard_name": variable_standard_name_list, - "variable_cell_methods": variable_cell_methods_list, - "variable_units": variable_units_list, - "filename": filename, - "file_id": file_id, - } - - return info + ncinfo_dict["file_id"] = re.sub(exp_id, "", ncinfo_dict["file_id"]).strip( + "_" + ) + ncinfo_dict["realm"] = realm_mapping[realm] + ncinfo_dict["member"] = exp_id + + return ncinfo_dict except Exception: return {INVALID_ASSET: file, TRACEBACK: traceback.format_exc()} diff --git a/src/access_nri_intake/source/utils.py b/src/access_nri_intake/source/utils.py index 3d4b7728..f5c35240 100644 --- a/src/access_nri_intake/source/utils.py +++ b/src/access_nri_intake/source/utils.py @@ -4,8 +4,9 @@ """ Shared utilities for writing Intake-ESM builders and their 
parsers """ import warnings +from dataclasses import asdict, dataclass, field from datetime import timedelta -from typing import TYPE_CHECKING +from pathlib import Path import cftime import xarray as xr @@ -15,7 +16,58 @@ class EmptyFileError(Exception): pass -def _add_month_start(time, n : int): +@dataclass +class AccessNCFileInfo: + """ + Holds information about a NetCDF file that is used to create an intake-esm + catalog entry. + """ + + filename: str | Path + file_id: str + filename_timestamp: str | None + frequency: str + start_date: str + end_date: str + variable: list[str] + variable_long_name: list[str] + variable_standard_name: list[str] + variable_cell_methods: list[str] + variable_units: list[str] + path: str = field(init=False) + + def __post_init__(self): + self.path = str(self.filename) + + def to_dict(self) -> dict[str, str | list[str]]: + """ + Return a dictionary representation of the NcFileInfo object + """ + return asdict(self) + + def to_tuple( + self, + ) -> tuple[ + str, + str | None, + str, + str, + str, + list[str], + list[str], + list[str], + list[str], + list[str], + ]: + """ + Return a tuple representation of the NcFileInfo object. + + Returns an insanely long tuple: aiming to clean this up. 
+ """ + return tuple(asdict(self).values()) + + +def _add_month_start(time, n: int): """Add months to cftime datetime and truncate to start""" year = time.year + ((time.month + n - 1) // 12) month = (time.month + n - 1) % 12 + 1 @@ -24,7 +76,7 @@ def _add_month_start(time, n : int): ) -def _add_year_start(time, n : int): +def _add_year_start(time, n: int): """Add years to cftime datetime and truncate to start""" return time.replace( year=time.year + n, month=1, day=1, hour=0, minute=0, second=0, microsecond=0 @@ -62,10 +114,10 @@ def _guess_start_end_dates(ts, te, frequency): def get_timeinfo( - ds : xr.Dataset, - filename_frequency : str | None, - time_dim : str, - ) -> tuple[str, str, str]: + ds: xr.Dataset, + filename_frequency: str | None, + time_dim: str, +) -> tuple[str, str, str]: """ Get start date, end date and frequency of a xarray dataset. Stolen and adapted from the cosima cookbook, see @@ -101,7 +153,7 @@ def _todate(t): time_format = "%Y-%m-%d, %H:%M:%S" ts = None te = None - frequency : str | tuple[int | None, str] = "fx" + frequency: str | tuple[int | None, str] = "fx" has_time = time_dim in ds if has_time: @@ -134,7 +186,7 @@ def _todate(t): frequency = (years, "yr") elif dt.days >= 28: months = round(dt.days / 30) - frequency = (months, "mon") + frequency = (months, "mon") elif dt.days >= 1: frequency = (dt.days, "day") elif dt.seconds >= 3600: diff --git a/src/access_nri_intake/utils.py b/src/access_nri_intake/utils.py index dc0cf3f8..ceae573d 100644 --- a/src/access_nri_intake/utils.py +++ b/src/access_nri_intake/utils.py @@ -11,7 +11,7 @@ import yaml -def get_jsonschema(url : str , known_hash : str , required : list) -> tuple[dict, dict]: +def get_jsonschema(metadata_file: str, required: list) -> tuple[dict, dict]: """ Read in the required JSON schema, and annotate it with "required" fields. 
@@ -22,7 +22,7 @@ def get_jsonschema(url : str , known_hash : str , required : list) -> tuple[dict """ schema_file = rsr.files("access_nri_intake").joinpath(metadata_file) - with schema_file.open(mode="r") as fpath: + with schema_file.open(mode="r") as fpath: # type: ignore schema = json.load(fpath) schema_required = schema.copy() @@ -39,7 +39,8 @@ def get_jsonschema(url : str , known_hash : str , required : list) -> tuple[dict return schema, schema_required -def load_metadata_yaml(path : str, jsonschema : dict) -> dict: + +def load_metadata_yaml(path: str, jsonschema: dict) -> dict: """ Load a metadata.yaml file, leaving dates as strings, and validate against a jsonschema, allowing for tuples as arrays @@ -76,7 +77,7 @@ def remove_implicit_resolver(cls, tag_to_remove): return metadata -def validate_against_schema(instance : dict, schema : dict) -> None: +def validate_against_schema(instance: dict, schema: dict) -> None: """ Validate a dictionary against a jsonschema, allowing for tuples as arrays diff --git a/tests/test_builders.py b/tests/test_builders.py index 7b8dc5d3..d2283d61 100644 --- a/tests/test_builders.py +++ b/tests/test_builders.py @@ -8,6 +8,7 @@ import pytest from access_nri_intake.source import CORE_COLUMNS, builders +from access_nri_intake.source.utils import AccessNCFileInfo @pytest.mark.parametrize( @@ -364,7 +365,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm2Builder, "access-om2/output000/ocean/ocean_grid.nc", - ( + AccessNCFileInfo( "ocean_grid.nc", "ocean_grid", None, @@ -381,7 +382,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm2Builder, "access-om2/output000/ocean/ocean.nc", - ( + AccessNCFileInfo( "ocean.nc", "ocean", None, @@ -398,7 +399,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm2Builder, "access-om2/output000/ocean/ocean_month.nc", - ( + AccessNCFileInfo( "ocean_month.nc", "ocean_month", None, @@ -418,7 +419,7 @@ def 
test_parse_access_filename(builder, filename, expected): ( builders.AccessOm2Builder, "access-om2/output000/ocean/ocean_month_inst_nobounds.nc", - ( + AccessNCFileInfo( "ocean_month_inst_nobounds.nc", "ocean_month_inst_nobounds", None, @@ -435,7 +436,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm2Builder, "access-om2/output000/ice/OUTPUT/iceh.1900-01.nc", - ( + AccessNCFileInfo( "iceh.1900-01.nc", "iceh_XXXX_XX", "1900-01", @@ -464,7 +465,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessCm2Builder, "access-cm2/by578/history/atm/netCDF/by578a.pd201501_dai.nc", - ( + AccessNCFileInfo( "by578a.pd201501_dai.nc", "by578a_pdXXXXXX_dai", "201501", @@ -481,7 +482,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessCm2Builder, "access-cm2/by578/history/ice/iceh_d.2015-01.nc", - ( + AccessNCFileInfo( "iceh_d.2015-01.nc", "iceh_d_XXXX_XX", "2015-01", @@ -510,7 +511,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessCm2Builder, "access-cm2/by578/history/ocn/ocean_daily.nc-20150630", - ( + AccessNCFileInfo( "ocean_daily.nc-20150630", "ocean_daily", None, @@ -527,7 +528,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessCm2Builder, "access-cm2/by578/history/ocn/ocean_scalar.nc-20150630", - ( + AccessNCFileInfo( "ocean_scalar.nc-20150630", "ocean_scalar", None, @@ -544,7 +545,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessEsm15Builder, "access-esm1-5/history/atm/netCDF/HI-C-05-r1.pa-185001_mon.nc", - ( + AccessNCFileInfo( "HI-C-05-r1.pa-185001_mon.nc", "HI_C_05_r1_pa_XXXXXX_mon", "185001", @@ -561,7 +562,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessEsm15Builder, "access-esm1-5/history/ice/iceh.1850-01.nc", - ( + AccessNCFileInfo( "iceh.1850-01.nc", "iceh_XXXX_XX", "1850-01", @@ -590,7 +591,7 @@ def test_parse_access_filename(builder, 
filename, expected): ( builders.AccessEsm15Builder, "access-esm1-5/history/ocn/ocean_bgc_ann.nc-18501231", - ( + AccessNCFileInfo( "ocean_bgc_ann.nc-18501231", "ocean_bgc_ann", None, @@ -607,7 +608,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessEsm15Builder, "access-esm1-5/history/ocn/ocean_bgc.nc-18501231", - ( + AccessNCFileInfo( "ocean_bgc.nc-18501231", "ocean_bgc", None, @@ -624,7 +625,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.native_1900_01.nc", - ( + AccessNCFileInfo( "GMOM_JRA_WD.mom6.h.native_1900_01.nc", "GMOM_JRA_WD_mom6_h_native_XXXX_XX", "1900_01", @@ -653,7 +654,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.sfc_1900_01_02.nc", - ( + AccessNCFileInfo( "GMOM_JRA_WD.mom6.h.sfc_1900_01_02.nc", "GMOM_JRA_WD_mom6_h_sfc_XXXX_XX_XX", "1900_01_02", @@ -682,7 +683,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.static.nc", - ( + AccessNCFileInfo( "GMOM_JRA_WD.mom6.h.static.nc", "GMOM_JRA_WD_mom6_h_static", None, @@ -699,7 +700,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.z_1900_01.nc", - ( + AccessNCFileInfo( "GMOM_JRA_WD.mom6.h.z_1900_01.nc", "GMOM_JRA_WD_mom6_h_z_XXXX_XX", "1900_01", @@ -728,7 +729,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.cice.h.1900-01-01.nc", - ( + AccessNCFileInfo( "GMOM_JRA_WD.cice.h.1900-01-01.nc", "GMOM_JRA_WD_cice_h_XXXX_XX_XX", "1900-01-01", @@ -757,7 +758,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.ww3.hi.1900-01-02-00000.nc", - ( + AccessNCFileInfo( 
"GMOM_JRA_WD.ww3.hi.1900-01-02-00000.nc", "GMOM_JRA_WD_ww3_hi_XXXX_XX_XX_XXXXX", "1900-01-02-00000", From 1c5378e2034486ef45902d483ec0b1cee52c8973 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Wed, 25 Sep 2024 15:44:01 +0800 Subject: [PATCH 07/23] - Updated the builders to use separate coordinate & variables dataclasses that are fed into a dataclass holding all the coordinate and data variables from a netCDF file. - Updated tests so that they are all passing: tests now expect to find coordinate variables from the netCDF files as well as data variables. - Some minor changes to make code more readable - changed long tuples to dataclasses & dictionaries where possible. --- src/access_nri_intake/source/builders.py | 42 +- src/access_nri_intake/source/utils.py | 110 +++- src/access_nri_intake/utils.py | 2 +- tests/test_builders.py | 723 ++++++++++++++++------- 4 files changed, 609 insertions(+), 268 deletions(-) diff --git a/src/access_nri_intake/source/builders.py b/src/access_nri_intake/source/builders.py index c43c9741..ed41a098 100644 --- a/src/access_nri_intake/source/builders.py +++ b/src/access_nri_intake/source/builders.py @@ -13,7 +13,13 @@ from ..utils import validate_against_schema from . 
import ESM_JSONSCHEMA, PATH_COLUMN, VARIABLE_COLUMN -from .utils import AccessNCFileInfo, EmptyFileError, get_timeinfo +from .utils import ( + EmptyFileError, + _AccessNCFileInfo, + _CoordVarInfo, + _DataVarInfo, + get_timeinfo, +) # Frequency translations FREQUENCIES: dict[str, tuple[int, str]] = { @@ -270,7 +276,9 @@ def parse_access_filename( return file_id, timestamp, frequency @classmethod - def parse_access_ncfile(cls, file: str, time_dim: str = "time") -> AccessNCFileInfo: + def parse_access_ncfile( + cls, file: str, time_dim: str = "time" + ) -> _AccessNCFileInfo: """ Get Intake-ESM datastore entry info from an ACCESS netcdf file @@ -305,39 +313,33 @@ def parse_access_ncfile(cls, file: str, time_dim: str = "time") -> AccessNCFileI decode_times=False, decode_coords=False, ) as ds: - variable_list = [] - variable_long_name_list = [] - variable_standard_name_list = [] - variable_cell_methods_list = [] - variable_units_list = [] + dvars = _DataVarInfo() + cvars = _CoordVarInfo() + for var in ds.data_vars: attrs = ds[var].attrs - if "long_name" in attrs: - variable_list.append(var) - variable_long_name_list.append(attrs["long_name"]) - variable_standard_name_list.append(attrs.get("standard_name", "")) - variable_cell_methods_list.append(attrs.get("cell_methods", "")) - variable_units_list.append(attrs.get("units", "")) + dvars.append_attrs(var, attrs) # type: ignore + + for var in ds.coords: + attrs = ds[var].attrs + cvars.append_attrs(var, attrs) # type: ignore start_date, end_date, frequency = get_timeinfo( ds, filename_frequency, time_dim ) - if not variable_list: + if not dvars.variable_list: raise EmptyFileError("This file contains no variables") - output_ncfile = AccessNCFileInfo( + output_ncfile = _AccessNCFileInfo( filename=filename, file_id=file_id, filename_timestamp=filename_timestamp, frequency=frequency, start_date=start_date, end_date=end_date, - variable=variable_list, # type: ignore - variable_long_name=variable_long_name_list, - 
variable_standard_name=variable_standard_name_list, - variable_cell_methods=variable_cell_methods_list, - variable_units=variable_units_list, + **dvars.to_ncinfo_dict(), + **cvars.to_ncinfo_dict(), ) return output_ncfile diff --git a/src/access_nri_intake/source/utils.py b/src/access_nri_intake/source/utils.py index f5c35240..816bb3b4 100644 --- a/src/access_nri_intake/source/utils.py +++ b/src/access_nri_intake/source/utils.py @@ -17,7 +17,7 @@ class EmptyFileError(Exception): @dataclass -class AccessNCFileInfo: +class _AccessNCFileInfo: """ Holds information about a NetCDF file that is used to create an intake-esm catalog entry. @@ -34,6 +34,12 @@ class AccessNCFileInfo: variable_standard_name: list[str] variable_cell_methods: list[str] variable_units: list[str] + coords: list[str] + coord_long_name: list[str] + coord_cartesian_axes: list[str] + coord_calendar_types: list[str] + coord_bounds: list[str] + coord_units: list[str] path: str = field(init=False) def __post_init__(self): @@ -45,26 +51,94 @@ def to_dict(self) -> dict[str, str | list[str]]: """ return asdict(self) - def to_tuple( - self, - ) -> tuple[ - str, - str | None, - str, - str, - str, - list[str], - list[str], - list[str], - list[str], - list[str], - ]: + +@dataclass +class _DataVarInfo: + """ + Holds information about the data variables in a NetCDF file that is used to + create an intake-esm catalog entry. + """ + + variable_list: list[str] = field(default_factory=list) + long_name_list: list[str] = field(default_factory=list) + standard_name_list: list[str] = field(default_factory=list) + cell_methods_list: list[str] = field(default_factory=list) + units_list: list[str] = field(default_factory=list) + + def append_attrs(self, var: str, attrs: dict) -> None: + """ + Append attributes to the DataVarInfo object, if the attribute has a + 'long_name' key. + + TODO: Why do we need a long name key? 
seems important + """ + if "long_name" not in attrs: + return None + + self.variable_list.append(var) + self.long_name_list.append(attrs["long_name"]) + self.standard_name_list.append(attrs.get("standard_name", "")) + self.cell_methods_list.append(attrs.get("cell_methods", "")) + self.units_list.append(attrs.get("units", "")) + + def to_ncinfo_dict(self) -> dict[str, list[str]]: + """ + Return a dictionary representation of the DataVarInfo object. Fields are + defined explicitly for use in the _AccessNCFileInfo constructor. """ - Return a tuple representation of the NcFileInfo object. + return { + "variable": self.variable_list, + "variable_long_name": self.long_name_list, + "variable_standard_name": self.standard_name_list, + "variable_cell_methods": self.cell_methods_list, + "variable_units": self.units_list, + } - Returns an insanely long tuple: aiming to clean this up. + +@dataclass +class _CoordVarInfo: + """ + Holds information about the coordinate variables in a NetCDF file that is + used to create an intake-esm catalog entry. + """ + + coord_list: list[str] = field(default_factory=list) + long_name_list: list[str] = field(default_factory=list) + cartesian_axis_list: list[str] = field(default_factory=list) + calendar_type_list: list[str] = field(default_factory=list) + bounds_list: list[str] = field(default_factory=list) + units_list: list[str] = field(default_factory=list) + + def append_attrs(self, var: str, attrs: dict) -> None: + """ + Append attributes to the CoordVarInfo object, if the attribute has a + 'long_name' key. + + TODO: Why do we need a long name key? 
seems important + """ + if "long_name" not in attrs: + return None + + self.coord_list.append(var) + self.long_name_list.append(attrs["long_name"]) + self.cartesian_axis_list.append(attrs.get("cartesian_axis", "")) + self.calendar_type_list.append(attrs.get("calendar_type", "")) + self.bounds_list.append(attrs.get("bounds", "")) + self.units_list.append(attrs.get("units", "")) + + def to_ncinfo_dict(self) -> dict[str, list[str]]: + """ + Return a dictionary representation of the CoordVarInfo object. Fields are + defined explicitly for use in the _AccessNCFileInfo constructor. """ - return tuple(asdict(self).values()) + return { + "coords": self.coord_list, + "coord_long_name": self.long_name_list, + "coord_cartesian_axes": self.cartesian_axis_list, + "coord_calendar_types": self.calendar_type_list, + "coord_bounds": self.bounds_list, + "coord_units": self.units_list, + } def _add_month_start(time, n: int): diff --git a/src/access_nri_intake/utils.py b/src/access_nri_intake/utils.py index ceae573d..aed413dc 100644 --- a/src/access_nri_intake/utils.py +++ b/src/access_nri_intake/utils.py @@ -22,7 +22,7 @@ def get_jsonschema(metadata_file: str, required: list) -> tuple[dict, dict]: """ schema_file = rsr.files("access_nri_intake").joinpath(metadata_file) - with schema_file.open(mode="r") as fpath: # type: ignore + with schema_file.open(mode="r") as fpath: # type: ignore schema = json.load(fpath) schema_required = schema.copy() diff --git a/tests/test_builders.py b/tests/test_builders.py index d2283d61..2cf523eb 100644 --- a/tests/test_builders.py +++ b/tests/test_builders.py @@ -8,7 +8,7 @@ import pytest from access_nri_intake.source import CORE_COLUMNS, builders -from access_nri_intake.source.utils import AccessNCFileInfo +from access_nri_intake.source.utils import _AccessNCFileInfo @pytest.mark.parametrize( @@ -365,411 +365,676 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm2Builder, "access-om2/output000/ocean/ocean_grid.nc", - 
AccessNCFileInfo( - "ocean_grid.nc", - "ocean_grid", - None, - "fx", - "none", - "none", - ["geolat_t", "geolon_t"], - ["tracer latitude", "tracer longitude"], - ["", ""], - ["time: point", "time: point"], - ["degrees_N", "degrees_E"], + _AccessNCFileInfo( + filename="ocean_grid.nc", + file_id="ocean_grid", + filename_timestamp=None, + frequency="fx", + start_date="none", + end_date="none", + variable=["geolat_t", "geolon_t"], + variable_long_name=["tracer latitude", "tracer longitude"], + variable_standard_name=["", ""], + variable_cell_methods=["time: point", "time: point"], + variable_units=["degrees_N", "degrees_E"], + coord_long_name=["tcell longitude", "tcell latitude"], + coords=["xt_ocean", "yt_ocean"], + coord_cartesian_axes=["X", "Y"], + coord_calendar_types=["", ""], + coord_bounds=["", ""], + coord_units=["degrees_E", "degrees_N"], ), ), ( builders.AccessOm2Builder, "access-om2/output000/ocean/ocean.nc", - AccessNCFileInfo( - "ocean.nc", - "ocean", - None, - "1yr", - "1900-01-01, 00:00:00", - "1910-01-01, 00:00:00", - ["temp", "time_bounds"], - ["Conservative temperature", "time axis boundaries"], - ["sea_water_conservative_temperature", ""], - ["time: mean", ""], - ["K", "days"], + _AccessNCFileInfo( + filename="ocean.nc", + file_id="ocean", + filename_timestamp=None, + frequency="1yr", + start_date="1900-01-01, 00:00:00", + end_date="1910-01-01, 00:00:00", + variable=["temp", "time_bounds"], + variable_long_name=["Conservative temperature", "time axis boundaries"], + variable_standard_name=["sea_water_conservative_temperature", ""], + variable_cell_methods=["time: mean", ""], + variable_units=["K", "days"], + coords=["nv", "st_ocean", "time", "xt_ocean", "yt_ocean"], + coord_long_name=[ + "vertex number", + "tcell zstar depth", + "time", + "tcell longitude", + "tcell latitude", + ], + coord_cartesian_axes=["N", "Z", "T", "X", "Y"], + coord_calendar_types=["", "", "NOLEAP", "", ""], + coord_bounds=["", "", "time_bounds", "", ""], + coord_units=[ + 
"none", + "meters", + "days since 1900-01-01 00:00:00", + "degrees_E", + "degrees_N", + ], ), ), ( builders.AccessOm2Builder, "access-om2/output000/ocean/ocean_month.nc", - AccessNCFileInfo( - "ocean_month.nc", - "ocean_month", - None, - "1mon", - "1900-01-01, 00:00:00", - "1910-01-01, 00:00:00", - ["mld", "time_bounds"], - [ + _AccessNCFileInfo( + filename="ocean_month.nc", + file_id="ocean_month", + filename_timestamp=None, + frequency="1mon", + start_date="1900-01-01, 00:00:00", + end_date="1910-01-01, 00:00:00", + variable=["mld", "time_bounds"], + variable_long_name=[ "mixed layer depth determined by density criteria", "time axis boundaries", ], - ["ocean_mixed_layer_thickness_defined_by_sigma_t", ""], - ["time: mean", ""], - ["m", "days"], + variable_standard_name=[ + "ocean_mixed_layer_thickness_defined_by_sigma_t", + "", + ], + variable_cell_methods=["time: mean", ""], + variable_units=["m", "days"], + coords=["nv", "time", "xt_ocean", "yt_ocean"], + coord_long_name=[ + "vertex number", + "time", + "tcell longitude", + "tcell latitude", + ], + coord_cartesian_axes=["N", "T", "X", "Y"], + coord_calendar_types=["", "NOLEAP", "", ""], + coord_bounds=["", "time_bounds", "", ""], + coord_units=[ + "none", + "days since 1900-01-01 00:00:00", + "degrees_E", + "degrees_N", + ], ), ), ( builders.AccessOm2Builder, "access-om2/output000/ocean/ocean_month_inst_nobounds.nc", - AccessNCFileInfo( - "ocean_month_inst_nobounds.nc", - "ocean_month_inst_nobounds", - None, - "1mon", - "1900-01-01, 00:00:00", - "1900-02-01, 00:00:00", - ["mld"], - ["mixed layer depth determined by density criteria"], - ["ocean_mixed_layer_thickness_defined_by_sigma_t"], - ["time: mean"], - ["m"], + _AccessNCFileInfo( + filename="ocean_month_inst_nobounds.nc", + file_id="ocean_month_inst_nobounds", + filename_timestamp=None, + frequency="1mon", + start_date="1900-01-01, 00:00:00", + end_date="1900-02-01, 00:00:00", + variable=["mld"], + variable_long_name=["mixed layer depth determined by 
density criteria"], + variable_standard_name=[ + "ocean_mixed_layer_thickness_defined_by_sigma_t" + ], + variable_cell_methods=["time: mean"], + variable_units=["m"], + coords=["time", "xt_ocean", "yt_ocean"], + coord_long_name=["time", "tcell longitude", "tcell latitude"], + coord_cartesian_axes=["T", "X", "Y"], + coord_calendar_types=["NOLEAP", "", ""], + coord_bounds=["time_bounds", "", ""], + coord_units=[ + "days since 1900-01-01 00:00:00", + "degrees_E", + "degrees_N", + ], ), ), ( builders.AccessOm2Builder, "access-om2/output000/ice/OUTPUT/iceh.1900-01.nc", - AccessNCFileInfo( - "iceh.1900-01.nc", - "iceh_XXXX_XX", - "1900-01", - "1mon", - "1900-01-01, 00:00:00", - "1900-02-01, 00:00:00", - ["TLAT", "TLON", "aice_m", "tarea", "time_bounds"], - [ + _AccessNCFileInfo( + filename="iceh.1900-01.nc", + file_id="iceh_XXXX_XX", + filename_timestamp="1900-01", + frequency="1mon", + start_date="1900-01-01, 00:00:00", + end_date="1900-02-01, 00:00:00", + variable=["TLAT", "TLON", "aice_m", "tarea", "time_bounds"], + variable_long_name=[ "T grid center latitude", "T grid center longitude", "ice area (aggregate)", "area of T grid cells", "boundaries for time-averaging interval", ], - ["", "", "", "", ""], - ["", "", "time: mean", "", ""], - [ + variable_standard_name=["", "", "", "", ""], + variable_cell_methods=["", "", "time: mean", "", ""], + variable_units=[ "degrees_north", "degrees_east", "1", "m^2", "days since 1900-01-01 00:00:00", ], + coords=["time"], + coord_long_name=["model time"], + coord_cartesian_axes=[""], + coord_calendar_types=[""], + coord_bounds=["time_bounds"], + coord_units=["days since 1900-01-01 00:00:00"], ), ), ( builders.AccessCm2Builder, "access-cm2/by578/history/atm/netCDF/by578a.pd201501_dai.nc", - AccessNCFileInfo( - "by578a.pd201501_dai.nc", - "by578a_pdXXXXXX_dai", - "201501", - "1day", - "2015-01-01, 00:00:00", - "2015-02-01, 00:00:00", - ["fld_s03i236"], - ["TEMPERATURE AT 1.5M"], - ["air_temperature"], - ["time: mean"], - ["K"], + 
_AccessNCFileInfo( + filename="by578a.pd201501_dai.nc", + file_id="by578a_pdXXXXXX_dai", + filename_timestamp="201501", + frequency="1day", + start_date="2015-01-01, 00:00:00", + end_date="2015-02-01, 00:00:00", + variable=["fld_s03i236"], + variable_long_name=["TEMPERATURE AT 1.5M"], + variable_standard_name=["air_temperature"], + variable_cell_methods=["time: mean"], + variable_units=["K"], + coords=[], + coord_long_name=[], + coord_cartesian_axes=[], + coord_calendar_types=[], + coord_bounds=[], + coord_units=[], ), ), ( builders.AccessCm2Builder, "access-cm2/by578/history/ice/iceh_d.2015-01.nc", - AccessNCFileInfo( - "iceh_d.2015-01.nc", - "iceh_d_XXXX_XX", - "2015-01", - "1day", - "2015-01-01, 00:00:00", - "2015-02-01, 00:00:00", - ["TLAT", "TLON", "aice", "tarea", "time_bounds"], - [ + _AccessNCFileInfo( + filename="iceh_d.2015-01.nc", + file_id="iceh_d_XXXX_XX", + filename_timestamp="2015-01", + frequency="1day", + start_date="2015-01-01, 00:00:00", + end_date="2015-02-01, 00:00:00", + variable=["TLAT", "TLON", "aice", "tarea", "time_bounds"], + variable_long_name=[ "T grid center latitude", "T grid center longitude", "ice area (aggregate)", "area of T grid cells", "boundaries for time-averaging interval", ], - ["", "", "", "", ""], - ["", "", "time: mean", "", ""], - [ + variable_standard_name=["", "", "", "", ""], + variable_cell_methods=["", "", "time: mean", "", ""], + variable_units=[ "degrees_north", "degrees_east", "1", "m^2", "days since 1850-01-01 00:00:00", ], + coords=["time"], + coord_long_name=["model time"], + coord_cartesian_axes=[""], + coord_calendar_types=[""], + coord_bounds=["time_bounds"], + coord_units=["days since 1850-01-01 00:00:00"], ), ), ( builders.AccessCm2Builder, "access-cm2/by578/history/ocn/ocean_daily.nc-20150630", - AccessNCFileInfo( - "ocean_daily.nc-20150630", - "ocean_daily", - None, - "1day", - "2015-01-01, 00:00:00", - "2015-07-01, 00:00:00", - ["sst", "time_bounds"], - ["Potential temperature", "time axis 
boundaries"], - ["sea_surface_temperature", ""], - ["time: mean", ""], - ["K", "days"], + _AccessNCFileInfo( + filename="ocean_daily.nc-20150630", + file_id="ocean_daily", + filename_timestamp=None, + frequency="1day", + start_date="2015-01-01, 00:00:00", + end_date="2015-07-01, 00:00:00", + variable=["sst", "time_bounds"], + variable_long_name=["Potential temperature", "time axis boundaries"], + variable_standard_name=["sea_surface_temperature", ""], + variable_cell_methods=["time: mean", ""], + variable_units=["K", "days"], + coords=["nv", "time", "xt_ocean", "yt_ocean"], + coord_long_name=[ + "vertex number", + "time", + "tcell longitude", + "tcell latitude", + ], + coord_cartesian_axes=["N", "T", "X", "Y"], + coord_calendar_types=["", "GREGORIAN", "", ""], + coord_bounds=["", "time_bounds", "", ""], + coord_units=[ + "none", + "days since 1850-01-01 00:00:00", + "degrees_E", + "degrees_N", + ], ), ), ( builders.AccessCm2Builder, "access-cm2/by578/history/ocn/ocean_scalar.nc-20150630", - AccessNCFileInfo( - "ocean_scalar.nc-20150630", - "ocean_scalar", - None, - "1mon", - "2015-01-01, 00:00:00", - "2015-07-01, 00:00:00", - ["temp_global_ave", "time_bounds"], - ["Global mean temp in liquid seawater", "time axis boundaries"], - ["sea_water_potential_temperature", ""], - ["time: mean", ""], - ["deg_C", "days"], + _AccessNCFileInfo( + filename="ocean_scalar.nc-20150630", + file_id="ocean_scalar", + filename_timestamp=None, + frequency="1mon", + start_date="2015-01-01, 00:00:00", + end_date="2015-07-01, 00:00:00", + variable=["temp_global_ave", "time_bounds"], + variable_long_name=[ + "Global mean temp in liquid seawater", + "time axis boundaries", + ], + variable_standard_name=["sea_water_potential_temperature", ""], + variable_cell_methods=["time: mean", ""], + variable_units=["deg_C", "days"], + coords=["nv", "scalar_axis", "time"], + coord_long_name=["vertex number", "none", "time"], + coord_cartesian_axes=["N", "X", "T"], + coord_calendar_types=["", "", 
"GREGORIAN"], + coord_bounds=["", "", "time_bounds"], + coord_units=["none", "none", "days since 1850-01-01 00:00:00"], ), ), ( builders.AccessEsm15Builder, "access-esm1-5/history/atm/netCDF/HI-C-05-r1.pa-185001_mon.nc", - AccessNCFileInfo( - "HI-C-05-r1.pa-185001_mon.nc", - "HI_C_05_r1_pa_XXXXXX_mon", - "185001", - "1mon", - "1850-01-01, 00:00:00", - "1850-02-01, 00:00:00", - ["fld_s03i236"], - ["TEMPERATURE AT 1.5M"], - ["air_temperature"], - ["time: mean"], - ["K"], + _AccessNCFileInfo( + filename="HI-C-05-r1.pa-185001_mon.nc", + file_id="HI_C_05_r1_pa_XXXXXX_mon", + filename_timestamp="185001", + frequency="1mon", + start_date="1850-01-01, 00:00:00", + end_date="1850-02-01, 00:00:00", + variable=["fld_s03i236"], + variable_long_name=["TEMPERATURE AT 1.5M"], + variable_standard_name=["air_temperature"], + variable_cell_methods=["time: mean"], + variable_units=["K"], + coords=[], + coord_long_name=[], + coord_cartesian_axes=[], + coord_calendar_types=[], + coord_bounds=[], + coord_units=[], ), ), ( builders.AccessEsm15Builder, "access-esm1-5/history/ice/iceh.1850-01.nc", - AccessNCFileInfo( - "iceh.1850-01.nc", - "iceh_XXXX_XX", - "1850-01", - "1mon", - "1850-01-01, 00:00:00", - "1850-02-01, 00:00:00", - ["TLAT", "TLON", "aice", "tarea", "time_bounds"], - [ + _AccessNCFileInfo( + filename="iceh.1850-01.nc", + file_id="iceh_XXXX_XX", + filename_timestamp="1850-01", + frequency="1mon", + start_date="1850-01-01, 00:00:00", + end_date="1850-02-01, 00:00:00", + variable=["TLAT", "TLON", "aice", "tarea", "time_bounds"], + variable_long_name=[ "T grid center latitude", "T grid center longitude", "ice area (aggregate)", "area of T grid cells", "boundaries for time-averaging interval", ], - ["", "", "", "", ""], - ["", "", "time: mean", "", ""], - [ + variable_standard_name=["", "", "", "", ""], + variable_cell_methods=["", "", "time: mean", "", ""], + variable_units=[ "degrees_north", "degrees_east", "1", "m^2", "days since 0001-01-01 00:00:00", ], + coords=["time"], + 
coord_long_name=["model time"], + coord_cartesian_axes=[""], + coord_calendar_types=[""], + coord_bounds=["time_bounds"], + coord_units=["days since 0001-01-01 00:00:00"], ), ), ( builders.AccessEsm15Builder, "access-esm1-5/history/ocn/ocean_bgc_ann.nc-18501231", - AccessNCFileInfo( - "ocean_bgc_ann.nc-18501231", - "ocean_bgc_ann", - None, - "1yr", - "1849-12-30, 00:00:00", - "1850-12-30, 00:00:00", - ["fgco2_raw", "time_bounds"], - ["Flux into ocean - DIC, inc. anth.", "time axis boundaries"], - ["", ""], - ["time: mean", ""], - ["mmol/m^2/s", "days"], + _AccessNCFileInfo( + filename="ocean_bgc_ann.nc-18501231", + file_id="ocean_bgc_ann", + filename_timestamp=None, + frequency="1yr", + start_date="1849-12-30, 00:00:00", + end_date="1850-12-30, 00:00:00", + variable=["fgco2_raw", "time_bounds"], + variable_long_name=[ + "Flux into ocean - DIC, inc. anth.", + "time axis boundaries", + ], + variable_standard_name=["", ""], + variable_cell_methods=["time: mean", ""], + variable_units=["mmol/m^2/s", "days"], + coords=["nv", "time", "xt_ocean", "yt_ocean"], + coord_long_name=[ + "vertex number", + "time", + "tcell longitude", + "tcell latitude", + ], + coord_cartesian_axes=["N", "T", "X", "Y"], + coord_calendar_types=["", "GREGORIAN", "", ""], + coord_bounds=["", "time_bounds", "", ""], + coord_units=[ + "none", + "days since 0001-01-01 00:00:00", + "degrees_E", + "degrees_N", + ], ), ), ( builders.AccessEsm15Builder, "access-esm1-5/history/ocn/ocean_bgc.nc-18501231", - AccessNCFileInfo( - "ocean_bgc.nc-18501231", - "ocean_bgc", - None, - "1mon", - "1849-12-30, 00:00:00", - "1850-12-30, 00:00:00", - ["o2", "time_bounds"], - ["o2", "time axis boundaries"], - ["", ""], - ["time: mean", ""], - ["mmol/m^3", "days"], + _AccessNCFileInfo( + filename="ocean_bgc.nc-18501231", + file_id="ocean_bgc", + filename_timestamp=None, + frequency="1mon", + start_date="1849-12-30, 00:00:00", + end_date="1850-12-30, 00:00:00", + variable=["o2", "time_bounds"], + variable_long_name=["o2", 
"time axis boundaries"], + variable_standard_name=["", ""], + variable_cell_methods=["time: mean", ""], + variable_units=["mmol/m^3", "days"], + coords=["nv", "st_ocean", "time", "xt_ocean", "yt_ocean"], + coord_long_name=[ + "vertex number", + "tcell zstar depth", + "time", + "tcell longitude", + "tcell latitude", + ], + coord_cartesian_axes=["N", "Z", "T", "X", "Y"], + coord_calendar_types=["", "", "GREGORIAN", "", ""], + coord_bounds=["", "", "time_bounds", "", ""], + coord_units=[ + "none", + "meters", + "days since 0001-01-01 00:00:00", + "degrees_E", + "degrees_N", + ], ), ), ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.native_1900_01.nc", - AccessNCFileInfo( - "GMOM_JRA_WD.mom6.h.native_1900_01.nc", - "GMOM_JRA_WD_mom6_h_native_XXXX_XX", - "1900_01", - "1mon", - "1900-01-01, 00:00:00", - "1900-02-01, 00:00:00", - ["average_DT", "average_T1", "average_T2", "thetao", "time_bnds"], - [ + _AccessNCFileInfo( + filename="GMOM_JRA_WD.mom6.h.native_1900_01.nc", + file_id="GMOM_JRA_WD_mom6_h_native_XXXX_XX", + filename_timestamp="1900_01", + frequency="1mon", + start_date="1900-01-01, 00:00:00", + end_date="1900-02-01, 00:00:00", + variable=[ + "average_DT", + "average_T1", + "average_T2", + "thetao", + "time_bnds", + ], + variable_long_name=[ "Length of average period", "Start time for average period", "End time for average period", "Sea Water Potential Temperature", "time axis boundaries", ], - ["", "", "", "sea_water_potential_temperature", ""], - ["", "", "", "area:mean zl:mean yh:mean xh:mean time: mean", ""], - [ + variable_standard_name=[ + "", + "", + "", + "sea_water_potential_temperature", + "", + ], + variable_cell_methods=[ + "", + "", + "", + "area:mean zl:mean yh:mean xh:mean time: mean", + "", + ], + variable_units=[ "days", "days since 0001-01-01 00:00:00", "days since 0001-01-01 00:00:00", "degC", "days since 0001-01-01 00:00:00", ], + coords=["nv", "time", "xh", "yh", "zl"], + coord_long_name=[ + "vertex number", + "time", + 
"h point nominal longitude", + "h point nominal latitude", + "Layer pseudo-depth, -z*", + ], + coord_cartesian_axes=["", "", "", "", ""], + coord_calendar_types=["", "NOLEAP", "", "", ""], + coord_bounds=["", "time_bnds", "", "", ""], + coord_units=[ + "", + "days since 0001-01-01 00:00:00", + "degrees_east", + "degrees_north", + "meter", + ], ), ), ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.sfc_1900_01_02.nc", - AccessNCFileInfo( - "GMOM_JRA_WD.mom6.h.sfc_1900_01_02.nc", - "GMOM_JRA_WD_mom6_h_sfc_XXXX_XX_XX", - "1900_01_02", - "1day", - "1900-01-01, 00:00:00", - "1900-01-02, 00:00:00", - ["average_DT", "average_T1", "average_T2", "time_bnds", "tos"], - [ + _AccessNCFileInfo( + filename="GMOM_JRA_WD.mom6.h.sfc_1900_01_02.nc", + file_id="GMOM_JRA_WD_mom6_h_sfc_XXXX_XX_XX", + filename_timestamp="1900_01_02", + frequency="1day", + start_date="1900-01-01, 00:00:00", + end_date="1900-01-02, 00:00:00", + variable=["average_DT", "average_T1", "average_T2", "time_bnds", "tos"], + variable_long_name=[ "Length of average period", "Start time for average period", "End time for average period", "time axis boundaries", "Sea Surface Temperature", ], - ["", "", "", "", "sea_surface_temperature"], - ["", "", "", "", "area:mean yh:mean xh:mean time: mean"], - [ + variable_standard_name=["", "", "", "", "sea_surface_temperature"], + variable_cell_methods=[ + "", + "", + "", + "", + "area:mean yh:mean xh:mean time: mean", + ], + variable_units=[ "days", "days since 0001-01-01 00:00:00", "days since 0001-01-01 00:00:00", "days since 0001-01-01 00:00:00", "degC", ], + coords=["nv", "time", "xh", "yh"], + coord_long_name=[ + "vertex number", + "time", + "h point nominal longitude", + "h point nominal latitude", + ], + coord_cartesian_axes=["", "", "", ""], + coord_calendar_types=["", "NOLEAP", "", ""], + coord_bounds=["", "time_bnds", "", ""], + coord_units=[ + "", + "days since 0001-01-01 00:00:00", + "degrees_east", + "degrees_north", + ], ), ), ( 
builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.static.nc", - AccessNCFileInfo( - "GMOM_JRA_WD.mom6.h.static.nc", - "GMOM_JRA_WD_mom6_h_static", - None, - "fx", - "none", - "none", - ["geolat", "geolon"], - ["Latitude of tracer (T) points", "Longitude of tracer (T) points"], - ["", ""], - ["time: point", "time: point"], - ["degrees_north", "degrees_east"], + _AccessNCFileInfo( + filename="GMOM_JRA_WD.mom6.h.static.nc", + file_id="GMOM_JRA_WD_mom6_h_static", + filename_timestamp=None, + frequency="fx", + start_date="none", + end_date="none", + variable=["geolat", "geolon"], + variable_long_name=[ + "Latitude of tracer (T) points", + "Longitude of tracer (T) points", + ], + variable_standard_name=["", ""], + variable_cell_methods=["time: point", "time: point"], + variable_units=["degrees_north", "degrees_east"], + coords=["xh", "yh"], + coord_long_name=[ + "h point nominal longitude", + "h point nominal latitude", + ], + coord_cartesian_axes=["", ""], + coord_calendar_types=["", ""], + coord_bounds=["", ""], + coord_units=["degrees_east", "degrees_north"], ), ), ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.z_1900_01.nc", - AccessNCFileInfo( - "GMOM_JRA_WD.mom6.h.z_1900_01.nc", - "GMOM_JRA_WD_mom6_h_z_XXXX_XX", - "1900_01", - "1mon", - "1900-01-01, 00:00:00", - "1900-02-01, 00:00:00", - ["average_DT", "average_T1", "average_T2", "thetao", "time_bnds"], - [ + _AccessNCFileInfo( + filename="GMOM_JRA_WD.mom6.h.z_1900_01.nc", + file_id="GMOM_JRA_WD_mom6_h_z_XXXX_XX", + filename_timestamp="1900_01", + frequency="1mon", + start_date="1900-01-01, 00:00:00", + end_date="1900-02-01, 00:00:00", + variable=[ + "average_DT", + "average_T1", + "average_T2", + "thetao", + "time_bnds", + ], + variable_long_name=[ "Length of average period", "Start time for average period", "End time for average period", "Sea Water Potential Temperature", "time axis boundaries", ], - ["", "", "", "sea_water_potential_temperature", ""], - ["", "", "", 
"area:mean z_l:mean yh:mean xh:mean time: mean", ""], - [ + variable_standard_name=[ + "", + "", + "", + "sea_water_potential_temperature", + "", + ], + variable_cell_methods=[ + "", + "", + "", + "area:mean z_l:mean yh:mean xh:mean time: mean", + "", + ], + variable_units=[ "days", "days since 0001-01-01 00:00:00", "days since 0001-01-01 00:00:00", "degC", "days since 0001-01-01 00:00:00", ], + coords=["nv", "time", "xh", "yh", "z_l"], + coord_long_name=[ + "vertex number", + "time", + "h point nominal longitude", + "h point nominal latitude", + "Depth at cell center", + ], + coord_cartesian_axes=["", "", "", "", ""], + coord_calendar_types=["", "NOLEAP", "", "", ""], + coord_bounds=["", "time_bnds", "", "", ""], + coord_units=[ + "", + "days since 0001-01-01 00:00:00", + "degrees_east", + "degrees_north", + "meters", + ], ), ), ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.cice.h.1900-01-01.nc", - AccessNCFileInfo( - "GMOM_JRA_WD.cice.h.1900-01-01.nc", - "GMOM_JRA_WD_cice_h_XXXX_XX_XX", - "1900-01-01", - "1day", - "1900-01-01, 00:00:00", - "1900-01-02, 00:00:00", - ["TLAT", "TLON", "aice", "tarea", "time_bounds"], - [ + _AccessNCFileInfo( + filename="GMOM_JRA_WD.cice.h.1900-01-01.nc", + file_id="GMOM_JRA_WD_cice_h_XXXX_XX_XX", + filename_timestamp="1900-01-01", + frequency="1day", + start_date="1900-01-01, 00:00:00", + end_date="1900-01-02, 00:00:00", + variable=["TLAT", "TLON", "aice", "tarea", "time_bounds"], + variable_long_name=[ "T grid center latitude", "T grid center longitude", "ice area (aggregate)", "area of T grid cells", "time interval endpoints", ], - ["", "", "", "", ""], - ["", "", "time: mean", "", ""], - [ + variable_standard_name=["", "", "", "", ""], + variable_cell_methods=["", "", "time: mean", "", ""], + variable_units=[ "degrees_north", "degrees_east", "1", "m^2", "days since 0000-01-01 00:00:00", ], + coords=["time"], + coord_long_name=["time"], + coord_cartesian_axes=[""], + coord_calendar_types=[""], + 
coord_bounds=["time_bounds"], + coord_units=["days since 0000-01-01 00:00:00"], ), ), ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.ww3.hi.1900-01-02-00000.nc", - AccessNCFileInfo( - "GMOM_JRA_WD.ww3.hi.1900-01-02-00000.nc", - "GMOM_JRA_WD_ww3_hi_XXXX_XX_XX_XXXXX", - "1900-01-02-00000", - "fx", # WW3 provides no time bounds - "1900-01-02, 00:00:00", - "1900-01-02, 00:00:00", - ["EF", "mapsta"], - ["1D spectral density", "map status"], - ["", ""], - ["", ""], - ["m2 s", "unitless"], + _AccessNCFileInfo( + filename="GMOM_JRA_WD.ww3.hi.1900-01-02-00000.nc", + file_id="GMOM_JRA_WD_ww3_hi_XXXX_XX_XX_XXXXX", + filename_timestamp="1900-01-02-00000", + frequency="fx", # WW3 provides no time bounds + start_date="1900-01-02, 00:00:00", + end_date="1900-01-02, 00:00:00", + variable=["EF", "mapsta"], + variable_long_name=["1D spectral density", "map status"], + variable_standard_name=["", ""], + variable_cell_methods=["", ""], + variable_units=["m2 s", "unitless"], + coords=[], + coord_long_name=[], + coord_cartesian_axes=[], + coord_calendar_types=[], + coord_bounds=[], + coord_units=[], ), ), ], From 7f34102f6341054b793d6159161ad39318b18101 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Wed, 25 Sep 2024 16:18:50 +0800 Subject: [PATCH 08/23] - Added mypy checks for 3.9..3.12. Note type hints fail on Python3.9 due to use of '|' type union syntax - might be worth consideration? 
- Removed a couple of unused imports, cleaned up some comments --- mypy/mypy_3.10.ini | 3 +++ mypy/mypy_3.11.ini | 3 +++ mypy.ini => mypy/mypy_3.12.ini | 0 mypy/mypy_3.9.ini | 3 +++ src/access_nri_intake/catalog/translators.py | 2 -- src/access_nri_intake/source/builders.py | 6 +++--- 6 files changed, 12 insertions(+), 5 deletions(-) create mode 100644 mypy/mypy_3.10.ini create mode 100644 mypy/mypy_3.11.ini rename mypy.ini => mypy/mypy_3.12.ini (100%) create mode 100644 mypy/mypy_3.9.ini diff --git a/mypy/mypy_3.10.ini b/mypy/mypy_3.10.ini new file mode 100644 index 00000000..99c9e230 --- /dev/null +++ b/mypy/mypy_3.10.ini @@ -0,0 +1,3 @@ +[mypy] +python_version = 3.10 +ignore_missing_imports = True \ No newline at end of file diff --git a/mypy/mypy_3.11.ini b/mypy/mypy_3.11.ini new file mode 100644 index 00000000..0413b9fc --- /dev/null +++ b/mypy/mypy_3.11.ini @@ -0,0 +1,3 @@ +[mypy] +python_version = 3.11 +ignore_missing_imports = True \ No newline at end of file diff --git a/mypy.ini b/mypy/mypy_3.12.ini similarity index 100% rename from mypy.ini rename to mypy/mypy_3.12.ini diff --git a/mypy/mypy_3.9.ini b/mypy/mypy_3.9.ini new file mode 100644 index 00000000..28f27a56 --- /dev/null +++ b/mypy/mypy_3.9.ini @@ -0,0 +1,3 @@ +[mypy] +python_version = 3.9 +ignore_missing_imports = True \ No newline at end of file diff --git a/src/access_nri_intake/catalog/translators.py b/src/access_nri_intake/catalog/translators.py index 8f8bad96..40231e2a 100644 --- a/src/access_nri_intake/catalog/translators.py +++ b/src/access_nri_intake/catalog/translators.py @@ -6,8 +6,6 @@ like the ACCESS-NRI catalog """ -from __future__ import annotations - from functools import partial from typing import Callable diff --git a/src/access_nri_intake/source/builders.py b/src/access_nri_intake/source/builders.py index ed41a098..d43ffb6b 100644 --- a/src/access_nri_intake/source/builders.py +++ b/src/access_nri_intake/source/builders.py @@ -389,9 +389,9 @@ def __init__(self, path): 
@classmethod def parser(cls, file) -> dict: try: - # Need to check, but I think that the .groups() method that mypy is - # getting upset about is what the try/catch is for here - if the regex - # doesn't match, then it will throw an exception. + # mypy gets upset as match can return None. I assume this is why we + # have try/except block in the first place? If so, we might be able + # to make this more explicit? match_groups = re.match(r".*/output\d+/([^/]*)/.*\.nc", file).groups() # type: ignore realm = match_groups[0] From bc0265ca8eb5d98c9dcd226a80aefd846fcfe153 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Thu, 26 Sep 2024 09:32:10 +0800 Subject: [PATCH 09/23] Rewrote type hints to be compatible with Python3.9 (T | None => Optional[T], etc) --- src/access_nri_intake/catalog/manager.py | 9 ++++---- src/access_nri_intake/catalog/translators.py | 4 ++-- src/access_nri_intake/source/builders.py | 23 +++++++++++--------- src/access_nri_intake/source/utils.py | 11 +++++----- 4 files changed, 26 insertions(+), 21 deletions(-) diff --git a/src/access_nri_intake/catalog/manager.py b/src/access_nri_intake/catalog/manager.py index f52e2126..37b6cf01 100644 --- a/src/access_nri_intake/catalog/manager.py +++ b/src/access_nri_intake/catalog/manager.py @@ -4,6 +4,7 @@ """ Manager for adding/updating intake sources in an intake-dataframe-catalog like the ACCESS-NRI catalog """ import os +from typing import Optional, Union import intake from intake_dataframe_catalog.core import DfFileCatalog @@ -61,10 +62,10 @@ def build_esm( name: str, description: str, builder, - path: list[str] | str, + path: Union[str, list[str]], translator=DefaultTranslator, - metadata: dict | None = None, - directory: str | None = None, + metadata: Optional[dict] = None, + directory: Optional[str] = None, overwrite: bool = False, **kwargs, ): @@ -129,7 +130,7 @@ def load( path: str, driver: str = "esm_datastore", translator=DefaultTranslator, - metadata: dict | None = None, + metadata: 
Optional[dict] = None, **kwargs, ): """ diff --git a/src/access_nri_intake/catalog/translators.py b/src/access_nri_intake/catalog/translators.py index 40231e2a..f237c165 100644 --- a/src/access_nri_intake/catalog/translators.py +++ b/src/access_nri_intake/catalog/translators.py @@ -7,7 +7,7 @@ """ from functools import partial -from typing import Callable +from typing import Callable, Optional import pandas as pd import tlz @@ -98,7 +98,7 @@ def _default_translator(self, column: str) -> pd.Series: return pd.Series([val] * len_df) - def translate(self, groupby: list[str] | None = None) -> pd.DataFrame: + def translate(self, groupby: Optional[list[str]] = None) -> pd.DataFrame: """ Return the translated :py:class:`~pandas.DataFrame` of metadata and merge into set of set of rows with unique values of the columns specified. diff --git a/src/access_nri_intake/source/builders.py b/src/access_nri_intake/source/builders.py index d43ffb6b..57480a91 100644 --- a/src/access_nri_intake/source/builders.py +++ b/src/access_nri_intake/source/builders.py @@ -7,6 +7,7 @@ import re import traceback from pathlib import Path +from typing import Optional, Union import xarray as xr from ecgtools.builder import INVALID_ASSET, TRACEBACK, Builder @@ -57,14 +58,14 @@ class BaseBuilder(Builder): def __init__( self, - path: str | list[str], + path: Union[str, list[str]], depth: int = 0, - exclude_patterns: list[str] | None = None, - include_patterns: list[str] | None = None, + exclude_patterns: Optional[list[str]] = None, + include_patterns: Optional[list[str]] = None, data_format: str = "netcdf", - groupby_attrs: list[str] | None = None, - aggregations: list[dict] | None = None, - storage_options: dict | None = None, + groupby_attrs: Optional[list[str]] = None, + aggregations: Optional[list[dict]] = None, + storage_options: Optional[dict] = None, joblib_parallel_kwargs: dict = {"n_jobs": multiprocessing.cpu_count()}, ): """ @@ -119,7 +120,7 @@ def parse(self): self._parse() return self - def 
_save(self, name: str, description: str, directory: str | None): + def _save(self, name: str, description: str, directory: Union[str, None]): super().save( name=name, path_column_name=PATH_COLUMN, @@ -134,7 +135,9 @@ def _save(self, name: str, description: str, directory: str | None): to_csv_kwargs={"compression": "gzip"}, ) - def save(self, name: str, description: str, directory: str | None = None) -> None: + def save( + self, name: str, description: str, directory: Optional[str] = None + ) -> None: """ Save datastore contents to a file. @@ -218,10 +221,10 @@ def parser(file): def parse_access_filename( cls, filename: str, - patterns: list[str] | None = None, + patterns: Optional[list[str]] = None, frequencies: dict = FREQUENCIES, redaction_fill: str = "X", - ) -> tuple[str, str | None, str | None]: + ) -> tuple[str, Union[str, None], Union[str, None]]: """ Parse an ACCESS model filename and return a file id and any time information diff --git a/src/access_nri_intake/source/utils.py b/src/access_nri_intake/source/utils.py index 816bb3b4..06e82827 100644 --- a/src/access_nri_intake/source/utils.py +++ b/src/access_nri_intake/source/utils.py @@ -7,6 +7,7 @@ from dataclasses import asdict, dataclass, field from datetime import timedelta from pathlib import Path +from typing import Union import cftime import xarray as xr @@ -23,9 +24,9 @@ class _AccessNCFileInfo: catalog entry. 
""" - filename: str | Path + filename: Union[str, Path] file_id: str - filename_timestamp: str | None + filename_timestamp: Union[str, None] frequency: str start_date: str end_date: str @@ -45,7 +46,7 @@ class _AccessNCFileInfo: def __post_init__(self): self.path = str(self.filename) - def to_dict(self) -> dict[str, str | list[str]]: + def to_dict(self) -> dict[str, Union[str, list[str]]]: """ Return a dictionary representation of the NcFileInfo object """ @@ -189,7 +190,7 @@ def _guess_start_end_dates(ts, te, frequency): def get_timeinfo( ds: xr.Dataset, - filename_frequency: str | None, + filename_frequency: Union[str, None], time_dim: str, ) -> tuple[str, str, str]: """ @@ -227,7 +228,7 @@ def _todate(t): time_format = "%Y-%m-%d, %H:%M:%S" ts = None te = None - frequency: str | tuple[int | None, str] = "fx" + frequency: Union[str, tuple[Union[int, None], str]] = "fx" has_time = time_dim in ds if has_time: From f84d383f2c48214d114cb7fa070d4943caf13c47 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Fri, 27 Sep 2024 09:56:49 +0800 Subject: [PATCH 10/23] Fixed some issues tests weren't catching --- src/access_nri_intake/source/builders.py | 4 ++-- src/access_nri_intake/source/utils.py | 5 +---- tests/test_builders.py | 22 ++++++++++++++++++++++ 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/src/access_nri_intake/source/builders.py b/src/access_nri_intake/source/builders.py index 57480a91..27a2235c 100644 --- a/src/access_nri_intake/source/builders.py +++ b/src/access_nri_intake/source/builders.py @@ -303,7 +303,6 @@ def parse_access_ncfile( """ file_path = Path(file) - filename = file_path.name file_id, filename_timestamp, filename_frequency = cls.parse_access_filename( file_path.stem @@ -335,7 +334,8 @@ def parse_access_ncfile( raise EmptyFileError("This file contains no variables") output_ncfile = _AccessNCFileInfo( - filename=filename, + filename=file_path.name, + path=file, file_id=file_id, filename_timestamp=filename_timestamp, 
frequency=frequency, diff --git a/src/access_nri_intake/source/utils.py b/src/access_nri_intake/source/utils.py index 06e82827..7d74e7d4 100644 --- a/src/access_nri_intake/source/utils.py +++ b/src/access_nri_intake/source/utils.py @@ -26,6 +26,7 @@ class _AccessNCFileInfo: filename: Union[str, Path] file_id: str + path: str filename_timestamp: Union[str, None] frequency: str start_date: str @@ -41,10 +42,6 @@ class _AccessNCFileInfo: coord_calendar_types: list[str] coord_bounds: list[str] coord_units: list[str] - path: str = field(init=False) - - def __post_init__(self): - self.path = str(self.filename) def to_dict(self) -> dict[str, Union[str, list[str]]]: """ diff --git a/tests/test_builders.py b/tests/test_builders.py index 2cf523eb..032ad94d 100644 --- a/tests/test_builders.py +++ b/tests/test_builders.py @@ -366,6 +366,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessOm2Builder, "access-om2/output000/ocean/ocean_grid.nc", _AccessNCFileInfo( + path=None, # type: ignore filename="ocean_grid.nc", file_id="ocean_grid", filename_timestamp=None, @@ -389,6 +390,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessOm2Builder, "access-om2/output000/ocean/ocean.nc", _AccessNCFileInfo( + path=None, # type: ignore filename="ocean.nc", file_id="ocean", filename_timestamp=None, @@ -424,6 +426,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessOm2Builder, "access-om2/output000/ocean/ocean_month.nc", _AccessNCFileInfo( + path=None, # type: ignore filename="ocean_month.nc", file_id="ocean_month", filename_timestamp=None, @@ -463,6 +466,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessOm2Builder, "access-om2/output000/ocean/ocean_month_inst_nobounds.nc", _AccessNCFileInfo( + path=None, # type: ignore filename="ocean_month_inst_nobounds.nc", file_id="ocean_month_inst_nobounds", filename_timestamp=None, @@ -492,6 +496,7 @@ def 
test_parse_access_filename(builder, filename, expected): builders.AccessOm2Builder, "access-om2/output000/ice/OUTPUT/iceh.1900-01.nc", _AccessNCFileInfo( + path=None, # type: ignore filename="iceh.1900-01.nc", file_id="iceh_XXXX_XX", filename_timestamp="1900-01", @@ -527,6 +532,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessCm2Builder, "access-cm2/by578/history/atm/netCDF/by578a.pd201501_dai.nc", _AccessNCFileInfo( + path=None, # type: ignore filename="by578a.pd201501_dai.nc", file_id="by578a_pdXXXXXX_dai", filename_timestamp="201501", @@ -550,6 +556,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessCm2Builder, "access-cm2/by578/history/ice/iceh_d.2015-01.nc", _AccessNCFileInfo( + path=None, # type: ignore filename="iceh_d.2015-01.nc", file_id="iceh_d_XXXX_XX", filename_timestamp="2015-01", @@ -585,6 +592,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessCm2Builder, "access-cm2/by578/history/ocn/ocean_daily.nc-20150630", _AccessNCFileInfo( + path=None, # type: ignore filename="ocean_daily.nc-20150630", file_id="ocean_daily", filename_timestamp=None, @@ -618,6 +626,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessCm2Builder, "access-cm2/by578/history/ocn/ocean_scalar.nc-20150630", _AccessNCFileInfo( + path=None, # type: ignore filename="ocean_scalar.nc-20150630", file_id="ocean_scalar", filename_timestamp=None, @@ -644,6 +653,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessEsm15Builder, "access-esm1-5/history/atm/netCDF/HI-C-05-r1.pa-185001_mon.nc", _AccessNCFileInfo( + path=None, # type: ignore filename="HI-C-05-r1.pa-185001_mon.nc", file_id="HI_C_05_r1_pa_XXXXXX_mon", filename_timestamp="185001", @@ -667,6 +677,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessEsm15Builder, "access-esm1-5/history/ice/iceh.1850-01.nc", _AccessNCFileInfo( + path=None, # type: ignore 
filename="iceh.1850-01.nc", file_id="iceh_XXXX_XX", filename_timestamp="1850-01", @@ -702,6 +713,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessEsm15Builder, "access-esm1-5/history/ocn/ocean_bgc_ann.nc-18501231", _AccessNCFileInfo( + path=None, # type: ignore filename="ocean_bgc_ann.nc-18501231", file_id="ocean_bgc_ann", filename_timestamp=None, @@ -738,6 +750,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessEsm15Builder, "access-esm1-5/history/ocn/ocean_bgc.nc-18501231", _AccessNCFileInfo( + path=None, # type: ignore filename="ocean_bgc.nc-18501231", file_id="ocean_bgc", filename_timestamp=None, @@ -773,6 +786,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.native_1900_01.nc", _AccessNCFileInfo( + path=None, # type: ignore filename="GMOM_JRA_WD.mom6.h.native_1900_01.nc", file_id="GMOM_JRA_WD_mom6_h_native_XXXX_XX", filename_timestamp="1900_01", @@ -838,6 +852,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.sfc_1900_01_02.nc", _AccessNCFileInfo( + path=None, # type: ignore filename="GMOM_JRA_WD.mom6.h.sfc_1900_01_02.nc", file_id="GMOM_JRA_WD_mom6_h_sfc_XXXX_XX_XX", filename_timestamp="1900_01_02", @@ -889,6 +904,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.static.nc", _AccessNCFileInfo( + path=None, # type: ignore filename="GMOM_JRA_WD.mom6.h.static.nc", file_id="GMOM_JRA_WD_mom6_h_static", filename_timestamp=None, @@ -918,6 +934,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.z_1900_01.nc", _AccessNCFileInfo( + path=None, # type: ignore filename="GMOM_JRA_WD.mom6.h.z_1900_01.nc", file_id="GMOM_JRA_WD_mom6_h_z_XXXX_XX", filename_timestamp="1900_01", @@ -983,6 
+1000,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.cice.h.1900-01-01.nc", _AccessNCFileInfo( + path=None, # type: ignore filename="GMOM_JRA_WD.cice.h.1900-01-01.nc", file_id="GMOM_JRA_WD_cice_h_XXXX_XX_XX", filename_timestamp="1900-01-01", @@ -1018,6 +1036,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.ww3.hi.1900-01-02-00000.nc", _AccessNCFileInfo( + path=None, # type: ignore filename="GMOM_JRA_WD.ww3.hi.1900-01-02-00000.nc", file_id="GMOM_JRA_WD_ww3_hi_XXXX_XX_XX_XXXXX", filename_timestamp="1900-01-02-00000", @@ -1042,4 +1061,7 @@ def test_parse_access_filename(builder, filename, expected): def test_parse_access_ncfile(test_data, builder, filename, expected): file = str(test_data / Path(filename)) + # Set the path to the test data directory + expected.path = file + assert builder.parse_access_ncfile(file) == expected From 7799c8db7ead1671930ef5cf49800776bc07b379 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Fri, 27 Sep 2024 12:51:53 +1000 Subject: [PATCH 11/23] Renamed => - makes indexing more consistent --- src/access_nri_intake/source/utils.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/access_nri_intake/source/utils.py b/src/access_nri_intake/source/utils.py index 7d74e7d4..a3a057e4 100644 --- a/src/access_nri_intake/source/utils.py +++ b/src/access_nri_intake/source/utils.py @@ -22,6 +22,12 @@ class _AccessNCFileInfo: """ Holds information about a NetCDF file that is used to create an intake-esm catalog entry. + + ______ + Notes: + Use of both path and filename seems redundant, but constructing filename from + the path using a __post_init__ method makes testing more difficult. On balance, + more explicit tests are probably more important than the slight redundancy. 
""" filename: Union[str, Path] @@ -130,7 +136,7 @@ def to_ncinfo_dict(self) -> dict[str, list[str]]: defined explicitly for use in the _AccessNCFileInfo constructor. """ return { - "coords": self.coord_list, + "coord": self.coord_list, "coord_long_name": self.long_name_list, "coord_cartesian_axes": self.cartesian_axis_list, "coord_calendar_types": self.calendar_type_list, From e6590c98db0b3309035ea9677f61da2a5ee1a858 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Fri, 27 Sep 2024 13:03:23 +1000 Subject: [PATCH 12/23] Fixed a couple of issues relating to changing from coords => coord in search (forgot to test) --- src/access_nri_intake/source/utils.py | 2 +- tests/test_builders.py | 38 +++++++++++++-------------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/access_nri_intake/source/utils.py b/src/access_nri_intake/source/utils.py index a3a057e4..3ec01e6d 100644 --- a/src/access_nri_intake/source/utils.py +++ b/src/access_nri_intake/source/utils.py @@ -42,7 +42,7 @@ class _AccessNCFileInfo: variable_standard_name: list[str] variable_cell_methods: list[str] variable_units: list[str] - coords: list[str] + coord: list[str] coord_long_name: list[str] coord_cartesian_axes: list[str] coord_calendar_types: list[str] diff --git a/tests/test_builders.py b/tests/test_builders.py index 032ad94d..d21857f0 100644 --- a/tests/test_builders.py +++ b/tests/test_builders.py @@ -379,7 +379,7 @@ def test_parse_access_filename(builder, filename, expected): variable_cell_methods=["time: point", "time: point"], variable_units=["degrees_N", "degrees_E"], coord_long_name=["tcell longitude", "tcell latitude"], - coords=["xt_ocean", "yt_ocean"], + coord=["xt_ocean", "yt_ocean"], coord_cartesian_axes=["X", "Y"], coord_calendar_types=["", ""], coord_bounds=["", ""], @@ -402,7 +402,7 @@ def test_parse_access_filename(builder, filename, expected): variable_standard_name=["sea_water_conservative_temperature", ""], variable_cell_methods=["time: mean", ""], 
variable_units=["K", "days"], - coords=["nv", "st_ocean", "time", "xt_ocean", "yt_ocean"], + coord=["nv", "st_ocean", "time", "xt_ocean", "yt_ocean"], coord_long_name=[ "vertex number", "tcell zstar depth", @@ -444,7 +444,7 @@ def test_parse_access_filename(builder, filename, expected): ], variable_cell_methods=["time: mean", ""], variable_units=["m", "days"], - coords=["nv", "time", "xt_ocean", "yt_ocean"], + coord=["nv", "time", "xt_ocean", "yt_ocean"], coord_long_name=[ "vertex number", "time", @@ -480,7 +480,7 @@ def test_parse_access_filename(builder, filename, expected): ], variable_cell_methods=["time: mean"], variable_units=["m"], - coords=["time", "xt_ocean", "yt_ocean"], + coord=["time", "xt_ocean", "yt_ocean"], coord_long_name=["time", "tcell longitude", "tcell latitude"], coord_cartesian_axes=["T", "X", "Y"], coord_calendar_types=["NOLEAP", "", ""], @@ -520,7 +520,7 @@ def test_parse_access_filename(builder, filename, expected): "m^2", "days since 1900-01-01 00:00:00", ], - coords=["time"], + coord=["time"], coord_long_name=["model time"], coord_cartesian_axes=[""], coord_calendar_types=[""], @@ -544,7 +544,7 @@ def test_parse_access_filename(builder, filename, expected): variable_standard_name=["air_temperature"], variable_cell_methods=["time: mean"], variable_units=["K"], - coords=[], + coord=[], coord_long_name=[], coord_cartesian_axes=[], coord_calendar_types=[], @@ -580,7 +580,7 @@ def test_parse_access_filename(builder, filename, expected): "m^2", "days since 1850-01-01 00:00:00", ], - coords=["time"], + coord=["time"], coord_long_name=["model time"], coord_cartesian_axes=[""], coord_calendar_types=[""], @@ -604,7 +604,7 @@ def test_parse_access_filename(builder, filename, expected): variable_standard_name=["sea_surface_temperature", ""], variable_cell_methods=["time: mean", ""], variable_units=["K", "days"], - coords=["nv", "time", "xt_ocean", "yt_ocean"], + coord=["nv", "time", "xt_ocean", "yt_ocean"], coord_long_name=[ "vertex number", "time", 
@@ -641,7 +641,7 @@ def test_parse_access_filename(builder, filename, expected): variable_standard_name=["sea_water_potential_temperature", ""], variable_cell_methods=["time: mean", ""], variable_units=["deg_C", "days"], - coords=["nv", "scalar_axis", "time"], + coord=["nv", "scalar_axis", "time"], coord_long_name=["vertex number", "none", "time"], coord_cartesian_axes=["N", "X", "T"], coord_calendar_types=["", "", "GREGORIAN"], @@ -665,7 +665,7 @@ def test_parse_access_filename(builder, filename, expected): variable_standard_name=["air_temperature"], variable_cell_methods=["time: mean"], variable_units=["K"], - coords=[], + coord=[], coord_long_name=[], coord_cartesian_axes=[], coord_calendar_types=[], @@ -701,7 +701,7 @@ def test_parse_access_filename(builder, filename, expected): "m^2", "days since 0001-01-01 00:00:00", ], - coords=["time"], + coord=["time"], coord_long_name=["model time"], coord_cartesian_axes=[""], coord_calendar_types=[""], @@ -728,7 +728,7 @@ def test_parse_access_filename(builder, filename, expected): variable_standard_name=["", ""], variable_cell_methods=["time: mean", ""], variable_units=["mmol/m^2/s", "days"], - coords=["nv", "time", "xt_ocean", "yt_ocean"], + coord=["nv", "time", "xt_ocean", "yt_ocean"], coord_long_name=[ "vertex number", "time", @@ -762,7 +762,7 @@ def test_parse_access_filename(builder, filename, expected): variable_standard_name=["", ""], variable_cell_methods=["time: mean", ""], variable_units=["mmol/m^3", "days"], - coords=["nv", "st_ocean", "time", "xt_ocean", "yt_ocean"], + coord=["nv", "st_ocean", "time", "xt_ocean", "yt_ocean"], coord_long_name=[ "vertex number", "tcell zstar depth", @@ -828,7 +828,7 @@ def test_parse_access_filename(builder, filename, expected): "degC", "days since 0001-01-01 00:00:00", ], - coords=["nv", "time", "xh", "yh", "zl"], + coord=["nv", "time", "xh", "yh", "zl"], coord_long_name=[ "vertex number", "time", @@ -882,7 +882,7 @@ def test_parse_access_filename(builder, filename, 
expected): "days since 0001-01-01 00:00:00", "degC", ], - coords=["nv", "time", "xh", "yh"], + coord=["nv", "time", "xh", "yh"], coord_long_name=[ "vertex number", "time", @@ -919,7 +919,7 @@ def test_parse_access_filename(builder, filename, expected): variable_standard_name=["", ""], variable_cell_methods=["time: point", "time: point"], variable_units=["degrees_north", "degrees_east"], - coords=["xh", "yh"], + coord=["xh", "yh"], coord_long_name=[ "h point nominal longitude", "h point nominal latitude", @@ -976,7 +976,7 @@ def test_parse_access_filename(builder, filename, expected): "degC", "days since 0001-01-01 00:00:00", ], - coords=["nv", "time", "xh", "yh", "z_l"], + coord=["nv", "time", "xh", "yh", "z_l"], coord_long_name=[ "vertex number", "time", @@ -1024,7 +1024,7 @@ def test_parse_access_filename(builder, filename, expected): "m^2", "days since 0000-01-01 00:00:00", ], - coords=["time"], + coord=["time"], coord_long_name=["time"], coord_cartesian_axes=[""], coord_calendar_types=[""], @@ -1048,7 +1048,7 @@ def test_parse_access_filename(builder, filename, expected): variable_standard_name=["", ""], variable_cell_methods=["", ""], variable_units=["m2 s", "unitless"], - coords=[], + coord=[], coord_long_name=[], coord_cartesian_axes=[], coord_calendar_types=[], From 0c96e28f347c553afdb22c335f89ab5fd6f19b0b Mon Sep 17 00:00:00 2001 From: Marc White Date: Mon, 30 Sep 2024 16:27:21 +1000 Subject: [PATCH 13/23] Add cmip6_ig45 to catalog --- config/cmip6.yaml | 4 ++++ config/experiments/cmip6_ig45/metadata.yaml | 26 +++++++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 config/experiments/cmip6_ig45/metadata.yaml diff --git a/config/cmip6.yaml b/config/cmip6.yaml index d50b0215..b446475b 100644 --- a/config/cmip6.yaml +++ b/config/cmip6.yaml @@ -11,3 +11,7 @@ sources: - metadata_yaml: /g/data/xp65/admin/intake/metadata/cmip6_oi10/metadata.yaml path: - /g/data/oi10/catalog/v2/esm/catalog.json + + - metadata_yaml: 
/g/data/xp65/admin/intake/metadata/cmip6_ig45/metadata.yaml + path: + - /g/data/ig45/catalog/v2/esm/catalog.json \ No newline at end of file diff --git a/config/experiments/cmip6_ig45/metadata.yaml b/config/experiments/cmip6_ig45/metadata.yaml new file mode 100644 index 00000000..8046f731 --- /dev/null +++ b/config/experiments/cmip6_ig45/metadata.yaml @@ -0,0 +1,26 @@ +name: cmip6_ig45 +experiment_uuid: c7021d1e-7ba2-11ef-beb5-000007d3fe80 +description: 20km regional projections for CORDEX-CMIP6 from the Queensland Future Climate Science Program +long_description: >- + This dataset includes projections at 20km, formatted to meet the CORDEX-CMIP6 data standards. + The 20km projections were derived from the 10km projections. +model: +- CMIP6 +frequency: +- +variable: +- +nominal_resolution: +- +version: +contact: NCI +email: help@nci.org.au +reference: +license: +url: https://geonetwork.nci.org.au/geonetwork/srv/eng/catalog.search#/metadata/f7465_8388_5100_7022 +parent_experiment: +related_experiments: +- +notes: +keywords: +- cmip \ No newline at end of file From 1d9f429cdc119ce9a1e68bb579ffd0d2123acf61 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Thu, 3 Oct 2024 11:06:43 +1000 Subject: [PATCH 14/23] - Moved coordinate variables back into variables (simpler interface) - Updated tests to better respect moving coordinate variables back into variables - Moved mypy setup stuff into to 'pre-commit-config.yaml' --- .pre-commit-config.yaml | 7 + mypy/mypy_3.10.ini | 3 - mypy/mypy_3.11.ini | 3 - mypy/mypy_3.12.ini | 3 - mypy/mypy_3.9.ini | 3 - src/access_nri_intake/source/builders.py | 9 +- src/access_nri_intake/source/utils.py | 52 --- tests/test_builders.py | 430 ++++++++++++----------- 8 files changed, 238 insertions(+), 272 deletions(-) delete mode 100644 mypy/mypy_3.10.ini delete mode 100644 mypy/mypy_3.11.ini delete mode 100644 mypy/mypy_3.12.ini delete mode 100644 mypy/mypy_3.9.ini diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 
13106eeb..2843edd9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,3 +10,10 @@ repos: hooks: - id: black language_version: python3 +# Mypy + - repo: https://github.com/pre-commit/mirrors-mypy + rev: 'v1.11.2' + hooks: + - id: mypy + name: mypy + additional_dependencies: [types-PyYAML==6.0.12.20240808] diff --git a/mypy/mypy_3.10.ini b/mypy/mypy_3.10.ini deleted file mode 100644 index 99c9e230..00000000 --- a/mypy/mypy_3.10.ini +++ /dev/null @@ -1,3 +0,0 @@ -[mypy] -python_version = 3.10 -ignore_missing_imports = True \ No newline at end of file diff --git a/mypy/mypy_3.11.ini b/mypy/mypy_3.11.ini deleted file mode 100644 index 0413b9fc..00000000 --- a/mypy/mypy_3.11.ini +++ /dev/null @@ -1,3 +0,0 @@ -[mypy] -python_version = 3.11 -ignore_missing_imports = True \ No newline at end of file diff --git a/mypy/mypy_3.12.ini b/mypy/mypy_3.12.ini deleted file mode 100644 index a47639ed..00000000 --- a/mypy/mypy_3.12.ini +++ /dev/null @@ -1,3 +0,0 @@ -[mypy] -python_version = 3.12 -ignore_missing_imports = True \ No newline at end of file diff --git a/mypy/mypy_3.9.ini b/mypy/mypy_3.9.ini deleted file mode 100644 index 28f27a56..00000000 --- a/mypy/mypy_3.9.ini +++ /dev/null @@ -1,3 +0,0 @@ -[mypy] -python_version = 3.9 -ignore_missing_imports = True \ No newline at end of file diff --git a/src/access_nri_intake/source/builders.py b/src/access_nri_intake/source/builders.py index 27a2235c..a05e8fed 100644 --- a/src/access_nri_intake/source/builders.py +++ b/src/access_nri_intake/source/builders.py @@ -17,7 +17,6 @@ from .utils import ( EmptyFileError, _AccessNCFileInfo, - _CoordVarInfo, _DataVarInfo, get_timeinfo, ) @@ -316,16 +315,11 @@ def parse_access_ncfile( decode_coords=False, ) as ds: dvars = _DataVarInfo() - cvars = _CoordVarInfo() - for var in ds.data_vars: + for var in ds.variables: attrs = ds[var].attrs dvars.append_attrs(var, attrs) # type: ignore - for var in ds.coords: - attrs = ds[var].attrs - cvars.append_attrs(var, attrs) # type: 
ignore - start_date, end_date, frequency = get_timeinfo( ds, filename_frequency, time_dim ) @@ -342,7 +336,6 @@ def parse_access_ncfile( start_date=start_date, end_date=end_date, **dvars.to_ncinfo_dict(), - **cvars.to_ncinfo_dict(), ) return output_ncfile diff --git a/src/access_nri_intake/source/utils.py b/src/access_nri_intake/source/utils.py index 3ec01e6d..c9082f32 100644 --- a/src/access_nri_intake/source/utils.py +++ b/src/access_nri_intake/source/utils.py @@ -42,12 +42,6 @@ class _AccessNCFileInfo: variable_standard_name: list[str] variable_cell_methods: list[str] variable_units: list[str] - coord: list[str] - coord_long_name: list[str] - coord_cartesian_axes: list[str] - coord_calendar_types: list[str] - coord_bounds: list[str] - coord_units: list[str] def to_dict(self) -> dict[str, Union[str, list[str]]]: """ @@ -99,52 +93,6 @@ def to_ncinfo_dict(self) -> dict[str, list[str]]: } -@dataclass -class _CoordVarInfo: - """ - Holds information about the coordinate variables in a NetCDF file that is - used to create an intake-esm catalog entry. - """ - - coord_list: list[str] = field(default_factory=list) - long_name_list: list[str] = field(default_factory=list) - cartesian_axis_list: list[str] = field(default_factory=list) - calendar_type_list: list[str] = field(default_factory=list) - bounds_list: list[str] = field(default_factory=list) - units_list: list[str] = field(default_factory=list) - - def append_attrs(self, var: str, attrs: dict) -> None: - """ - Append attributes to the CoordVarInfo object, if the attribute has a - 'long_name' key. - - TODO: Why do we need a long name key? 
seems important - """ - if "long_name" not in attrs: - return None - - self.coord_list.append(var) - self.long_name_list.append(attrs["long_name"]) - self.cartesian_axis_list.append(attrs.get("cartesian_axis", "")) - self.calendar_type_list.append(attrs.get("calendar_type", "")) - self.bounds_list.append(attrs.get("bounds", "")) - self.units_list.append(attrs.get("units", "")) - - def to_ncinfo_dict(self) -> dict[str, list[str]]: - """ - Return a dictionary representation of the CoordVarInfo object. Fields are - defined explicitly for use in the _AccessNCFileInfo constructor. - """ - return { - "coord": self.coord_list, - "coord_long_name": self.long_name_list, - "coord_cartesian_axes": self.cartesian_axis_list, - "coord_calendar_types": self.calendar_type_list, - "coord_bounds": self.bounds_list, - "coord_units": self.units_list, - } - - def _add_month_start(time, n: int): """Add months to cftime datetime and truncate to start""" year = time.year + ((time.month + n - 1) // 12) diff --git a/tests/test_builders.py b/tests/test_builders.py index d21857f0..c28f9eac 100644 --- a/tests/test_builders.py +++ b/tests/test_builders.py @@ -373,17 +373,16 @@ def test_parse_access_filename(builder, filename, expected): frequency="fx", start_date="none", end_date="none", - variable=["geolat_t", "geolon_t"], - variable_long_name=["tracer latitude", "tracer longitude"], - variable_standard_name=["", ""], - variable_cell_methods=["time: point", "time: point"], - variable_units=["degrees_N", "degrees_E"], - coord_long_name=["tcell longitude", "tcell latitude"], - coord=["xt_ocean", "yt_ocean"], - coord_cartesian_axes=["X", "Y"], - coord_calendar_types=["", ""], - coord_bounds=["", ""], - coord_units=["degrees_E", "degrees_N"], + variable=["geolat_t", "geolon_t", "xt_ocean", "yt_ocean"], + variable_long_name=[ + "tracer latitude", + "tracer longitude", + "tcell longitude", + "tcell latitude", + ], + variable_standard_name=["", "", "", ""], + variable_cell_methods=["time: point", 
"time: point", "", ""], + variable_units=["degrees_N", "degrees_E", "degrees_E", "degrees_N"], ), ), ( @@ -397,26 +396,40 @@ def test_parse_access_filename(builder, filename, expected): frequency="1yr", start_date="1900-01-01, 00:00:00", end_date="1910-01-01, 00:00:00", - variable=["temp", "time_bounds"], - variable_long_name=["Conservative temperature", "time axis boundaries"], - variable_standard_name=["sea_water_conservative_temperature", ""], - variable_cell_methods=["time: mean", ""], - variable_units=["K", "days"], - coord=["nv", "st_ocean", "time", "xt_ocean", "yt_ocean"], - coord_long_name=[ + variable=[ + "nv", + "st_ocean", + "temp", + "time", + "time_bounds", + "xt_ocean", + "yt_ocean", + ], + variable_long_name=[ "vertex number", "tcell zstar depth", + "Conservative temperature", "time", + "time axis boundaries", "tcell longitude", "tcell latitude", ], - coord_cartesian_axes=["N", "Z", "T", "X", "Y"], - coord_calendar_types=["", "", "NOLEAP", "", ""], - coord_bounds=["", "", "time_bounds", "", ""], - coord_units=[ + variable_standard_name=[ + "", + "", + "sea_water_conservative_temperature", + "", + "", + "", + "", + ], + variable_cell_methods=["", "", "time: mean", "", "", "", ""], + variable_units=[ "none", "meters", + "K", "days since 1900-01-01 00:00:00", + "days", "degrees_E", "degrees_N", ], @@ -433,30 +446,29 @@ def test_parse_access_filename(builder, filename, expected): frequency="1mon", start_date="1900-01-01, 00:00:00", end_date="1910-01-01, 00:00:00", - variable=["mld", "time_bounds"], + variable=["mld", "nv", "time", "time_bounds", "xt_ocean", "yt_ocean"], variable_long_name=[ "mixed layer depth determined by density criteria", + "vertex number", + "time", "time axis boundaries", + "tcell longitude", + "tcell latitude", ], variable_standard_name=[ "ocean_mixed_layer_thickness_defined_by_sigma_t", "", + "", + "", + "", + "", ], - variable_cell_methods=["time: mean", ""], - variable_units=["m", "days"], - coord=["nv", "time", "xt_ocean", 
"yt_ocean"], - coord_long_name=[ - "vertex number", - "time", - "tcell longitude", - "tcell latitude", - ], - coord_cartesian_axes=["N", "T", "X", "Y"], - coord_calendar_types=["", "NOLEAP", "", ""], - coord_bounds=["", "time_bounds", "", ""], - coord_units=[ + variable_cell_methods=["time: mean", "", "", "", "", ""], + variable_units=[ + "m", "none", "days since 1900-01-01 00:00:00", + "days", "degrees_E", "degrees_N", ], @@ -473,19 +485,22 @@ def test_parse_access_filename(builder, filename, expected): frequency="1mon", start_date="1900-01-01, 00:00:00", end_date="1900-02-01, 00:00:00", - variable=["mld"], - variable_long_name=["mixed layer depth determined by density criteria"], + variable=["mld", "time", "xt_ocean", "yt_ocean"], + variable_long_name=[ + "mixed layer depth determined by density criteria", + "time", + "tcell longitude", + "tcell latitude", + ], variable_standard_name=[ - "ocean_mixed_layer_thickness_defined_by_sigma_t" + "ocean_mixed_layer_thickness_defined_by_sigma_t", + "", + "", + "", ], - variable_cell_methods=["time: mean"], - variable_units=["m"], - coord=["time", "xt_ocean", "yt_ocean"], - coord_long_name=["time", "tcell longitude", "tcell latitude"], - coord_cartesian_axes=["T", "X", "Y"], - coord_calendar_types=["NOLEAP", "", ""], - coord_bounds=["time_bounds", "", ""], - coord_units=[ + variable_cell_methods=["time: mean", "", "", ""], + variable_units=[ + "m", "days since 1900-01-01 00:00:00", "degrees_E", "degrees_N", @@ -503,29 +518,25 @@ def test_parse_access_filename(builder, filename, expected): frequency="1mon", start_date="1900-01-01, 00:00:00", end_date="1900-02-01, 00:00:00", - variable=["TLAT", "TLON", "aice_m", "tarea", "time_bounds"], + variable=["TLAT", "TLON", "aice_m", "tarea", "time", "time_bounds"], variable_long_name=[ "T grid center latitude", "T grid center longitude", "ice area (aggregate)", "area of T grid cells", + "model time", "boundaries for time-averaging interval", ], - variable_standard_name=["", "", "", 
"", ""], - variable_cell_methods=["", "", "time: mean", "", ""], + variable_standard_name=["", "", "", "", "", ""], + variable_cell_methods=["", "", "time: mean", "", "", ""], variable_units=[ "degrees_north", "degrees_east", "1", "m^2", "days since 1900-01-01 00:00:00", + "days since 1900-01-01 00:00:00", ], - coord=["time"], - coord_long_name=["model time"], - coord_cartesian_axes=[""], - coord_calendar_types=[""], - coord_bounds=["time_bounds"], - coord_units=["days since 1900-01-01 00:00:00"], ), ), ( @@ -544,12 +555,6 @@ def test_parse_access_filename(builder, filename, expected): variable_standard_name=["air_temperature"], variable_cell_methods=["time: mean"], variable_units=["K"], - coord=[], - coord_long_name=[], - coord_cartesian_axes=[], - coord_calendar_types=[], - coord_bounds=[], - coord_units=[], ), ), ( @@ -563,29 +568,25 @@ def test_parse_access_filename(builder, filename, expected): frequency="1day", start_date="2015-01-01, 00:00:00", end_date="2015-02-01, 00:00:00", - variable=["TLAT", "TLON", "aice", "tarea", "time_bounds"], + variable=["TLAT", "TLON", "aice", "tarea", "time", "time_bounds"], variable_long_name=[ "T grid center latitude", "T grid center longitude", "ice area (aggregate)", "area of T grid cells", + "model time", "boundaries for time-averaging interval", ], - variable_standard_name=["", "", "", "", ""], - variable_cell_methods=["", "", "time: mean", "", ""], + variable_standard_name=["", "", "", "", "", ""], + variable_cell_methods=["", "", "time: mean", "", "", ""], variable_units=[ "degrees_north", "degrees_east", "1", "m^2", "days since 1850-01-01 00:00:00", + "days since 1850-01-01 00:00:00", ], - coord=["time"], - coord_long_name=["model time"], - coord_cartesian_axes=[""], - coord_calendar_types=[""], - coord_bounds=["time_bounds"], - coord_units=["days since 1850-01-01 00:00:00"], ), ), ( @@ -599,24 +600,22 @@ def test_parse_access_filename(builder, filename, expected): frequency="1day", start_date="2015-01-01, 00:00:00", 
end_date="2015-07-01, 00:00:00", - variable=["sst", "time_bounds"], - variable_long_name=["Potential temperature", "time axis boundaries"], - variable_standard_name=["sea_surface_temperature", ""], - variable_cell_methods=["time: mean", ""], - variable_units=["K", "days"], - coord=["nv", "time", "xt_ocean", "yt_ocean"], - coord_long_name=[ + variable=["nv", "sst", "time", "time_bounds", "xt_ocean", "yt_ocean"], + variable_long_name=[ "vertex number", + "Potential temperature", "time", + "time axis boundaries", "tcell longitude", "tcell latitude", ], - coord_cartesian_axes=["N", "T", "X", "Y"], - coord_calendar_types=["", "GREGORIAN", "", ""], - coord_bounds=["", "time_bounds", "", ""], - coord_units=[ + variable_standard_name=["", "sea_surface_temperature", "", "", "", ""], + variable_cell_methods=["", "time: mean", "", "", "", ""], + variable_units=[ "none", + "K", "days since 1850-01-01 00:00:00", + "days", "degrees_E", "degrees_N", ], @@ -633,20 +632,35 @@ def test_parse_access_filename(builder, filename, expected): frequency="1mon", start_date="2015-01-01, 00:00:00", end_date="2015-07-01, 00:00:00", - variable=["temp_global_ave", "time_bounds"], + variable=[ + "nv", + "scalar_axis", + "temp_global_ave", + "time", + "time_bounds", + ], variable_long_name=[ + "vertex number", + "none", "Global mean temp in liquid seawater", + "time", "time axis boundaries", ], - variable_standard_name=["sea_water_potential_temperature", ""], - variable_cell_methods=["time: mean", ""], - variable_units=["deg_C", "days"], - coord=["nv", "scalar_axis", "time"], - coord_long_name=["vertex number", "none", "time"], - coord_cartesian_axes=["N", "X", "T"], - coord_calendar_types=["", "", "GREGORIAN"], - coord_bounds=["", "", "time_bounds"], - coord_units=["none", "none", "days since 1850-01-01 00:00:00"], + variable_standard_name=[ + "", + "", + "sea_water_potential_temperature", + "", + "", + ], + variable_cell_methods=["", "", "time: mean", "", ""], + variable_units=[ + "none", + 
"none", + "deg_C", + "days since 1850-01-01 00:00:00", + "days", + ], ), ), ( @@ -665,12 +679,6 @@ def test_parse_access_filename(builder, filename, expected): variable_standard_name=["air_temperature"], variable_cell_methods=["time: mean"], variable_units=["K"], - coord=[], - coord_long_name=[], - coord_cartesian_axes=[], - coord_calendar_types=[], - coord_bounds=[], - coord_units=[], ), ), ( @@ -684,29 +692,25 @@ def test_parse_access_filename(builder, filename, expected): frequency="1mon", start_date="1850-01-01, 00:00:00", end_date="1850-02-01, 00:00:00", - variable=["TLAT", "TLON", "aice", "tarea", "time_bounds"], + variable=["TLAT", "TLON", "aice", "tarea", "time", "time_bounds"], variable_long_name=[ "T grid center latitude", "T grid center longitude", "ice area (aggregate)", "area of T grid cells", + "model time", "boundaries for time-averaging interval", ], - variable_standard_name=["", "", "", "", ""], - variable_cell_methods=["", "", "time: mean", "", ""], + variable_standard_name=["", "", "", "", "", ""], + variable_cell_methods=["", "", "time: mean", "", "", ""], variable_units=[ "degrees_north", "degrees_east", "1", "m^2", "days since 0001-01-01 00:00:00", + "days since 0001-01-01 00:00:00", ], - coord=["time"], - coord_long_name=["model time"], - coord_cartesian_axes=[""], - coord_calendar_types=[""], - coord_bounds=["time_bounds"], - coord_units=["days since 0001-01-01 00:00:00"], ), ), ( @@ -720,27 +724,29 @@ def test_parse_access_filename(builder, filename, expected): frequency="1yr", start_date="1849-12-30, 00:00:00", end_date="1850-12-30, 00:00:00", - variable=["fgco2_raw", "time_bounds"], + variable=[ + "fgco2_raw", + "nv", + "time", + "time_bounds", + "xt_ocean", + "yt_ocean", + ], variable_long_name=[ "Flux into ocean - DIC, inc. 
anth.", - "time axis boundaries", - ], - variable_standard_name=["", ""], - variable_cell_methods=["time: mean", ""], - variable_units=["mmol/m^2/s", "days"], - coord=["nv", "time", "xt_ocean", "yt_ocean"], - coord_long_name=[ "vertex number", "time", + "time axis boundaries", "tcell longitude", "tcell latitude", ], - coord_cartesian_axes=["N", "T", "X", "Y"], - coord_calendar_types=["", "GREGORIAN", "", ""], - coord_bounds=["", "time_bounds", "", ""], - coord_units=[ + variable_standard_name=["", "", "", "", "", ""], + variable_cell_methods=["time: mean", "", "", "", "", ""], + variable_units=[ + "mmol/m^2/s", "none", "days since 0001-01-01 00:00:00", + "days", "degrees_E", "degrees_N", ], @@ -757,26 +763,32 @@ def test_parse_access_filename(builder, filename, expected): frequency="1mon", start_date="1849-12-30, 00:00:00", end_date="1850-12-30, 00:00:00", - variable=["o2", "time_bounds"], - variable_long_name=["o2", "time axis boundaries"], - variable_standard_name=["", ""], - variable_cell_methods=["time: mean", ""], - variable_units=["mmol/m^3", "days"], - coord=["nv", "st_ocean", "time", "xt_ocean", "yt_ocean"], - coord_long_name=[ + variable=[ + "nv", + "o2", + "st_ocean", + "time", + "time_bounds", + "xt_ocean", + "yt_ocean", + ], + variable_long_name=[ "vertex number", + "o2", "tcell zstar depth", "time", + "time axis boundaries", "tcell longitude", "tcell latitude", ], - coord_cartesian_axes=["N", "Z", "T", "X", "Y"], - coord_calendar_types=["", "", "GREGORIAN", "", ""], - coord_bounds=["", "", "time_bounds", "", ""], - coord_units=[ + variable_standard_name=["", "", "", "", "", "", ""], + variable_cell_methods=["", "time: mean", "", "", "", "", ""], + variable_units=[ "none", + "mmol/m^3", "meters", "days since 0001-01-01 00:00:00", + "days", "degrees_E", "degrees_N", ], @@ -797,50 +809,57 @@ def test_parse_access_filename(builder, filename, expected): "average_DT", "average_T1", "average_T2", + "nv", "thetao", + "time", "time_bnds", + "xh", + "yh", + 
"zl", ], variable_long_name=[ "Length of average period", "Start time for average period", "End time for average period", + "vertex number", "Sea Water Potential Temperature", + "time", "time axis boundaries", + "h point nominal longitude", + "h point nominal latitude", + "Layer pseudo-depth, -z*", ], variable_standard_name=[ "", "", "", + "", "sea_water_potential_temperature", "", + "", + "", + "", + "", ], variable_cell_methods=[ "", "", "", + "", "area:mean zl:mean yh:mean xh:mean time: mean", "", + "", + "", + "", + "", ], variable_units=[ "days", "days since 0001-01-01 00:00:00", "days since 0001-01-01 00:00:00", + "", "degC", "days since 0001-01-01 00:00:00", - ], - coord=["nv", "time", "xh", "yh", "zl"], - coord_long_name=[ - "vertex number", - "time", - "h point nominal longitude", - "h point nominal latitude", - "Layer pseudo-depth, -z*", - ], - coord_cartesian_axes=["", "", "", "", ""], - coord_calendar_types=["", "NOLEAP", "", "", ""], - coord_bounds=["", "time_bnds", "", "", ""], - coord_units=[ - "", "days since 0001-01-01 00:00:00", "degrees_east", "degrees_north", @@ -859,42 +878,58 @@ def test_parse_access_filename(builder, filename, expected): frequency="1day", start_date="1900-01-01, 00:00:00", end_date="1900-01-02, 00:00:00", - variable=["average_DT", "average_T1", "average_T2", "time_bnds", "tos"], + variable=[ + "average_DT", + "average_T1", + "average_T2", + "nv", + "time", + "time_bnds", + "tos", + "xh", + "yh", + ], variable_long_name=[ "Length of average period", "Start time for average period", "End time for average period", + "vertex number", + "time", "time axis boundaries", "Sea Surface Temperature", + "h point nominal longitude", + "h point nominal latitude", + ], + variable_standard_name=[ + "", + "", + "", + "", + "", + "", + "sea_surface_temperature", + "", + "", ], - variable_standard_name=["", "", "", "", "sea_surface_temperature"], variable_cell_methods=[ "", "", "", "", + "", + "", "area:mean yh:mean xh:mean time: mean", + "", + 
"", ], variable_units=[ "days", "days since 0001-01-01 00:00:00", "days since 0001-01-01 00:00:00", - "days since 0001-01-01 00:00:00", - "degC", - ], - coord=["nv", "time", "xh", "yh"], - coord_long_name=[ - "vertex number", - "time", - "h point nominal longitude", - "h point nominal latitude", - ], - coord_cartesian_axes=["", "", "", ""], - coord_calendar_types=["", "NOLEAP", "", ""], - coord_bounds=["", "time_bnds", "", ""], - coord_units=[ "", "days since 0001-01-01 00:00:00", + "days since 0001-01-01 00:00:00", + "degC", "degrees_east", "degrees_north", ], @@ -911,23 +946,21 @@ def test_parse_access_filename(builder, filename, expected): frequency="fx", start_date="none", end_date="none", - variable=["geolat", "geolon"], + variable=["geolat", "geolon", "xh", "yh"], variable_long_name=[ "Latitude of tracer (T) points", "Longitude of tracer (T) points", - ], - variable_standard_name=["", ""], - variable_cell_methods=["time: point", "time: point"], - variable_units=["degrees_north", "degrees_east"], - coord=["xh", "yh"], - coord_long_name=[ "h point nominal longitude", "h point nominal latitude", ], - coord_cartesian_axes=["", ""], - coord_calendar_types=["", ""], - coord_bounds=["", ""], - coord_units=["degrees_east", "degrees_north"], + variable_standard_name=["", "", "", ""], + variable_cell_methods=["time: point", "time: point", "", ""], + variable_units=[ + "degrees_north", + "degrees_east", + "degrees_east", + "degrees_north", + ], ), ), ( @@ -945,50 +978,57 @@ def test_parse_access_filename(builder, filename, expected): "average_DT", "average_T1", "average_T2", + "nv", "thetao", + "time", "time_bnds", + "xh", + "yh", + "z_l", ], variable_long_name=[ "Length of average period", "Start time for average period", "End time for average period", + "vertex number", "Sea Water Potential Temperature", + "time", "time axis boundaries", + "h point nominal longitude", + "h point nominal latitude", + "Depth at cell center", ], variable_standard_name=[ "", "", "", + "", 
"sea_water_potential_temperature", "", + "", + "", + "", + "", ], variable_cell_methods=[ "", "", "", + "", "area:mean z_l:mean yh:mean xh:mean time: mean", "", + "", + "", + "", + "", ], variable_units=[ "days", "days since 0001-01-01 00:00:00", "days since 0001-01-01 00:00:00", + "", "degC", "days since 0001-01-01 00:00:00", - ], - coord=["nv", "time", "xh", "yh", "z_l"], - coord_long_name=[ - "vertex number", - "time", - "h point nominal longitude", - "h point nominal latitude", - "Depth at cell center", - ], - coord_cartesian_axes=["", "", "", "", ""], - coord_calendar_types=["", "NOLEAP", "", "", ""], - coord_bounds=["", "time_bnds", "", "", ""], - coord_units=[ - "", "days since 0001-01-01 00:00:00", "degrees_east", "degrees_north", @@ -1007,29 +1047,25 @@ def test_parse_access_filename(builder, filename, expected): frequency="1day", start_date="1900-01-01, 00:00:00", end_date="1900-01-02, 00:00:00", - variable=["TLAT", "TLON", "aice", "tarea", "time_bounds"], + variable=["TLAT", "TLON", "aice", "tarea", "time", "time_bounds"], variable_long_name=[ "T grid center latitude", "T grid center longitude", "ice area (aggregate)", "area of T grid cells", + "time", "time interval endpoints", ], - variable_standard_name=["", "", "", "", ""], - variable_cell_methods=["", "", "time: mean", "", ""], + variable_standard_name=["", "", "", "", "", ""], + variable_cell_methods=["", "", "time: mean", "", "", ""], variable_units=[ "degrees_north", "degrees_east", "1", "m^2", "days since 0000-01-01 00:00:00", + "days since 0000-01-01 00:00:00", ], - coord=["time"], - coord_long_name=["time"], - coord_cartesian_axes=[""], - coord_calendar_types=[""], - coord_bounds=["time_bounds"], - coord_units=["days since 0000-01-01 00:00:00"], ), ), ( @@ -1048,12 +1084,6 @@ def test_parse_access_filename(builder, filename, expected): variable_standard_name=["", ""], variable_cell_methods=["", ""], variable_units=["m2 s", "unitless"], - coord=[], - coord_long_name=[], - 
coord_cartesian_axes=[], - coord_calendar_types=[], - coord_bounds=[], - coord_units=[], ), ), ], From 396df8e8b0edaadfe742baeda6a1f1fc926ed8b2 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Thu, 10 Oct 2024 07:54:56 +0800 Subject: [PATCH 15/23] Cleaned up _cmip_realm_translator & added sorting to test (order unimportant, causing unnecessary test failures) --- src/access_nri_intake/catalog/translators.py | 6 +++--- tests/test_translators.py | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/access_nri_intake/catalog/translators.py b/src/access_nri_intake/catalog/translators.py index f237c165..21d34ec5 100644 --- a/src/access_nri_intake/catalog/translators.py +++ b/src/access_nri_intake/catalog/translators.py @@ -18,6 +18,7 @@ class TranslatorError(Exception): "Generic Exception for the Translator classes" + pass @@ -311,11 +312,10 @@ def _translate(string: str) -> tuple[str, ...]: } raw_realms = string.split(" ") - realms = [] + realms = set() for realm in raw_realms: realm = translations.get(realm, realm) - if realm not in realms: - realms.append(realm) + realms |= {realm} return tuple(realms) return series.apply(lambda string: _translate(string)) diff --git a/tests/test_translators.py b/tests/test_translators.py index 8b2b174f..baa4d73f 100644 --- a/tests/test_translators.py +++ b/tests/test_translators.py @@ -120,6 +120,9 @@ def test_cmip_realm_translator(input, expected): """Test translation of entries in the CMIP realm column""" series = pd.Series(input) translated = _cmip_realm_translator(series) + # Sort expected & translated to make the test less brittle + translated = translated.apply(lambda x: tuple(sorted(x))) + expected = [tuple(sorted(x)) for x in expected] assert list(translated) == expected From babc3da4a1a5e5aaf35443c95cf8af5826f3afeb Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Thu, 10 Oct 2024 14:48:58 +0800 Subject: [PATCH 16/23] Added _DispatchKeys dataclass to hold tagnames --- 
src/access_nri_intake/catalog/translators.py | 57 +++++++++++++++----- 1 file changed, 45 insertions(+), 12 deletions(-) diff --git a/src/access_nri_intake/catalog/translators.py b/src/access_nri_intake/catalog/translators.py index 3f01b9cf..3f40d72b 100644 --- a/src/access_nri_intake/catalog/translators.py +++ b/src/access_nri_intake/catalog/translators.py @@ -6,6 +6,7 @@ like the ACCESS-NRI catalog """ +from dataclasses import dataclass from functools import partial from typing import Callable, Optional @@ -67,6 +68,7 @@ def __init__(self, source: DataSource, columns: list[str]): column: partial(self._default_translator, column=column) for column in columns } + self._dispatch_keys = _DispatchKeys() def _default_translator(self, column: str) -> pd.Series: """ @@ -185,24 +187,31 @@ def __init__(self, source: DataSource, columns: list[str]): self._dispatch["frequency"] = self._frequency_translator self._dispatch["variable"] = self._variable_translator + self._dispatch_keys = _DispatchKeys( + model="source_id", + realm="realm", + frequency="frequency", + variable="variable_id", + ) + def _model_translator(self): """ Return model from source_id """ - return _to_tuple(self.source.df["source_id"]) + return _to_tuple(self.source.df[self._dispatch_keys.model]) def _realm_translator(self): """ Return realm, fixing a few issues """ - return _cmip_realm_translator(self.source.df["realm"]) + return _cmip_realm_translator(self.source.df[self._dispatch_keys.realm]) def _frequency_translator(self): """ Return frequency, fixing a few issues """ return _to_tuple( - self.source.df["frequency"].apply( + self.source.df[self._dispatch_keys.frequency].apply( lambda x: frequency_translations.get(x, x) ) ) @@ -211,7 +220,7 @@ def _variable_translator(self): """ Return variable as a tuple """ - return _to_tuple(self.source.df["variable_id"]) + return _to_tuple(self.source.df[self._dispatch_keys.variable]) class Cmip5Translator(DefaultTranslator): @@ -237,24 +246,31 @@ def __init__(self, 
source: DataSource, columns: list[str]): self._dispatch["frequency"] = self._frequency_translator self._dispatch["variable"] = self._variable_translator + self._dispatch_keys = _DispatchKeys( + model="model", + realm="realm", + frequency="frequency", + variable="variable", + ) + def _model_translator(self): """ Return variable as a tuple """ - return _to_tuple(self.source.df["model"]) + return _to_tuple(self.source.df[self._dispatch_keys.model]) def _realm_translator(self): """ Return realm, fixing a few issues """ - return _cmip_realm_translator(self.source.df["realm"]) + return _cmip_realm_translator(self.source.df[self._dispatch_keys.realm]) def _frequency_translator(self): """ Return frequency, fixing a few issues """ return _to_tuple( - self.source.df["frequency"].apply( + self.source.df[self._dispatch_keys.frequency].apply( lambda x: frequency_translations.get(x, x) ) ) @@ -263,7 +279,7 @@ def _variable_translator(self): """ Return variable as a tuple """ - return _to_tuple(self.source.df["variable"]) + return _to_tuple(self.source.df[self._dispatch_keys.variable]) class EraiTranslator(DefaultTranslator): @@ -285,12 +301,13 @@ def __init__(self, source: DataSource, columns: list[str]): super().__init__(source, columns) self._dispatch["variable"] = self._variable_translator + self._dispatch_keys = _DispatchKeys(variable="variable") def _variable_translator(self): """ Return variable as a tuple """ - return _to_tuple(self.source.df["variable"]) + return _to_tuple(self.source.df[self._dispatch_keys.variable]) class BarpaTranslator(DefaultTranslator): @@ -315,12 +332,18 @@ def __init__(self, source, columns): self._dispatch["realm"] = self._realm_translator self._dispatch["frequency"] = self._frequency_translator self._dispatch["variable"] = self._variable_translator + self._dispatch_keys = _DispatchKeys( + model="source_id", + realm="realm", + variable="variable_id", + frequency="freq", + ) def _model_translator(self): """ Return model from source_id """ - 
return _to_tuple(self.source.df["source_id"]) + return _to_tuple(self.source.df[self._dispatch_keys.model]) def _realm_translator(self): """ @@ -333,14 +356,24 @@ def _frequency_translator(self): Return frequency, fixing a few issues """ return _to_tuple( - self.source.df["freq"].apply(lambda x: frequency_translations.get(x, x)) + self.source.df[self._dispatch_keys.frequency].apply( + lambda x: frequency_translations.get(x, x) + ) ) def _variable_translator(self): """ Return variable as a tuple """ - return _to_tuple(self.source.df["variable_id"]) + return _to_tuple(self.source.df[self._dispatch_keys.variable]) + + +@dataclass +class _DispatchKeys: + model: Optional[str] = None + realm: Optional[str] = None + frequency: Optional[str] = None + variable: Optional[str] = None def _cmip_realm_translator(series) -> pd.Series: From 0e2d6698c130bebc6f451ece669cde8358a13210 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Thu, 10 Oct 2024 15:31:51 +0800 Subject: [PATCH 17/23] Refactored a bunch of the translations out to the DefaultTranslator --- src/access_nri_intake/catalog/translators.py | 161 ++++++++----------- tests/test_translators.py | 4 +- 2 files changed, 67 insertions(+), 98 deletions(-) diff --git a/src/access_nri_intake/catalog/translators.py b/src/access_nri_intake/catalog/translators.py index 3f40d72b..8027f906 100644 --- a/src/access_nri_intake/catalog/translators.py +++ b/src/access_nri_intake/catalog/translators.py @@ -16,7 +16,7 @@ from . 
import COLUMNS_WITH_ITERABLES -frequency_translations = { +FREQUENCY_TRANSLATIONS = { "3hrPt": "3hr", "6hrPt": "6hr", "daily": "1day", @@ -32,6 +32,36 @@ } +def _to_tuple(series: pd.Series) -> pd.Series: + """ + Make each entry in the provided series a tuple + + Parameters + ---------- + series: :py:class:`~pandas.Series` + A pandas Series or another object with an `apply` method + """ + return series.apply(lambda x: (x,)) + + +def tuplify_series(func: Callable) -> Callable: + """ + Decorator that wraps a function that returns a pandas Series and converts + each entry in the series to a tuple + """ + + def wrapper(*args, **kwargs): + # Check if the first argument is 'self' + if len(args) > 0 and hasattr(args[0], "__class__"): + self = args[0] + series = func(self, *args[1:], **kwargs) + else: + series = func(*args, **kwargs) + return _to_tuple(series) + + return wrapper + + class TranslatorError(Exception): "Generic Exception for the Translator classes" @@ -163,6 +193,35 @@ def _unique_values(series): return df[self.columns] # Preserve ordering + def _realm_translator(self) -> pd.Series: + """ + Return realm, fixing a few issues + """ + return _cmip_realm_translator(self.source.df[self._dispatch_keys.realm]) + + @tuplify_series + def _model_translator(self) -> pd.Series: + """ + Return model from dispatch_keys.model + """ + return self.source.df[self._dispatch_keys.model] + + @tuplify_series + def _frequency_translator(self) -> pd.Series: + """ + Return frequency, fixing a few issues + """ + return self.source.df[self._dispatch_keys.frequency].apply( + lambda x: FREQUENCY_TRANSLATIONS.get(x, x) + ) + + @tuplify_series + def _variable_translator(self) -> pd.Series: + """ + Return variable as a tuple + """ + return self.source.df[self._dispatch_keys.variable] + class Cmip6Translator(DefaultTranslator): """ @@ -194,34 +253,6 @@ def __init__(self, source: DataSource, columns: list[str]): variable="variable_id", ) - def _model_translator(self): - """ - Return model from 
source_id - """ - return _to_tuple(self.source.df[self._dispatch_keys.model]) - - def _realm_translator(self): - """ - Return realm, fixing a few issues - """ - return _cmip_realm_translator(self.source.df[self._dispatch_keys.realm]) - - def _frequency_translator(self): - """ - Return frequency, fixing a few issues - """ - return _to_tuple( - self.source.df[self._dispatch_keys.frequency].apply( - lambda x: frequency_translations.get(x, x) - ) - ) - - def _variable_translator(self): - """ - Return variable as a tuple - """ - return _to_tuple(self.source.df[self._dispatch_keys.variable]) - class Cmip5Translator(DefaultTranslator): """ @@ -253,34 +284,6 @@ def __init__(self, source: DataSource, columns: list[str]): variable="variable", ) - def _model_translator(self): - """ - Return variable as a tuple - """ - return _to_tuple(self.source.df[self._dispatch_keys.model]) - - def _realm_translator(self): - """ - Return realm, fixing a few issues - """ - return _cmip_realm_translator(self.source.df[self._dispatch_keys.realm]) - - def _frequency_translator(self): - """ - Return frequency, fixing a few issues - """ - return _to_tuple( - self.source.df[self._dispatch_keys.frequency].apply( - lambda x: frequency_translations.get(x, x) - ) - ) - - def _variable_translator(self): - """ - Return variable as a tuple - """ - return _to_tuple(self.source.df[self._dispatch_keys.variable]) - class EraiTranslator(DefaultTranslator): """ @@ -303,11 +306,11 @@ def __init__(self, source: DataSource, columns: list[str]): self._dispatch["variable"] = self._variable_translator self._dispatch_keys = _DispatchKeys(variable="variable") - def _variable_translator(self): - """ - Return variable as a tuple - """ - return _to_tuple(self.source.df[self._dispatch_keys.variable]) + def _realm_translator(self) -> pd.Series: + raise AttributeError("ERAI data does not have a realm column") + + def _frequency_translator(self) -> pd.Series: + raise AttributeError("ERAI data does not have a frequency 
column") class BarpaTranslator(DefaultTranslator): @@ -339,34 +342,12 @@ def __init__(self, source, columns): frequency="freq", ) - def _model_translator(self): - """ - Return model from source_id - """ - return _to_tuple(self.source.df[self._dispatch_keys.model]) - def _realm_translator(self): """ Return realm, fixing a few issues """ return self.source.df.apply(lambda x: ("none",), 1) - def _frequency_translator(self): - """ - Return frequency, fixing a few issues - """ - return _to_tuple( - self.source.df[self._dispatch_keys.frequency].apply( - lambda x: frequency_translations.get(x, x) - ) - ) - - def _variable_translator(self): - """ - Return variable as a tuple - """ - return _to_tuple(self.source.df[self._dispatch_keys.variable]) - @dataclass class _DispatchKeys: @@ -400,15 +381,3 @@ def _translate(string: str) -> tuple[str, ...]: return tuple(realms) return series.apply(lambda string: _translate(string)) - - -def _to_tuple(series: pd.Series) -> pd.Series: - """ - Make each entry in the provided series a tuple - - Parameters - ---------- - series: :py:class:`~pandas.Series` - A pandas Series or another object with an `apply` method - """ - return series.apply(lambda x: (x,)) diff --git a/tests/test_translators.py b/tests/test_translators.py index deea65da..825e7a5f 100644 --- a/tests/test_translators.py +++ b/tests/test_translators.py @@ -7,6 +7,7 @@ from access_nri_intake.catalog import CORE_COLUMNS, TRANSLATOR_GROUPBY_COLUMNS from access_nri_intake.catalog.translators import ( + FREQUENCY_TRANSLATIONS, BarpaTranslator, Cmip5Translator, Cmip6Translator, @@ -15,7 +16,6 @@ TranslatorError, _cmip_realm_translator, _to_tuple, - frequency_translations, ) @@ -68,7 +68,7 @@ def test_cmip_frequency_translator(input, expected): """Test translation of entries in the CMIP frequency column""" series = pd.Series(input) - translated = series.apply(lambda x: frequency_translations.get(x, x)) + translated = series.apply(lambda x: FREQUENCY_TRANSLATIONS.get(x, x)) assert 
list(translated) == expected From 28a8a460183178b841a11205de280692139e2aaa Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Fri, 11 Oct 2024 09:29:46 +0800 Subject: [PATCH 18/23] Updated translators.py to include CordexTranslator, added config/cordex.yaml and config/metadata_sources/cordex-ig45/metadata.yaml --- config/cordex.yaml | 9 ++ .../cordex-ig45/metadata.yaml | 127 ++++++++++++++++++ src/access_nri_intake/catalog/translators.py | 47 ++++++- 3 files changed, 181 insertions(+), 2 deletions(-) create mode 100644 config/cordex.yaml create mode 100644 config/metadata_sources/cordex-ig45/metadata.yaml diff --git a/config/cordex.yaml b/config/cordex.yaml new file mode 100644 index 00000000..da5338f4 --- /dev/null +++ b/config/cordex.yaml @@ -0,0 +1,9 @@ +builder: null + +translator: CordexTranslator + +sources: + + - metadata_yaml: /g/data/xp65/admin/access-nri-intake-catalog/config/metadata_sources/cordex-ig45/metadata.yaml + path: + - /g/data/ig45/catalog/v2/esm/catalog.json diff --git a/config/metadata_sources/cordex-ig45/metadata.yaml b/config/metadata_sources/cordex-ig45/metadata.yaml new file mode 100644 index 00000000..c118c402 --- /dev/null +++ b/config/metadata_sources/cordex-ig45/metadata.yaml @@ -0,0 +1,127 @@ +name: cmip6_ig45 +experiment_uuid: c7021d1e-7ba2-11ef-beb5-000007d3fe80 +description: 20km regional projections for CORDEX-CMIP6 from the Queensland Future Climate Science Program +long_description: >- + This dataset includes projections at 20km and 10km, formatted to meet the CORDEX-CMIP6 data standards. + The 20km projections were derived from the 10km projections. 
+model: +- CMIP6 +frequency: +- day +- mon +- 1hr +- fx +variable: +- clt +- tauv +- clh +- clwvi +- ua850 +- sund +- ua100m +- va250 +- uas +- prc +- vas +- mrfso +- rlds +- ta200 +- hus1000 +- hus600 +- prw +- hus850 +- va200 +- tas +- clivi +- zg200 +- rsut +- va600 +- rsdt +- tasmax +- sfcWindmax +- va850 +- mrso +- ps +- hus400 +- ta1000 +- ua250 +- tauu +- pr +- va925 +- snc +- hus200 +- clm +- zg500 +- hurs +- rlut +- hus300 +- rsds +- ua200 +- psl +- ta850 +- va400 +- zg400 +- snm +- ta925 +- prsn +- hus250 +- zg1000 +- ta600 +- zg925 +- huss +- ta500 +- va1000 +- zg700 +- zmla +- hfss +- zg850 +- ua925 +- zg600 +- ua300 +- rsus +- hus500 +- sfcWind +- ts +- va500 +- va100m +- ua500 +- ua700 +- va700 +- soilt +- snd +- ua1000 +- ta700 +- hfls +- tasmin +- zg250 +- cll +- hus700 +- rlus +- va300 +- ua600 +- hus925 +- ta250 +- ua400 +- prhmax +- sftlf +- ta400 +- ta300 +- snw +- zg300 +- orog +- sftlaf +nominal_resolution: +- 20km +- 10km +version: +contact: NCI +email: help@nci.org.au +reference: +license: +url: https://geonetwork.nci.org.au/geonetwork/srv/eng/catalog.search#/metadata/f7465_8388_5100_7022 +parent_experiment: +related_experiments: +- +notes: +keywords: +- cmip \ No newline at end of file diff --git a/src/access_nri_intake/catalog/translators.py b/src/access_nri_intake/catalog/translators.py index 8027f906..51398ca0 100644 --- a/src/access_nri_intake/catalog/translators.py +++ b/src/access_nri_intake/catalog/translators.py @@ -307,10 +307,14 @@ def __init__(self, source: DataSource, columns: list[str]): self._dispatch_keys = _DispatchKeys(variable="variable") def _realm_translator(self) -> pd.Series: - raise AttributeError("ERAI data does not have a realm column") + raise AttributeError( + f"{self.__class__.__name__}: data does not have a realm column" + ) def _frequency_translator(self) -> pd.Series: - raise AttributeError("ERAI data does not have a frequency column") + raise AttributeError( + f"{self.__class__.__name__}: data does not have a 
frequency column" + ) class BarpaTranslator(DefaultTranslator): @@ -349,8 +353,47 @@ def _realm_translator(self): return self.source.df.apply(lambda x: ("none",), 1) +class CordexTranslator(DefaultTranslator): + """ + Cordex Translator for translating metadata from the NCI CORDEX intake datastores. + """ + + def __init__(self, source, columns): + """ + Initialise a CordexTranslator + + Parameters + ---------- + source: :py:class:`~intake.DataSource` + The NCI CORDEX intake-esm datastore + columns: list of str + The columns to translate to (these are the core columns in the intake-dataframe-catalog) + """ + + super().__init__(source, columns) + self._dispatch["model"] = self._model_translator + self._dispatch["realm"] = self._realm_translator + self._dispatch["frequency"] = self._frequency_translator + self._dispatch["variable"] = self._variable_translator + + self._dispatch_keys = _DispatchKeys( + model="source_id", + frequency="frequency", + variable="variable_id", + ) + + def _realm_translator(self) -> pd.Series: + raise AttributeError( + f"{self.__class__.__name__}: data does not have a realm column" + ) + + @dataclass class _DispatchKeys: + """ + Data class to store the keys for the dispatch dictionary in the Translator classes + """ + model: Optional[str] = None realm: Optional[str] = None frequency: Optional[str] = None From 8dc5c2a5d39514b0b35ba977b52ccea32ba7fa8f Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Fri, 11 Oct 2024 11:29:50 +0800 Subject: [PATCH 19/23] Working cordex translator --- src/access_nri_intake/catalog/translators.py | 16 +++-- tests/data/esm_datastore/cordex-ig45.csv | 6 ++ tests/data/esm_datastore/cordex-ig45.json | 70 ++++++++++++++++++++ tests/test_translators.py | 18 +++++ 4 files changed, 103 insertions(+), 7 deletions(-) create mode 100644 tests/data/esm_datastore/cordex-ig45.csv create mode 100644 tests/data/esm_datastore/cordex-ig45.json diff --git a/src/access_nri_intake/catalog/translators.py 
b/src/access_nri_intake/catalog/translators.py index 51398ca0..32749411 100644 --- a/src/access_nri_intake/catalog/translators.py +++ b/src/access_nri_intake/catalog/translators.py @@ -308,12 +308,12 @@ def __init__(self, source: DataSource, columns: list[str]): def _realm_translator(self) -> pd.Series: raise AttributeError( - f"{self.__class__.__name__}: data does not have a realm column" + f"{self.__class__.__name__}: 'realm' does not require translation" ) def _frequency_translator(self) -> pd.Series: raise AttributeError( - f"{self.__class__.__name__}: data does not have a frequency column" + f"{self.__class__.__name__}: 'data' does not require translation" ) @@ -372,20 +372,22 @@ def __init__(self, source, columns): super().__init__(source, columns) self._dispatch["model"] = self._model_translator - self._dispatch["realm"] = self._realm_translator self._dispatch["frequency"] = self._frequency_translator self._dispatch["variable"] = self._variable_translator + self._dispatch["realm"] = self._variable_translator self._dispatch_keys = _DispatchKeys( model="source_id", frequency="frequency", variable="variable_id", + realm="realm", ) - def _realm_translator(self) -> pd.Series: - raise AttributeError( - f"{self.__class__.__name__}: data does not have a realm column" - ) + def _realm_translator(self): + """ + Return realm, fixing a few issues + """ + return self.source.df.apply(lambda x: ("none",), 1) @dataclass diff --git a/tests/data/esm_datastore/cordex-ig45.csv b/tests/data/esm_datastore/cordex-ig45.csv new file mode 100644 index 00000000..06f6a2fd --- /dev/null +++ b/tests/data/esm_datastore/cordex-ig45.csv @@ -0,0 +1,6 @@ +path,file_type,project_id,resolution,institution_id,source_id,experiment_id,member_id,frequency,variable_id,version,time_range 
+/g/data/ig45/QldFCP-2/output/CMIP6/DD/AUS-10i/UQ-DEC/ACCESS-CM2/ssp126/r2i1p1f1/CCAMoc-v2112/v1-r1/day/hus200/v20240709/hus200_AUS-10i_ACCESS-CM2_ssp126_r2i1p1f1_UQ-DEC_CCAMoc-v2112_v1-r1_day_20580101-20581231.nc,f,output,AUS-10i,UQ-DEC,ACCESS-CM2,ssp126,r2i1p1f1,day,hus200,v20240709,20580101-20581231 +/g/data/ig45/QldFCP-2/CORDEX/CMIP6/DD/AUS-20i/UQ-DEC/ACCESS-ESM1-5/ssp126/r20i1p1f1/CCAMoc-v2112/v1-r1/mon/va925/v20240722/va925_AUS-20i_ACCESS-ESM1-5_ssp126_r20i1p1f1_UQ-DEC_CCAMoc-v2112_v1-r1_mon_208101-209012.nc,f,CORDEX,AUS-20i,UQ-DEC,ACCESS-ESM1-5,ssp126,r20i1p1f1,mon,va925,v20240722,208101-209012 +/g/data/ig45/QldFCP-2/CORDEX/CMIP6/DD/AUS-20i/UQ-DEC/ACCESS-ESM1-5/ssp370/r6i1p1f1/CCAM-v2105/v1-r1/mon/clh/v20240722/clh_AUS-20i_ACCESS-ESM1-5_ssp370_r6i1p1f1_UQ-DEC_CCAM-v2105_v1-r1_mon_201501-202012.nc,f,CORDEX,AUS-20i,UQ-DEC,ACCESS-ESM1-5,ssp370,r6i1p1f1,mon,clh,v20240722,201501-202012 +/g/data/ig45/QldFCP-2/output/CMIP6/DD/AUS-10i/UQ-DEC/ACCESS-CM2/ssp126/r2i1p1f1/CCAMoc-v2112/v1-r1/day/ta850/v20240709/ta850_AUS-10i_ACCESS-CM2_ssp126_r2i1p1f1_UQ-DEC_CCAMoc-v2112_v1-r1_day_20340101-20341231.nc,f,output,AUS-10i,UQ-DEC,ACCESS-CM2,ssp126,r2i1p1f1,day,ta850,v20240709,20340101-20341231 +/g/data/ig45/QldFCP-2/CORDEX/CMIP6/DD/AUS-20i/UQ-DEC/NorESM2-MM/ssp126/r1i1p1f1/CCAMoc-v2112/v1-r1/mon/hus200/v20240722/hus200_AUS-20i_NorESM2-MM_ssp126_r1i1p1f1_UQ-DEC_CCAMoc-v2112_v1-r1_mon_201501-202012.nc,f,CORDEX,AUS-20i,UQ-DEC,NorESM2-MM,ssp126,r1i1p1f1,mon,hus200,v20240722,201501-202012 diff --git a/tests/data/esm_datastore/cordex-ig45.json b/tests/data/esm_datastore/cordex-ig45.json new file mode 100644 index 00000000..5fc783b9 --- /dev/null +++ b/tests/data/esm_datastore/cordex-ig45.json @@ -0,0 +1,70 @@ +{ + "id": "qldfcp-2-ig45", + "title": "qldfcp-2-ig45", + "description": "Datasets on Gadi, both publised and replicated. 
All file versions present are in the listing\nMaintained By: NCI\nContact: help@nci.org.au", + "assets": { + "column_name": "path", + "format": "netcdf" + }, + "aggregation_control": { + "variable_column_name": "variable_id", + "groupby_attrs": [ + "file_type", + "project_id", + "resolution", + "institution_id", + "source_id", + "experiment_id", + "member_id", + "frequency", + "variable_id", + "version" + ], + "aggregations": [ + { + "type": "join_existing", + "attribute_name": "time_range", + "options": { + "dim": "time" + } + } + ] + }, + "esmcat_version": "0.1.0", + "catalog_file": "cordex-ig45.csv", + "attributes": [ + { + "column_name": "file_type" + }, + { + "column_name": "project_id" + }, + { + "column_name": "resolution" + }, + { + "column_name": "institution_id" + }, + { + "column_name": "source_id" + }, + { + "column_name": "experiment_id" + }, + { + "column_name": "member_id" + }, + { + "column_name": "frequency" + }, + { + "column_name": "variable_id" + }, + { + "column_name": "version" + }, + { + "column_name": "time_range" + } + ] +} \ No newline at end of file diff --git a/tests/test_translators.py b/tests/test_translators.py index 825e7a5f..bf3d6708 100644 --- a/tests/test_translators.py +++ b/tests/test_translators.py @@ -11,6 +11,7 @@ BarpaTranslator, Cmip5Translator, Cmip6Translator, + CordexTranslator, DefaultTranslator, EraiTranslator, TranslatorError, @@ -263,3 +264,20 @@ def test_BarpaTranslator(test_data, groupby, n_entries): esmds.description = "description" df = BarpaTranslator(esmds, CORE_COLUMNS).translate(groupby) assert len(df) == n_entries + + +@pytest.mark.parametrize( + "groupby, n_entries", + [ + (None, 5), + (["variable"], 4), + (["frequency"], 2), + ], +) +def test_CordexTranslator(test_data, groupby, n_entries): + """Test CORDEX datastore translator""" + esmds = intake.open_esm_datastore(test_data / "esm_datastore/cordex-ig45.json") + esmds.name = "name" + esmds.description = "description" + df = CordexTranslator(esmds, 
CORE_COLUMNS).translate(groupby) + assert len(df) == n_entries From b3da5772a245b7eb2ec1086b3c77905727e646ec Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Mon, 14 Oct 2024 11:20:24 +0800 Subject: [PATCH 20/23] Revert "Merge branch '660-coordinate-variables' into 199-data-request-20km-regional-projections-for-cordex-cmip6-queensland-future-climate-science" This reverts commit 27e460f7d5b632a7ca7acb14cee8aae9ec942791, reversing changes made to 0c96e28f347c553afdb22c335f89ab5fd6f19b0b. --- .gitignore | 3 - .pre-commit-config.yaml | 7 - src/access_nri_intake/catalog/manager.py | 25 +- src/access_nri_intake/catalog/translators.py | 24 +- src/access_nri_intake/source/builders.py | 253 +++--- src/access_nri_intake/source/utils.py | 108 +-- src/access_nri_intake/utils.py | 13 +- tests/test_builders.py | 774 ++++++------------- 8 files changed, 418 insertions(+), 789 deletions(-) diff --git a/.gitignore b/.gitignore index b18b515f..da4e917f 100644 --- a/.gitignore +++ b/.gitignore @@ -131,6 +131,3 @@ dmypy.json sandpit.ipynb *.DS_Store bin/build_all.sh.o* - -# Vs Code -.vscode/ \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2843edd9..13106eeb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,10 +10,3 @@ repos: hooks: - id: black language_version: python3 -# Mypy - - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v1.11.2' - hooks: - - id: mypy - name: mypy - additional_dependencies: [types-PyYAML==6.0.12.20240808] diff --git a/src/access_nri_intake/catalog/manager.py b/src/access_nri_intake/catalog/manager.py index 37b6cf01..f3d03243 100644 --- a/src/access_nri_intake/catalog/manager.py +++ b/src/access_nri_intake/catalog/manager.py @@ -4,7 +4,6 @@ """ Manager for adding/updating intake sources in an intake-dataframe-catalog like the ACCESS-NRI catalog """ import os -from typing import Optional, Union import intake from intake_dataframe_catalog.core import DfFileCatalog @@ -31,7 +30,7 
@@ class CatalogManager: Add/update intake sources in an intake-dataframe-catalog like the ACCESS-NRI catalog """ - def __init__(self, path: str): + def __init__(self, path): """ Initialise a CatalogManager instance to add/update intake sources in a intake-dataframe-catalog like the ACCESS-NRI catalog @@ -59,14 +58,14 @@ def __init__(self, path: str): def build_esm( self, - name: str, - description: str, + name, + description, builder, - path: Union[str, list[str]], + path, translator=DefaultTranslator, - metadata: Optional[dict] = None, - directory: Optional[str] = None, - overwrite: bool = False, + metadata=None, + directory=None, + overwrite=False, **kwargs, ): """ @@ -125,12 +124,12 @@ def build_esm( def load( self, - name: str, - description: str, - path: str, - driver: str = "esm_datastore", + name, + description, + path, + driver="esm_datastore", translator=DefaultTranslator, - metadata: Optional[dict] = None, + metadata=None, **kwargs, ): """ diff --git a/src/access_nri_intake/catalog/translators.py b/src/access_nri_intake/catalog/translators.py index 59cf678a..c048d0a7 100644 --- a/src/access_nri_intake/catalog/translators.py +++ b/src/access_nri_intake/catalog/translators.py @@ -12,7 +12,6 @@ import pandas as pd import tlz -from intake import DataSource from . import COLUMNS_WITH_ITERABLES @@ -74,7 +73,7 @@ class DefaultTranslator: of metadata for use in an intake-dataframe-catalog. """ - def __init__(self, source: DataSource, columns: list[str]): + def __init__(self, source, columns): """ Initialise a DefaultTranslator. 
This Translator works as follows: @@ -94,13 +93,13 @@ def __init__(self, source: DataSource, columns: list[str]): self.source = source self.columns = columns - self._dispatch: dict[str, Callable[[], pd.Series]] = { + self._dispatch = { column: partial(self._default_translator, column=column) for column in columns } self._dispatch_keys = _DispatchKeys() - def _default_translator(self, column: str) -> pd.Series: + def _default_translator(self, column): """ Try to translate a column from a source using the default translator. This translator works as follows: - If the input source is an intake-esm datastore, the translator will first look for the column in the @@ -146,7 +145,7 @@ def _default_translator(self, column: str) -> pd.Series: return pd.Series([val] * len_df) - def translate(self, groupby: Optional[list[str]] = None) -> pd.DataFrame: + def translate(self, groupby=None): """ Return the translated :py:class:`~pandas.DataFrame` of metadata and merge into set of set of rows with unique values of the columns specified. @@ -228,7 +227,7 @@ class Cmip6Translator(DefaultTranslator): CMIP6 Translator for translating metadata from the NCI CMIP6 intake datastores. """ - def __init__(self, source: DataSource, columns: list[str]): + def __init__(self, source, columns): """ Initialise a Cmip6Translator @@ -259,7 +258,7 @@ class Cmip5Translator(DefaultTranslator): CMIP5 Translator for translating metadata from the NCI CMIP5 intake datastores. """ - def __init__(self, source: DataSource, columns: list[str]): + def __init__(self, source, columns): """ Initialise a Cmip5Translator @@ -290,7 +289,7 @@ class EraiTranslator(DefaultTranslator): ERAI Translator for translating metadata from the NCI ERA-Interim intake datastore. 
""" - def __init__(self, source: DataSource, columns: list[str]): + def __init__(self, source, columns): """ Initialise a EraiTranslator @@ -402,14 +401,13 @@ class _DispatchKeys: variable: Optional[str] = None -def _cmip_realm_translator(series) -> pd.Series: +def _cmip_realm_translator(series): """ - Return realm from CMIP realm metadata, fixing some issues. This function takes - a series of strings and returns a series of tuples as there are sometimes multiple - realms per cmip asset + Return realm from CMIP realm metadata, fixing some issues. This function returns + a tuple as there are sometimes multiple realms per cmip asset """ - def _translate(string: str) -> tuple[str, ...]: + def _translate(string): translations = { "na": "none", "landonly": "land", diff --git a/src/access_nri_intake/source/builders.py b/src/access_nri_intake/source/builders.py index a05e8fed..13aa5249 100644 --- a/src/access_nri_intake/source/builders.py +++ b/src/access_nri_intake/source/builders.py @@ -7,22 +7,16 @@ import re import traceback from pathlib import Path -from typing import Optional, Union import xarray as xr from ecgtools.builder import INVALID_ASSET, TRACEBACK, Builder from ..utils import validate_against_schema from . 
import ESM_JSONSCHEMA, PATH_COLUMN, VARIABLE_COLUMN -from .utils import ( - EmptyFileError, - _AccessNCFileInfo, - _DataVarInfo, - get_timeinfo, -) +from .utils import EmptyFileError, get_timeinfo # Frequency translations -FREQUENCIES: dict[str, tuple[int, str]] = { +FREQUENCIES = { "daily": (1, "day"), "_dai$": (1, "day"), "month": (1, "mon"), @@ -53,19 +47,19 @@ class BaseBuilder(Builder): """ # Base class carries an empty set - PATTERNS: list = [] + PATTERNS = [] def __init__( self, - path: Union[str, list[str]], - depth: int = 0, - exclude_patterns: Optional[list[str]] = None, - include_patterns: Optional[list[str]] = None, - data_format: str = "netcdf", - groupby_attrs: Optional[list[str]] = None, - aggregations: Optional[list[dict]] = None, - storage_options: Optional[dict] = None, - joblib_parallel_kwargs: dict = {"n_jobs": multiprocessing.cpu_count()}, + path, + depth=0, + exclude_patterns=None, + include_patterns=None, + data_format="netcdf", + groupby_attrs=None, + aggregations=None, + storage_options=None, + joblib_parallel_kwargs={"n_jobs": multiprocessing.cpu_count()}, ): """ This method should be overwritten. The expection is that some of these arguments @@ -119,7 +113,7 @@ def parse(self): self._parse() return self - def _save(self, name: str, description: str, directory: Union[str, None]): + def _save(self, name, description, directory): super().save( name=name, path_column_name=PATH_COLUMN, @@ -134,9 +128,7 @@ def _save(self, name: str, description: str, directory: Union[str, None]): to_csv_kwargs={"compression": "gzip"}, ) - def save( - self, name: str, description: str, directory: Optional[str] = None - ) -> None: + def save(self, name, description, directory=None): """ Save datastore contents to a file. 
@@ -218,12 +210,8 @@ def parser(file): @classmethod def parse_access_filename( - cls, - filename: str, - patterns: Optional[list[str]] = None, - frequencies: dict = FREQUENCIES, - redaction_fill: str = "X", - ) -> tuple[str, Union[str, None], Union[str, None]]: + cls, filename, patterns=None, frequencies=FREQUENCIES, redaction_fill: str = "X" + ): """ Parse an ACCESS model filename and return a file id and any time information @@ -231,22 +219,16 @@ def parse_access_filename( ---------- filename: str The filename to parse with the extension removed - patterns: list of str, optional - A list of regex patterns to match against the filename. If None, use the class PATTERNS - frequencies: dict, optional - A dictionary of regex patterns to match against the filename to determine the frequency - redaction_fill: str, optional - The character to replace time information with. Defaults to "X" Returns ------- file_id: str The file id constructed by redacting time information and replacing non-python characters with underscores - timestamp: str | None - A string of the redacted time information (e.g. "1990-01") if available, otherwise None - frequency: str | None - The frequency of the file if available in the filename, otherwise None + timestamp: str + A string of the redacted time information (e.g. 
"1990-01") + frequency: str + The frequency of the file if available in the filename """ if patterns is None: patterns = cls.PATTERNS @@ -278,9 +260,7 @@ def parse_access_filename( return file_id, timestamp, frequency @classmethod - def parse_access_ncfile( - cls, file: str, time_dim: str = "time" - ) -> _AccessNCFileInfo: + def parse_access_ncfile(cls, file, time_dim="time"): """ Get Intake-ESM datastore entry info from an ACCESS netcdf file @@ -293,18 +273,13 @@ def parse_access_ncfile( Returns ------- - output_nc_info: AccessNCFileInfo - A dataclass containing the information parsed from the file - - Raises - ------ - EmptyFileError: If the file contains no variables """ - file_path = Path(file) + file = Path(file) + filename = file.name file_id, filename_timestamp, filename_frequency = cls.parse_access_filename( - file_path.stem + file.stem ) with xr.open_dataset( @@ -314,31 +289,51 @@ def parse_access_ncfile( decode_times=False, decode_coords=False, ) as ds: - dvars = _DataVarInfo() - - for var in ds.variables: + variable_list = [] + variable_long_name_list = [] + variable_standard_name_list = [] + variable_cell_methods_list = [] + variable_units_list = [] + for var in ds.data_vars: attrs = ds[var].attrs - dvars.append_attrs(var, attrs) # type: ignore + if "long_name" in attrs: + variable_list.append(var) + variable_long_name_list.append(attrs["long_name"]) + if "standard_name" in attrs: + variable_standard_name_list.append(attrs["standard_name"]) + else: + variable_standard_name_list.append("") + if "cell_methods" in attrs: + variable_cell_methods_list.append(attrs["cell_methods"]) + else: + variable_cell_methods_list.append("") + if "units" in attrs: + variable_units_list.append(attrs["units"]) + else: + variable_units_list.append("") start_date, end_date, frequency = get_timeinfo( ds, filename_frequency, time_dim ) - if not dvars.variable_list: + if not variable_list: raise EmptyFileError("This file contains no variables") - output_ncfile = 
_AccessNCFileInfo( - filename=file_path.name, - path=file, - file_id=file_id, - filename_timestamp=filename_timestamp, - frequency=frequency, - start_date=start_date, - end_date=end_date, - **dvars.to_ncinfo_dict(), + outputs = ( + filename, + file_id, + filename_timestamp, + frequency, + start_date, + end_date, + variable_list, + variable_long_name_list, + variable_standard_name_list, + variable_cell_methods_list, + variable_units_list, ) - return output_ncfile + return outputs class AccessOm2Builder(BaseBuilder): @@ -383,23 +378,44 @@ def __init__(self, path): super().__init__(**kwargs) @classmethod - def parser(cls, file) -> dict: + def parser(cls, file): try: - # mypy gets upset as match can return None. I assume this is why we - # have try/except block in the first place? If so, we might be able - # to make this more explicit? - match_groups = re.match(r".*/output\d+/([^/]*)/.*\.nc", file).groups() # type: ignore + match_groups = re.match(r".*/output\d+/([^/]*)/.*\.nc", file).groups() realm = match_groups[0] if realm == "ice": realm = "seaIce" - nc_info = cls.parse_access_ncfile(file) - ncinfo_dict = nc_info.to_dict() - - ncinfo_dict["realm"] = realm - - return ncinfo_dict + ( + filename, + file_id, + _, + frequency, + start_date, + end_date, + variable_list, + variable_long_name_list, + variable_standard_name_list, + variable_cell_methods_list, + variable_units_list, + ) = cls.parse_access_ncfile(file) + + info = { + "path": str(file), + "realm": realm, + "variable": variable_list, + "frequency": frequency, + "start_date": start_date, + "end_date": end_date, + "variable_long_name": variable_long_name_list, + "variable_standard_name": variable_standard_name_list, + "variable_cell_methods": variable_cell_methods_list, + "variable_units": variable_units_list, + "filename": filename, + "file_id": file_id, + } + + return info except Exception: return {INVALID_ASSET: file, TRACEBACK: traceback.format_exc()} @@ -450,22 +466,47 @@ def __init__(self, path): 
super().__init__(**kwargs) @classmethod - def parser(cls, file) -> dict: + def parser(cls, file): try: - output_nc_info = cls.parse_access_ncfile(file) - ncinfo_dict = output_nc_info.to_dict() - - if "mom6" in ncinfo_dict["filename"]: + ( + filename, + file_id, + _, + frequency, + start_date, + end_date, + variable_list, + variable_long_name_list, + variable_standard_name_list, + variable_cell_methods_list, + variable_units_list, + ) = cls.parse_access_ncfile(file) + + if "mom6" in filename: realm = "ocean" - elif "ww3" in ncinfo_dict["filename"]: + elif "ww3" in filename: realm = "wave" - elif "cice" in ncinfo_dict["filename"]: + elif "cice" in filename: realm = "seaIce" else: raise ParserError(f"Cannot determine realm for file {file}") - ncinfo_dict["realm"] = realm - return ncinfo_dict + info = { + "path": str(file), + "realm": realm, + "variable": variable_list, + "frequency": frequency, + "start_date": start_date, + "end_date": end_date, + "variable_long_name": variable_long_name_list, + "variable_standard_name": variable_standard_name_list, + "variable_cell_methods": variable_cell_methods_list, + "variable_units": variable_units_list, + "filename": filename, + "file_id": file_id, + } + + return info except Exception: return {INVALID_ASSET: file, TRACEBACK: traceback.format_exc()} @@ -529,18 +570,42 @@ def parser(cls, file): realm = match_groups[1] realm_mapping = {"atm": "atmos", "ocn": "ocean", "ice": "seaIce"} - - nc_info = cls.parse_access_ncfile(file) - ncinfo_dict = nc_info.to_dict() + realm = realm_mapping[realm] + + ( + filename, + file_id, + _, + frequency, + start_date, + end_date, + variable_list, + variable_long_name_list, + variable_standard_name_list, + variable_cell_methods_list, + variable_units_list, + ) = cls.parse_access_ncfile(file) # Remove exp_id from file id so that members can be part of the same dataset - ncinfo_dict["file_id"] = re.sub(exp_id, "", ncinfo_dict["file_id"]).strip( - "_" - ) - ncinfo_dict["realm"] = realm_mapping[realm] - 
ncinfo_dict["member"] = exp_id - - return ncinfo_dict + file_id = re.sub(exp_id, "", file_id).strip("_") + + info = { + "path": str(file), + "realm": realm, + "variable": variable_list, + "frequency": frequency, + "start_date": start_date, + "end_date": end_date, + "member": exp_id, + "variable_long_name": variable_long_name_list, + "variable_standard_name": variable_standard_name_list, + "variable_cell_methods": variable_cell_methods_list, + "variable_units": variable_units_list, + "filename": filename, + "file_id": file_id, + } + + return info except Exception: return {INVALID_ASSET: file, TRACEBACK: traceback.format_exc()} diff --git a/src/access_nri_intake/source/utils.py b/src/access_nri_intake/source/utils.py index c9082f32..a3a8cfe9 100644 --- a/src/access_nri_intake/source/utils.py +++ b/src/access_nri_intake/source/utils.py @@ -4,96 +4,16 @@ """ Shared utilities for writing Intake-ESM builders and their parsers """ import warnings -from dataclasses import asdict, dataclass, field from datetime import timedelta -from pathlib import Path -from typing import Union import cftime -import xarray as xr class EmptyFileError(Exception): pass -@dataclass -class _AccessNCFileInfo: - """ - Holds information about a NetCDF file that is used to create an intake-esm - catalog entry. - - ______ - Notes: - Use of both path and filename seems redundant, but constructing filename from - the path using a __post_init__ method makes testing more difficult. On balance, - more explicit tests are probably more important than the slight redundancy. 
- """ - - filename: Union[str, Path] - file_id: str - path: str - filename_timestamp: Union[str, None] - frequency: str - start_date: str - end_date: str - variable: list[str] - variable_long_name: list[str] - variable_standard_name: list[str] - variable_cell_methods: list[str] - variable_units: list[str] - - def to_dict(self) -> dict[str, Union[str, list[str]]]: - """ - Return a dictionary representation of the NcFileInfo object - """ - return asdict(self) - - -@dataclass -class _DataVarInfo: - """ - Holds information about the data variables in a NetCDF file that is used to - create an intake-esm catalog entry. - """ - - variable_list: list[str] = field(default_factory=list) - long_name_list: list[str] = field(default_factory=list) - standard_name_list: list[str] = field(default_factory=list) - cell_methods_list: list[str] = field(default_factory=list) - units_list: list[str] = field(default_factory=list) - - def append_attrs(self, var: str, attrs: dict) -> None: - """ - Append attributes to the DataVarInfo object, if the attribute has a - 'long_name' key. - - TODO: Why do we need a long name key? seems important - """ - if "long_name" not in attrs: - return None - - self.variable_list.append(var) - self.long_name_list.append(attrs["long_name"]) - self.standard_name_list.append(attrs.get("standard_name", "")) - self.cell_methods_list.append(attrs.get("cell_methods", "")) - self.units_list.append(attrs.get("units", "")) - - def to_ncinfo_dict(self) -> dict[str, list[str]]: - """ - Return a dictionary representation of the DataVarInfo object. Fields are - defined explicitly for use in the _AccessNCFileInfo constructor. 
- """ - return { - "variable": self.variable_list, - "variable_long_name": self.long_name_list, - "variable_standard_name": self.standard_name_list, - "variable_cell_methods": self.cell_methods_list, - "variable_units": self.units_list, - } - - -def _add_month_start(time, n: int): +def _add_month_start(time, n): """Add months to cftime datetime and truncate to start""" year = time.year + ((time.month + n - 1) // 12) month = (time.month + n - 1) % 12 + 1 @@ -102,7 +22,7 @@ def _add_month_start(time, n: int): ) -def _add_year_start(time, n: int): +def _add_year_start(time, n): """Add years to cftime datetime and truncate to start""" return time.replace( year=time.year + n, month=1, day=1, hour=0, minute=0, second=0, microsecond=0 @@ -139,11 +59,7 @@ def _guess_start_end_dates(ts, te, frequency): return ts, te -def get_timeinfo( - ds: xr.Dataset, - filename_frequency: Union[str, None], - time_dim: str, -) -> tuple[str, str, str]: +def get_timeinfo(ds, filename_frequency, time_dim): """ Get start date, end date and frequency of a xarray dataset. 
Stolen and adapted from the cosima cookbook, see @@ -153,24 +69,8 @@ def get_timeinfo( ---------- ds: :py:class:`xarray.Dataset` The dataset to parse the time info from - filename_frequency: str - Frequency as determined from the filename time_dim: str The name of the time dimension - - Returns - ------- - start_date: str - The start date of the dataset - end_date: str - The end date of the dataset - frequency: str - The frequency of the dataset - - Raises - ------ - EmptyFileError - If the dataset has a valid unlimited dimension, but no data """ def _todate(t): @@ -179,7 +79,7 @@ def _todate(t): time_format = "%Y-%m-%d, %H:%M:%S" ts = None te = None - frequency: Union[str, tuple[Union[int, None], str]] = "fx" + frequency = "fx" has_time = time_dim in ds if has_time: diff --git a/src/access_nri_intake/utils.py b/src/access_nri_intake/utils.py index aed413dc..b2895f62 100644 --- a/src/access_nri_intake/utils.py +++ b/src/access_nri_intake/utils.py @@ -11,7 +11,7 @@ import yaml -def get_jsonschema(metadata_file: str, required: list) -> tuple[dict, dict]: +def get_jsonschema(metadata_file, required): """ Read in the required JSON schema, and annotate it with "required" fields. 
@@ -22,7 +22,7 @@ def get_jsonschema(metadata_file: str, required: list) -> tuple[dict, dict]: """ schema_file = rsr.files("access_nri_intake").joinpath(metadata_file) - with schema_file.open(mode="r") as fpath: # type: ignore + with schema_file.open(mode="r") as fpath: schema = json.load(fpath) schema_required = schema.copy() @@ -40,7 +40,7 @@ def get_jsonschema(metadata_file: str, required: list) -> tuple[dict, dict]: return schema, schema_required -def load_metadata_yaml(path: str, jsonschema: dict) -> dict: +def load_metadata_yaml(path, jsonschema): """ Load a metadata.yaml file, leaving dates as strings, and validate against a jsonschema, allowing for tuples as arrays @@ -77,7 +77,7 @@ def remove_implicit_resolver(cls, tag_to_remove): return metadata -def validate_against_schema(instance: dict, schema: dict) -> None: +def validate_against_schema(instance, schema): """ Validate a dictionary against a jsonschema, allowing for tuples as arrays @@ -87,11 +87,6 @@ def validate_against_schema(instance: dict, schema: dict) -> None: The instance to validate schema: dict The jsonschema - - Raises - ------ - jsonschema.exceptions.ValidationError - If the instance does not match the schema """ Validator = jsonschema.validators.validator_for(schema) diff --git a/tests/test_builders.py b/tests/test_builders.py index c28f9eac..7b8dc5d3 100644 --- a/tests/test_builders.py +++ b/tests/test_builders.py @@ -8,7 +8,6 @@ import pytest from access_nri_intake.source import CORE_COLUMNS, builders -from access_nri_intake.source.utils import _AccessNCFileInfo @pytest.mark.parametrize( @@ -365,725 +364,411 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm2Builder, "access-om2/output000/ocean/ocean_grid.nc", - _AccessNCFileInfo( - path=None, # type: ignore - filename="ocean_grid.nc", - file_id="ocean_grid", - filename_timestamp=None, - frequency="fx", - start_date="none", - end_date="none", - variable=["geolat_t", "geolon_t", "xt_ocean", "yt_ocean"], - 
variable_long_name=[ - "tracer latitude", - "tracer longitude", - "tcell longitude", - "tcell latitude", - ], - variable_standard_name=["", "", "", ""], - variable_cell_methods=["time: point", "time: point", "", ""], - variable_units=["degrees_N", "degrees_E", "degrees_E", "degrees_N"], + ( + "ocean_grid.nc", + "ocean_grid", + None, + "fx", + "none", + "none", + ["geolat_t", "geolon_t"], + ["tracer latitude", "tracer longitude"], + ["", ""], + ["time: point", "time: point"], + ["degrees_N", "degrees_E"], ), ), ( builders.AccessOm2Builder, "access-om2/output000/ocean/ocean.nc", - _AccessNCFileInfo( - path=None, # type: ignore - filename="ocean.nc", - file_id="ocean", - filename_timestamp=None, - frequency="1yr", - start_date="1900-01-01, 00:00:00", - end_date="1910-01-01, 00:00:00", - variable=[ - "nv", - "st_ocean", - "temp", - "time", - "time_bounds", - "xt_ocean", - "yt_ocean", - ], - variable_long_name=[ - "vertex number", - "tcell zstar depth", - "Conservative temperature", - "time", - "time axis boundaries", - "tcell longitude", - "tcell latitude", - ], - variable_standard_name=[ - "", - "", - "sea_water_conservative_temperature", - "", - "", - "", - "", - ], - variable_cell_methods=["", "", "time: mean", "", "", "", ""], - variable_units=[ - "none", - "meters", - "K", - "days since 1900-01-01 00:00:00", - "days", - "degrees_E", - "degrees_N", - ], + ( + "ocean.nc", + "ocean", + None, + "1yr", + "1900-01-01, 00:00:00", + "1910-01-01, 00:00:00", + ["temp", "time_bounds"], + ["Conservative temperature", "time axis boundaries"], + ["sea_water_conservative_temperature", ""], + ["time: mean", ""], + ["K", "days"], ), ), ( builders.AccessOm2Builder, "access-om2/output000/ocean/ocean_month.nc", - _AccessNCFileInfo( - path=None, # type: ignore - filename="ocean_month.nc", - file_id="ocean_month", - filename_timestamp=None, - frequency="1mon", - start_date="1900-01-01, 00:00:00", - end_date="1910-01-01, 00:00:00", - variable=["mld", "nv", "time", "time_bounds", 
"xt_ocean", "yt_ocean"], - variable_long_name=[ + ( + "ocean_month.nc", + "ocean_month", + None, + "1mon", + "1900-01-01, 00:00:00", + "1910-01-01, 00:00:00", + ["mld", "time_bounds"], + [ "mixed layer depth determined by density criteria", - "vertex number", - "time", "time axis boundaries", - "tcell longitude", - "tcell latitude", - ], - variable_standard_name=[ - "ocean_mixed_layer_thickness_defined_by_sigma_t", - "", - "", - "", - "", - "", - ], - variable_cell_methods=["time: mean", "", "", "", "", ""], - variable_units=[ - "m", - "none", - "days since 1900-01-01 00:00:00", - "days", - "degrees_E", - "degrees_N", ], + ["ocean_mixed_layer_thickness_defined_by_sigma_t", ""], + ["time: mean", ""], + ["m", "days"], ), ), ( builders.AccessOm2Builder, "access-om2/output000/ocean/ocean_month_inst_nobounds.nc", - _AccessNCFileInfo( - path=None, # type: ignore - filename="ocean_month_inst_nobounds.nc", - file_id="ocean_month_inst_nobounds", - filename_timestamp=None, - frequency="1mon", - start_date="1900-01-01, 00:00:00", - end_date="1900-02-01, 00:00:00", - variable=["mld", "time", "xt_ocean", "yt_ocean"], - variable_long_name=[ - "mixed layer depth determined by density criteria", - "time", - "tcell longitude", - "tcell latitude", - ], - variable_standard_name=[ - "ocean_mixed_layer_thickness_defined_by_sigma_t", - "", - "", - "", - ], - variable_cell_methods=["time: mean", "", "", ""], - variable_units=[ - "m", - "days since 1900-01-01 00:00:00", - "degrees_E", - "degrees_N", - ], + ( + "ocean_month_inst_nobounds.nc", + "ocean_month_inst_nobounds", + None, + "1mon", + "1900-01-01, 00:00:00", + "1900-02-01, 00:00:00", + ["mld"], + ["mixed layer depth determined by density criteria"], + ["ocean_mixed_layer_thickness_defined_by_sigma_t"], + ["time: mean"], + ["m"], ), ), ( builders.AccessOm2Builder, "access-om2/output000/ice/OUTPUT/iceh.1900-01.nc", - _AccessNCFileInfo( - path=None, # type: ignore - filename="iceh.1900-01.nc", - file_id="iceh_XXXX_XX", - 
filename_timestamp="1900-01", - frequency="1mon", - start_date="1900-01-01, 00:00:00", - end_date="1900-02-01, 00:00:00", - variable=["TLAT", "TLON", "aice_m", "tarea", "time", "time_bounds"], - variable_long_name=[ + ( + "iceh.1900-01.nc", + "iceh_XXXX_XX", + "1900-01", + "1mon", + "1900-01-01, 00:00:00", + "1900-02-01, 00:00:00", + ["TLAT", "TLON", "aice_m", "tarea", "time_bounds"], + [ "T grid center latitude", "T grid center longitude", "ice area (aggregate)", "area of T grid cells", - "model time", "boundaries for time-averaging interval", ], - variable_standard_name=["", "", "", "", "", ""], - variable_cell_methods=["", "", "time: mean", "", "", ""], - variable_units=[ + ["", "", "", "", ""], + ["", "", "time: mean", "", ""], + [ "degrees_north", "degrees_east", "1", "m^2", "days since 1900-01-01 00:00:00", - "days since 1900-01-01 00:00:00", ], ), ), ( builders.AccessCm2Builder, "access-cm2/by578/history/atm/netCDF/by578a.pd201501_dai.nc", - _AccessNCFileInfo( - path=None, # type: ignore - filename="by578a.pd201501_dai.nc", - file_id="by578a_pdXXXXXX_dai", - filename_timestamp="201501", - frequency="1day", - start_date="2015-01-01, 00:00:00", - end_date="2015-02-01, 00:00:00", - variable=["fld_s03i236"], - variable_long_name=["TEMPERATURE AT 1.5M"], - variable_standard_name=["air_temperature"], - variable_cell_methods=["time: mean"], - variable_units=["K"], + ( + "by578a.pd201501_dai.nc", + "by578a_pdXXXXXX_dai", + "201501", + "1day", + "2015-01-01, 00:00:00", + "2015-02-01, 00:00:00", + ["fld_s03i236"], + ["TEMPERATURE AT 1.5M"], + ["air_temperature"], + ["time: mean"], + ["K"], ), ), ( builders.AccessCm2Builder, "access-cm2/by578/history/ice/iceh_d.2015-01.nc", - _AccessNCFileInfo( - path=None, # type: ignore - filename="iceh_d.2015-01.nc", - file_id="iceh_d_XXXX_XX", - filename_timestamp="2015-01", - frequency="1day", - start_date="2015-01-01, 00:00:00", - end_date="2015-02-01, 00:00:00", - variable=["TLAT", "TLON", "aice", "tarea", "time", 
"time_bounds"], - variable_long_name=[ + ( + "iceh_d.2015-01.nc", + "iceh_d_XXXX_XX", + "2015-01", + "1day", + "2015-01-01, 00:00:00", + "2015-02-01, 00:00:00", + ["TLAT", "TLON", "aice", "tarea", "time_bounds"], + [ "T grid center latitude", "T grid center longitude", "ice area (aggregate)", "area of T grid cells", - "model time", "boundaries for time-averaging interval", ], - variable_standard_name=["", "", "", "", "", ""], - variable_cell_methods=["", "", "time: mean", "", "", ""], - variable_units=[ + ["", "", "", "", ""], + ["", "", "time: mean", "", ""], + [ "degrees_north", "degrees_east", "1", "m^2", "days since 1850-01-01 00:00:00", - "days since 1850-01-01 00:00:00", ], ), ), ( builders.AccessCm2Builder, "access-cm2/by578/history/ocn/ocean_daily.nc-20150630", - _AccessNCFileInfo( - path=None, # type: ignore - filename="ocean_daily.nc-20150630", - file_id="ocean_daily", - filename_timestamp=None, - frequency="1day", - start_date="2015-01-01, 00:00:00", - end_date="2015-07-01, 00:00:00", - variable=["nv", "sst", "time", "time_bounds", "xt_ocean", "yt_ocean"], - variable_long_name=[ - "vertex number", - "Potential temperature", - "time", - "time axis boundaries", - "tcell longitude", - "tcell latitude", - ], - variable_standard_name=["", "sea_surface_temperature", "", "", "", ""], - variable_cell_methods=["", "time: mean", "", "", "", ""], - variable_units=[ - "none", - "K", - "days since 1850-01-01 00:00:00", - "days", - "degrees_E", - "degrees_N", - ], + ( + "ocean_daily.nc-20150630", + "ocean_daily", + None, + "1day", + "2015-01-01, 00:00:00", + "2015-07-01, 00:00:00", + ["sst", "time_bounds"], + ["Potential temperature", "time axis boundaries"], + ["sea_surface_temperature", ""], + ["time: mean", ""], + ["K", "days"], ), ), ( builders.AccessCm2Builder, "access-cm2/by578/history/ocn/ocean_scalar.nc-20150630", - _AccessNCFileInfo( - path=None, # type: ignore - filename="ocean_scalar.nc-20150630", - file_id="ocean_scalar", - filename_timestamp=None, - 
frequency="1mon", - start_date="2015-01-01, 00:00:00", - end_date="2015-07-01, 00:00:00", - variable=[ - "nv", - "scalar_axis", - "temp_global_ave", - "time", - "time_bounds", - ], - variable_long_name=[ - "vertex number", - "none", - "Global mean temp in liquid seawater", - "time", - "time axis boundaries", - ], - variable_standard_name=[ - "", - "", - "sea_water_potential_temperature", - "", - "", - ], - variable_cell_methods=["", "", "time: mean", "", ""], - variable_units=[ - "none", - "none", - "deg_C", - "days since 1850-01-01 00:00:00", - "days", - ], + ( + "ocean_scalar.nc-20150630", + "ocean_scalar", + None, + "1mon", + "2015-01-01, 00:00:00", + "2015-07-01, 00:00:00", + ["temp_global_ave", "time_bounds"], + ["Global mean temp in liquid seawater", "time axis boundaries"], + ["sea_water_potential_temperature", ""], + ["time: mean", ""], + ["deg_C", "days"], ), ), ( builders.AccessEsm15Builder, "access-esm1-5/history/atm/netCDF/HI-C-05-r1.pa-185001_mon.nc", - _AccessNCFileInfo( - path=None, # type: ignore - filename="HI-C-05-r1.pa-185001_mon.nc", - file_id="HI_C_05_r1_pa_XXXXXX_mon", - filename_timestamp="185001", - frequency="1mon", - start_date="1850-01-01, 00:00:00", - end_date="1850-02-01, 00:00:00", - variable=["fld_s03i236"], - variable_long_name=["TEMPERATURE AT 1.5M"], - variable_standard_name=["air_temperature"], - variable_cell_methods=["time: mean"], - variable_units=["K"], + ( + "HI-C-05-r1.pa-185001_mon.nc", + "HI_C_05_r1_pa_XXXXXX_mon", + "185001", + "1mon", + "1850-01-01, 00:00:00", + "1850-02-01, 00:00:00", + ["fld_s03i236"], + ["TEMPERATURE AT 1.5M"], + ["air_temperature"], + ["time: mean"], + ["K"], ), ), ( builders.AccessEsm15Builder, "access-esm1-5/history/ice/iceh.1850-01.nc", - _AccessNCFileInfo( - path=None, # type: ignore - filename="iceh.1850-01.nc", - file_id="iceh_XXXX_XX", - filename_timestamp="1850-01", - frequency="1mon", - start_date="1850-01-01, 00:00:00", - end_date="1850-02-01, 00:00:00", - variable=["TLAT", "TLON", "aice", 
"tarea", "time", "time_bounds"], - variable_long_name=[ + ( + "iceh.1850-01.nc", + "iceh_XXXX_XX", + "1850-01", + "1mon", + "1850-01-01, 00:00:00", + "1850-02-01, 00:00:00", + ["TLAT", "TLON", "aice", "tarea", "time_bounds"], + [ "T grid center latitude", "T grid center longitude", "ice area (aggregate)", "area of T grid cells", - "model time", "boundaries for time-averaging interval", ], - variable_standard_name=["", "", "", "", "", ""], - variable_cell_methods=["", "", "time: mean", "", "", ""], - variable_units=[ + ["", "", "", "", ""], + ["", "", "time: mean", "", ""], + [ "degrees_north", "degrees_east", "1", "m^2", "days since 0001-01-01 00:00:00", - "days since 0001-01-01 00:00:00", ], ), ), ( builders.AccessEsm15Builder, "access-esm1-5/history/ocn/ocean_bgc_ann.nc-18501231", - _AccessNCFileInfo( - path=None, # type: ignore - filename="ocean_bgc_ann.nc-18501231", - file_id="ocean_bgc_ann", - filename_timestamp=None, - frequency="1yr", - start_date="1849-12-30, 00:00:00", - end_date="1850-12-30, 00:00:00", - variable=[ - "fgco2_raw", - "nv", - "time", - "time_bounds", - "xt_ocean", - "yt_ocean", - ], - variable_long_name=[ - "Flux into ocean - DIC, inc. anth.", - "vertex number", - "time", - "time axis boundaries", - "tcell longitude", - "tcell latitude", - ], - variable_standard_name=["", "", "", "", "", ""], - variable_cell_methods=["time: mean", "", "", "", "", ""], - variable_units=[ - "mmol/m^2/s", - "none", - "days since 0001-01-01 00:00:00", - "days", - "degrees_E", - "degrees_N", - ], + ( + "ocean_bgc_ann.nc-18501231", + "ocean_bgc_ann", + None, + "1yr", + "1849-12-30, 00:00:00", + "1850-12-30, 00:00:00", + ["fgco2_raw", "time_bounds"], + ["Flux into ocean - DIC, inc. 
anth.", "time axis boundaries"], + ["", ""], + ["time: mean", ""], + ["mmol/m^2/s", "days"], ), ), ( builders.AccessEsm15Builder, "access-esm1-5/history/ocn/ocean_bgc.nc-18501231", - _AccessNCFileInfo( - path=None, # type: ignore - filename="ocean_bgc.nc-18501231", - file_id="ocean_bgc", - filename_timestamp=None, - frequency="1mon", - start_date="1849-12-30, 00:00:00", - end_date="1850-12-30, 00:00:00", - variable=[ - "nv", - "o2", - "st_ocean", - "time", - "time_bounds", - "xt_ocean", - "yt_ocean", - ], - variable_long_name=[ - "vertex number", - "o2", - "tcell zstar depth", - "time", - "time axis boundaries", - "tcell longitude", - "tcell latitude", - ], - variable_standard_name=["", "", "", "", "", "", ""], - variable_cell_methods=["", "time: mean", "", "", "", "", ""], - variable_units=[ - "none", - "mmol/m^3", - "meters", - "days since 0001-01-01 00:00:00", - "days", - "degrees_E", - "degrees_N", - ], + ( + "ocean_bgc.nc-18501231", + "ocean_bgc", + None, + "1mon", + "1849-12-30, 00:00:00", + "1850-12-30, 00:00:00", + ["o2", "time_bounds"], + ["o2", "time axis boundaries"], + ["", ""], + ["time: mean", ""], + ["mmol/m^3", "days"], ), ), ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.native_1900_01.nc", - _AccessNCFileInfo( - path=None, # type: ignore - filename="GMOM_JRA_WD.mom6.h.native_1900_01.nc", - file_id="GMOM_JRA_WD_mom6_h_native_XXXX_XX", - filename_timestamp="1900_01", - frequency="1mon", - start_date="1900-01-01, 00:00:00", - end_date="1900-02-01, 00:00:00", - variable=[ - "average_DT", - "average_T1", - "average_T2", - "nv", - "thetao", - "time", - "time_bnds", - "xh", - "yh", - "zl", - ], - variable_long_name=[ + ( + "GMOM_JRA_WD.mom6.h.native_1900_01.nc", + "GMOM_JRA_WD_mom6_h_native_XXXX_XX", + "1900_01", + "1mon", + "1900-01-01, 00:00:00", + "1900-02-01, 00:00:00", + ["average_DT", "average_T1", "average_T2", "thetao", "time_bnds"], + [ "Length of average period", "Start time for average period", "End time for average 
period", - "vertex number", "Sea Water Potential Temperature", - "time", "time axis boundaries", - "h point nominal longitude", - "h point nominal latitude", - "Layer pseudo-depth, -z*", ], - variable_standard_name=[ - "", - "", - "", - "", - "sea_water_potential_temperature", - "", - "", - "", - "", - "", - ], - variable_cell_methods=[ - "", - "", - "", - "", - "area:mean zl:mean yh:mean xh:mean time: mean", - "", - "", - "", - "", - "", - ], - variable_units=[ + ["", "", "", "sea_water_potential_temperature", ""], + ["", "", "", "area:mean zl:mean yh:mean xh:mean time: mean", ""], + [ "days", "days since 0001-01-01 00:00:00", "days since 0001-01-01 00:00:00", - "", "degC", "days since 0001-01-01 00:00:00", - "days since 0001-01-01 00:00:00", - "degrees_east", - "degrees_north", - "meter", ], ), ), ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.sfc_1900_01_02.nc", - _AccessNCFileInfo( - path=None, # type: ignore - filename="GMOM_JRA_WD.mom6.h.sfc_1900_01_02.nc", - file_id="GMOM_JRA_WD_mom6_h_sfc_XXXX_XX_XX", - filename_timestamp="1900_01_02", - frequency="1day", - start_date="1900-01-01, 00:00:00", - end_date="1900-01-02, 00:00:00", - variable=[ - "average_DT", - "average_T1", - "average_T2", - "nv", - "time", - "time_bnds", - "tos", - "xh", - "yh", - ], - variable_long_name=[ + ( + "GMOM_JRA_WD.mom6.h.sfc_1900_01_02.nc", + "GMOM_JRA_WD_mom6_h_sfc_XXXX_XX_XX", + "1900_01_02", + "1day", + "1900-01-01, 00:00:00", + "1900-01-02, 00:00:00", + ["average_DT", "average_T1", "average_T2", "time_bnds", "tos"], + [ "Length of average period", "Start time for average period", "End time for average period", - "vertex number", - "time", "time axis boundaries", "Sea Surface Temperature", - "h point nominal longitude", - "h point nominal latitude", - ], - variable_standard_name=[ - "", - "", - "", - "", - "", - "", - "sea_surface_temperature", - "", - "", - ], - variable_cell_methods=[ - "", - "", - "", - "", - "", - "", - "area:mean yh:mean xh:mean time: 
mean", - "", - "", ], - variable_units=[ + ["", "", "", "", "sea_surface_temperature"], + ["", "", "", "", "area:mean yh:mean xh:mean time: mean"], + [ "days", "days since 0001-01-01 00:00:00", "days since 0001-01-01 00:00:00", - "", - "days since 0001-01-01 00:00:00", "days since 0001-01-01 00:00:00", "degC", - "degrees_east", - "degrees_north", ], ), ), ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.static.nc", - _AccessNCFileInfo( - path=None, # type: ignore - filename="GMOM_JRA_WD.mom6.h.static.nc", - file_id="GMOM_JRA_WD_mom6_h_static", - filename_timestamp=None, - frequency="fx", - start_date="none", - end_date="none", - variable=["geolat", "geolon", "xh", "yh"], - variable_long_name=[ - "Latitude of tracer (T) points", - "Longitude of tracer (T) points", - "h point nominal longitude", - "h point nominal latitude", - ], - variable_standard_name=["", "", "", ""], - variable_cell_methods=["time: point", "time: point", "", ""], - variable_units=[ - "degrees_north", - "degrees_east", - "degrees_east", - "degrees_north", - ], + ( + "GMOM_JRA_WD.mom6.h.static.nc", + "GMOM_JRA_WD_mom6_h_static", + None, + "fx", + "none", + "none", + ["geolat", "geolon"], + ["Latitude of tracer (T) points", "Longitude of tracer (T) points"], + ["", ""], + ["time: point", "time: point"], + ["degrees_north", "degrees_east"], ), ), ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.z_1900_01.nc", - _AccessNCFileInfo( - path=None, # type: ignore - filename="GMOM_JRA_WD.mom6.h.z_1900_01.nc", - file_id="GMOM_JRA_WD_mom6_h_z_XXXX_XX", - filename_timestamp="1900_01", - frequency="1mon", - start_date="1900-01-01, 00:00:00", - end_date="1900-02-01, 00:00:00", - variable=[ - "average_DT", - "average_T1", - "average_T2", - "nv", - "thetao", - "time", - "time_bnds", - "xh", - "yh", - "z_l", - ], - variable_long_name=[ + ( + "GMOM_JRA_WD.mom6.h.z_1900_01.nc", + "GMOM_JRA_WD_mom6_h_z_XXXX_XX", + "1900_01", + "1mon", + "1900-01-01, 00:00:00", + "1900-02-01, 
00:00:00", + ["average_DT", "average_T1", "average_T2", "thetao", "time_bnds"], + [ "Length of average period", "Start time for average period", "End time for average period", - "vertex number", "Sea Water Potential Temperature", - "time", "time axis boundaries", - "h point nominal longitude", - "h point nominal latitude", - "Depth at cell center", - ], - variable_standard_name=[ - "", - "", - "", - "", - "sea_water_potential_temperature", - "", - "", - "", - "", - "", - ], - variable_cell_methods=[ - "", - "", - "", - "", - "area:mean z_l:mean yh:mean xh:mean time: mean", - "", - "", - "", - "", - "", ], - variable_units=[ + ["", "", "", "sea_water_potential_temperature", ""], + ["", "", "", "area:mean z_l:mean yh:mean xh:mean time: mean", ""], + [ "days", "days since 0001-01-01 00:00:00", "days since 0001-01-01 00:00:00", - "", "degC", "days since 0001-01-01 00:00:00", - "days since 0001-01-01 00:00:00", - "degrees_east", - "degrees_north", - "meters", ], ), ), ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.cice.h.1900-01-01.nc", - _AccessNCFileInfo( - path=None, # type: ignore - filename="GMOM_JRA_WD.cice.h.1900-01-01.nc", - file_id="GMOM_JRA_WD_cice_h_XXXX_XX_XX", - filename_timestamp="1900-01-01", - frequency="1day", - start_date="1900-01-01, 00:00:00", - end_date="1900-01-02, 00:00:00", - variable=["TLAT", "TLON", "aice", "tarea", "time", "time_bounds"], - variable_long_name=[ + ( + "GMOM_JRA_WD.cice.h.1900-01-01.nc", + "GMOM_JRA_WD_cice_h_XXXX_XX_XX", + "1900-01-01", + "1day", + "1900-01-01, 00:00:00", + "1900-01-02, 00:00:00", + ["TLAT", "TLON", "aice", "tarea", "time_bounds"], + [ "T grid center latitude", "T grid center longitude", "ice area (aggregate)", "area of T grid cells", - "time", "time interval endpoints", ], - variable_standard_name=["", "", "", "", "", ""], - variable_cell_methods=["", "", "time: mean", "", "", ""], - variable_units=[ + ["", "", "", "", ""], + ["", "", "time: mean", "", ""], + [ "degrees_north", "degrees_east", 
"1", "m^2", "days since 0000-01-01 00:00:00", - "days since 0000-01-01 00:00:00", ], ), ), ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.ww3.hi.1900-01-02-00000.nc", - _AccessNCFileInfo( - path=None, # type: ignore - filename="GMOM_JRA_WD.ww3.hi.1900-01-02-00000.nc", - file_id="GMOM_JRA_WD_ww3_hi_XXXX_XX_XX_XXXXX", - filename_timestamp="1900-01-02-00000", - frequency="fx", # WW3 provides no time bounds - start_date="1900-01-02, 00:00:00", - end_date="1900-01-02, 00:00:00", - variable=["EF", "mapsta"], - variable_long_name=["1D spectral density", "map status"], - variable_standard_name=["", ""], - variable_cell_methods=["", ""], - variable_units=["m2 s", "unitless"], + ( + "GMOM_JRA_WD.ww3.hi.1900-01-02-00000.nc", + "GMOM_JRA_WD_ww3_hi_XXXX_XX_XX_XXXXX", + "1900-01-02-00000", + "fx", # WW3 provides no time bounds + "1900-01-02, 00:00:00", + "1900-01-02, 00:00:00", + ["EF", "mapsta"], + ["1D spectral density", "map status"], + ["", ""], + ["", ""], + ["m2 s", "unitless"], ), ), ], @@ -1091,7 +776,4 @@ def test_parse_access_filename(builder, filename, expected): def test_parse_access_ncfile(test_data, builder, filename, expected): file = str(test_data / Path(filename)) - # Set the path to the test data directory - expected.path = file - assert builder.parse_access_ncfile(file) == expected From b303840b75b1b366476555330072d7eab5b601f4 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Mon, 14 Oct 2024 14:40:00 +0800 Subject: [PATCH 21/23] - Updated tests (covered missing lines introduced by refactor found by sentry) - Updated cmip6.yaml as a different translator is required for Cordex experiments as main CMIP6 experiments. 
--- config/cmip6.yaml | 6 +- config/experiments/cmip6_ig45/metadata.yaml | 26 ---- .../cordex-ig45/metadata.yaml | 1 - src/access_nri_intake/catalog/translators.py | 133 +++++++++++------- tests/test_translators.py | 52 ++++++- 5 files changed, 132 insertions(+), 86 deletions(-) delete mode 100644 config/experiments/cmip6_ig45/metadata.yaml diff --git a/config/cmip6.yaml b/config/cmip6.yaml index 5af7bd5e..0ea165de 100644 --- a/config/cmip6.yaml +++ b/config/cmip6.yaml @@ -10,8 +10,4 @@ sources: - metadata_yaml: /g/data/xp65/admin/access-nri-intake-catalog/config/metadata_sources/cmip6-oi10/metadata.yaml path: - - /g/data/oi10/catalog/v2/esm/catalog.json - - - metadata_yaml: /g/data/xp65/admin/intake/metadata/cmip6_ig45/metadata.yaml - path: - - /g/data/ig45/catalog/v2/esm/catalog.json \ No newline at end of file + - /g/data/oi10/catalog/v2/esm/catalog.json \ No newline at end of file diff --git a/config/experiments/cmip6_ig45/metadata.yaml b/config/experiments/cmip6_ig45/metadata.yaml deleted file mode 100644 index 8046f731..00000000 --- a/config/experiments/cmip6_ig45/metadata.yaml +++ /dev/null @@ -1,26 +0,0 @@ -name: cmip6_ig45 -experiment_uuid: c7021d1e-7ba2-11ef-beb5-000007d3fe80 -description: 20km regional projections for CORDEX-CMIP6 from the Queensland Future Climate Science Program -long_description: >- - This dataset includes projections at 20km, formatted to meet the CORDEX-CMIP6 data standards. - The 20km projections were derived from the 10km projections. 
-model: -- CMIP6 -frequency: -- -variable: -- -nominal_resolution: -- -version: -contact: NCI -email: help@nci.org.au -reference: -license: -url: https://geonetwork.nci.org.au/geonetwork/srv/eng/catalog.search#/metadata/f7465_8388_5100_7022 -parent_experiment: -related_experiments: -- -notes: -keywords: -- cmip \ No newline at end of file diff --git a/config/metadata_sources/cordex-ig45/metadata.yaml b/config/metadata_sources/cordex-ig45/metadata.yaml index c118c402..1d013e63 100644 --- a/config/metadata_sources/cordex-ig45/metadata.yaml +++ b/config/metadata_sources/cordex-ig45/metadata.yaml @@ -121,7 +121,6 @@ license: url: https://geonetwork.nci.org.au/geonetwork/srv/eng/catalog.search#/metadata/f7465_8388_5100_7022 parent_experiment: related_experiments: -- notes: keywords: - cmip \ No newline at end of file diff --git a/src/access_nri_intake/catalog/translators.py b/src/access_nri_intake/catalog/translators.py index c048d0a7..fb8fb5ae 100644 --- a/src/access_nri_intake/catalog/translators.py +++ b/src/access_nri_intake/catalog/translators.py @@ -192,6 +192,31 @@ def _unique_values(series): return df[self.columns] # Preserve ordering + def set_dispatch( + self, core_colname: str, func: Callable, input_name: Optional[str] = None + ): + """ + Set a dispatch function for a column. Typically only required when either: + 1. `core_colname != input_name` + 2. A custom translation function (`func`) is required. + + Parameters + ---------- + core_colname: str + The core column name to translate to + input_name: str, optional + The name of the column in the source. 
If not provided, this defaults + to none, and no translation will occur + func: callable + The function to translate the column + """ + if core_colname not in ["model", "realm", "frequency", "variable"]: + raise TranslatorError( + f"'core_colname' must be one of 'model', 'realm', 'frequency', 'variable', not {core_colname}" + ) + self._dispatch[core_colname] = func + setattr(self._dispatch_keys, core_colname, input_name) + def _realm_translator(self) -> pd.Series: """ Return realm, fixing a few issues @@ -240,16 +265,21 @@ def __init__(self, source, columns): """ super().__init__(source, columns) - self._dispatch["model"] = self._model_translator - self._dispatch["realm"] = self._realm_translator - self._dispatch["frequency"] = self._frequency_translator - self._dispatch["variable"] = self._variable_translator - - self._dispatch_keys = _DispatchKeys( - model="source_id", - realm="realm", - frequency="frequency", - variable="variable_id", + self.set_dispatch( + input_name="source_id", core_colname="model", func=super()._model_translator + ) + self.set_dispatch( + input_name="realm", core_colname="realm", func=super()._realm_translator + ) + self.set_dispatch( + input_name="frequency", + core_colname="frequency", + func=super()._frequency_translator, + ) + self.set_dispatch( + input_name="variable_id", + core_colname="variable", + func=super()._variable_translator, ) @@ -271,16 +301,21 @@ def __init__(self, source, columns): """ super().__init__(source, columns) - self._dispatch["model"] = self._model_translator - self._dispatch["realm"] = self._realm_translator - self._dispatch["frequency"] = self._frequency_translator - self._dispatch["variable"] = self._variable_translator - - self._dispatch_keys = _DispatchKeys( - model="model", - realm="realm", - frequency="frequency", - variable="variable", + self.set_dispatch( + input_name="model", core_colname="model", func=super()._model_translator + ) + self.set_dispatch( + input_name="realm", core_colname="realm", 
func=super()._realm_translator + ) + self.set_dispatch( + input_name="frequency", + core_colname="frequency", + func=super()._frequency_translator, + ) + self.set_dispatch( + input_name="variable", + core_colname="variable", + func=super()._variable_translator, ) @@ -302,17 +337,11 @@ def __init__(self, source, columns): """ super().__init__(source, columns) - self._dispatch["variable"] = self._variable_translator - self._dispatch_keys = _DispatchKeys(variable="variable") - - def _realm_translator(self) -> pd.Series: - raise AttributeError( - f"{self.__class__.__name__}: 'realm' does not require translation" - ) - def _frequency_translator(self) -> pd.Series: - raise AttributeError( - f"{self.__class__.__name__}: 'data' does not require translation" + self.set_dispatch( + input_name="variable", + core_colname="variable", + func=super()._variable_translator, ) @@ -334,15 +363,21 @@ def __init__(self, source, columns): """ super().__init__(source, columns) - self._dispatch["model"] = self._model_translator - self._dispatch["realm"] = self._realm_translator - self._dispatch["frequency"] = self._frequency_translator - self._dispatch["variable"] = self._variable_translator - self._dispatch_keys = _DispatchKeys( - model="source_id", - realm="realm", - variable="variable_id", - frequency="freq", + self.set_dispatch( + input_name="source_id", core_colname="model", func=super()._model_translator + ) + self.set_dispatch( + input_name="realm", core_colname="realm", func=self._realm_translator + ) + self.set_dispatch( + input_name="freq", + core_colname="frequency", + func=super()._frequency_translator, + ) + self.set_dispatch( + input_name="variable_id", + core_colname="variable", + func=super()._variable_translator, ) def _realm_translator(self): @@ -370,16 +405,16 @@ def __init__(self, source, columns): """ super().__init__(source, columns) - self._dispatch["model"] = self._model_translator - self._dispatch["frequency"] = self._frequency_translator - 
self._dispatch["variable"] = self._variable_translator - self._dispatch["realm"] = self._variable_translator - - self._dispatch_keys = _DispatchKeys( - model="source_id", - frequency="frequency", - variable="variable_id", - realm="realm", + self.set_dispatch( + input_name="source_id", core_colname="model", func=super()._model_translator + ) + self.set_dispatch( + input_name="variable_id", + core_colname="variable", + func=super()._variable_translator, + ) + self.set_dispatch( + input_name="realm", core_colname="realm", func=self._realm_translator ) def _realm_translator(self): diff --git a/tests/test_translators.py b/tests/test_translators.py index aba1c880..29ecb756 100644 --- a/tests/test_translators.py +++ b/tests/test_translators.py @@ -17,6 +17,7 @@ TranslatorError, _cmip_realm_translator, _to_tuple, + tuplify_series, ) @@ -189,6 +190,29 @@ def test_DefaultTranslator_error(test_data): assert "Could not translate" in str(excinfo.value) +@pytest.mark.parametrize( + "colname, should_raise", + [ + ("model", False), + ("realm", False), + ("frequency", False), + ("variable", False), + ("random_string", True), + ], +) +def test_DefaultTranslator_set_dispatch(test_data, colname, should_raise): + """Test that only valid translation setups are allowed""" + esmds = intake.open_esm_datastore(test_data / "esm_datastore/cmip5-al33.json") + dtrans = DefaultTranslator(esmds, CORE_COLUMNS) + if should_raise: + with pytest.raises(TranslatorError) as excinfo: + dtrans.set_dispatch(colname, dtrans._model_translator, "model") + assert "'core_colname' must be one of" in str(excinfo.value) + else: + dtrans.set_dispatch(colname, dtrans._model_translator, colname) + assert dtrans._dispatch[colname] == dtrans._model_translator + + @pytest.mark.parametrize( "groupby, n_entries", [ @@ -271,11 +295,7 @@ def test_BarpaTranslator(test_data, groupby, n_entries): @pytest.mark.parametrize( "groupby, n_entries", - [ - (None, 5), - (["variable"], 4), - (["frequency"], 2), - ], + [(None, 5), 
(["variable"], 4), (["frequency"], 2), (["realm"], 1)], ) def test_CordexTranslator(test_data, groupby, n_entries): """Test CORDEX datastore translator""" @@ -284,3 +304,25 @@ def test_CordexTranslator(test_data, groupby, n_entries): esmds.description = "description" df = CordexTranslator(esmds, CORE_COLUMNS).translate(groupby) assert len(df) == n_entries + + +@pytest.mark.parametrize( + "input_series, expected_output", + [ + (pd.Series([1, 2, 3]), pd.Series([(1,), (2,), (3,)])), + ], +) +def test_tuplify_series(input_series, expected_output): + """Test the _tuplify_series function""" + + @tuplify_series + def tuplify_func(series): + return series + + class TestSeries: + @tuplify_series + def method(self, series): + return series + + assert all(tuplify_func(input_series) == expected_output) + assert all(TestSeries().method(input_series) == expected_output) From cd0e2412dd61248254b97e062042fe7c6a74e13f Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Wed, 16 Oct 2024 14:50:13 +0800 Subject: [PATCH 22/23] - Renamed experiment in metadata.yaml - Updated path for cordex.yaml file --- config/cordex.yaml | 2 +- config/metadata_sources/cordex-ig45/metadata.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/config/cordex.yaml b/config/cordex.yaml index da5338f4..ca947025 100644 --- a/config/cordex.yaml +++ b/config/cordex.yaml @@ -4,6 +4,6 @@ translator: CordexTranslator sources: - - metadata_yaml: /g/data/xp65/admin/access-nri-intake-catalog/config/metadata_sources/cordex-ig45/metadata.yaml + - metadata_yaml: /g/data/xp65/admin/intake/metadata/cordex_ig45/metadata.yaml path: - /g/data/ig45/catalog/v2/esm/catalog.json diff --git a/config/metadata_sources/cordex-ig45/metadata.yaml b/config/metadata_sources/cordex-ig45/metadata.yaml index 1d013e63..f1fc6b9b 100644 --- a/config/metadata_sources/cordex-ig45/metadata.yaml +++ b/config/metadata_sources/cordex-ig45/metadata.yaml @@ -1,4 +1,4 @@ -name: cmip6_ig45 +name: cordex_ig45 experiment_uuid: 
c7021d1e-7ba2-11ef-beb5-000007d3fe80 description: 20km regional projections for CORDEX-CMIP6 from the Queensland Future Climate Science Program long_description: >- From 35c20fb93e4784e02946b86cfda252e63d1845b4 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Thu, 17 Oct 2024 12:13:01 +0800 Subject: [PATCH 23/23] Add missing newlines to end of yaml & json files --- config/cmip6.yaml | 2 +- config/metadata_sources/cordex-ig45/metadata.yaml | 2 +- tests/data/esm_datastore/cordex-ig45.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/config/cmip6.yaml b/config/cmip6.yaml index 0ea165de..acd39e1e 100644 --- a/config/cmip6.yaml +++ b/config/cmip6.yaml @@ -10,4 +10,4 @@ sources: - metadata_yaml: /g/data/xp65/admin/access-nri-intake-catalog/config/metadata_sources/cmip6-oi10/metadata.yaml path: - - /g/data/oi10/catalog/v2/esm/catalog.json \ No newline at end of file + - /g/data/oi10/catalog/v2/esm/catalog.json diff --git a/config/metadata_sources/cordex-ig45/metadata.yaml b/config/metadata_sources/cordex-ig45/metadata.yaml index f1fc6b9b..88b9a197 100644 --- a/config/metadata_sources/cordex-ig45/metadata.yaml +++ b/config/metadata_sources/cordex-ig45/metadata.yaml @@ -123,4 +123,4 @@ parent_experiment: related_experiments: notes: keywords: -- cmip \ No newline at end of file +- cmip diff --git a/tests/data/esm_datastore/cordex-ig45.json b/tests/data/esm_datastore/cordex-ig45.json index 5fc783b9..fab7b871 100644 --- a/tests/data/esm_datastore/cordex-ig45.json +++ b/tests/data/esm_datastore/cordex-ig45.json @@ -67,4 +67,4 @@ "column_name": "time_range" } ] -} \ No newline at end of file +}