From 95e53e7a5b92a0eb5a7fe4354faa47e2aacd0e63 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Thu, 12 Sep 2024 10:05:25 +0800 Subject: [PATCH 01/23] Updated .gitignore to include some vs code settings --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index da4e917f..b18b515f 100644 --- a/.gitignore +++ b/.gitignore @@ -131,3 +131,6 @@ dmypy.json sandpit.ipynb *.DS_Store bin/build_all.sh.o* + +# Vs Code +.vscode/ \ No newline at end of file From 13b38a90f1a14fee2b79ed5962181005b88c6464 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Thu, 12 Sep 2024 10:08:36 +0800 Subject: [PATCH 02/23] * Added type hints * Replaced a couple of try/excepts with .get in `src/access_nri_intake/catalog/translators.py` * Updated a misleading docstring --- mypy.ini | 3 ++ src/access_nri_intake/catalog/manager.py | 24 +++++------ src/access_nri_intake/catalog/translators.py | 42 +++++++++---------- src/access_nri_intake/source/builders.py | 43 +++++++++++--------- src/access_nri_intake/source/utils.py | 16 ++++---- src/access_nri_intake/utils.py | 12 ++++-- 6 files changed, 76 insertions(+), 64 deletions(-) create mode 100644 mypy.ini diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 00000000..a47639ed --- /dev/null +++ b/mypy.ini @@ -0,0 +1,3 @@ +[mypy] +python_version = 3.12 +ignore_missing_imports = True \ No newline at end of file diff --git a/src/access_nri_intake/catalog/manager.py b/src/access_nri_intake/catalog/manager.py index f3d03243..ed954e36 100644 --- a/src/access_nri_intake/catalog/manager.py +++ b/src/access_nri_intake/catalog/manager.py @@ -30,7 +30,7 @@ class CatalogManager: Add/update intake sources in an intake-dataframe-catalog like the ACCESS-NRI catalog """ - def __init__(self, path): + def __init__(self, path : str): """ Initialise a CatalogManager instance to add/update intake sources in a intake-dataframe-catalog like the ACCESS-NRI catalog @@ -58,14 +58,14 @@ def __init__(self, path): def build_esm( 
self, - name, - description, + name : str, + description : str, builder, - path, + path : list[str] | str, translator=DefaultTranslator, - metadata=None, - directory=None, - overwrite=False, + metadata : dict | None = None, + directory : str | None = None, + overwrite : bool =False, **kwargs, ): """ @@ -124,12 +124,12 @@ def build_esm( def load( self, - name, - description, - path, - driver="esm_datastore", + name : str, + description : str, + path : str, + driver : str ="esm_datastore", translator=DefaultTranslator, - metadata=None, + metadata : dict | None =None, **kwargs, ): """ diff --git a/src/access_nri_intake/catalog/translators.py b/src/access_nri_intake/catalog/translators.py index 509fe362..18c4fb8c 100644 --- a/src/access_nri_intake/catalog/translators.py +++ b/src/access_nri_intake/catalog/translators.py @@ -6,13 +6,18 @@ like the ACCESS-NRI catalog """ +from __future__ import annotations from functools import partial +from typing import Callable, TYPE_CHECKING +from intake import DataSource import pandas as pd import tlz from . import COLUMNS_WITH_ITERABLES +if TYPE_CHECKING: + from intake import DataSource class TranslatorError(Exception): "Generic Exception for the Translator classes" @@ -25,7 +30,7 @@ class DefaultTranslator: of metadata for use in an intake-dataframe-catalog. """ - def __init__(self, source, columns): + def __init__(self, source : DataSource, columns : list[str]): """ Initialise a DefaultTranslator. This Translator works as follows: @@ -45,12 +50,12 @@ def __init__(self, source, columns): self.source = source self.columns = columns - self._dispatch = { + self._dispatch : dict[str, Callable[[],pd.Series]] = { column: partial(self._default_translator, column=column) for column in columns } - def _default_translator(self, column): + def _default_translator(self, column: str) -> pd.Series: """ Try to translate a column from a source using the default translator. 
This translator works as follows: - If the input source is an intake-esm datastore, the translator will first look for the column in the @@ -96,7 +101,7 @@ def _default_translator(self, column): return pd.Series([val] * len_df) - def translate(self, groupby=None): + def translate(self, groupby : list[str] | None = None) -> pd.DataFrame: """ Return the translated :py:class:`~pandas.DataFrame` of metadata and merge into set of set of rows with unique values of the columns specified. @@ -149,7 +154,7 @@ class Cmip6Translator(DefaultTranslator): CMIP6 Translator for translating metadata from the NCI CMIP6 intake datastores. """ - def __init__(self, source, columns): + def __init__(self, source : DataSource, columns :list[str]): """ Initialise a Cmip6Translator @@ -197,7 +202,7 @@ class Cmip5Translator(DefaultTranslator): CMIP5 Translator for translating metadata from the NCI CMIP5 intake datastores. """ - def __init__(self, source, columns): + def __init__(self, source : DataSource, columns : list[str]): """ Initialise a Cmip5Translator @@ -245,7 +250,7 @@ class EraiTranslator(DefaultTranslator): ERAI Translator for translating metadata from the NCI ERA-Interim intake datastore. """ - def __init__(self, source, columns): + def __init__(self, source : DataSource, columns : list[str]): """ Initialise a EraiTranslator @@ -267,7 +272,7 @@ def _variable_translator(self): return _to_tuple(self.source.df["variable"]) -def _cmip_frequency_translator(series): +def _cmip_frequency_translator(series : pd.Series) -> pd.Series: """ Return frequency from CMIP frequency metadata """ @@ -288,21 +293,19 @@ def _translate(string): "yrPt": "1yr", } - try: - return translations[string] - except KeyError: - return string + return translations.get(string, string) return series.apply(lambda string: _translate(string)) -def _cmip_realm_translator(series): +def _cmip_realm_translator(series) -> pd.Series: """ - Return realm from CMIP realm metadata, fixing some issues. 
This function returns - a tuple as there are sometimes multiple realms per cmip asset + Return realm from CMIP realm metadata, fixing some issues. This function takes + a series of strings and returns a series of tuples as there are sometimes multiple + realms per cmip asset """ - def _translate(string): + def _translate(string : str) -> tuple[str, ...]: translations = { "na": "none", "landonly": "land", @@ -313,10 +316,7 @@ def _translate(string): raw_realms = string.split(" ") realms = [] for realm in raw_realms: - try: - realm = translations[realm] - except KeyError: - pass + realm = translations.get(realm, realm) if realm not in realms: realms.append(realm) return tuple(realms) @@ -324,7 +324,7 @@ def _translate(string): return series.apply(lambda string: _translate(string)) -def _to_tuple(series): +def _to_tuple(series : pd.Series) -> pd.Series: """ Make each entry in the provided series a tuple diff --git a/src/access_nri_intake/source/builders.py b/src/access_nri_intake/source/builders.py index 13aa5249..cee340c6 100644 --- a/src/access_nri_intake/source/builders.py +++ b/src/access_nri_intake/source/builders.py @@ -16,7 +16,7 @@ from .utils import EmptyFileError, get_timeinfo # Frequency translations -FREQUENCIES = { +FREQUENCIES : dict[str, tuple[int, str]] = { "daily": (1, "day"), "_dai$": (1, "day"), "month": (1, "mon"), @@ -47,19 +47,19 @@ class BaseBuilder(Builder): """ # Base class carries an empty set - PATTERNS = [] + PATTERNS : list = [] def __init__( self, - path, - depth=0, - exclude_patterns=None, - include_patterns=None, - data_format="netcdf", - groupby_attrs=None, - aggregations=None, - storage_options=None, - joblib_parallel_kwargs={"n_jobs": multiprocessing.cpu_count()}, + path : str | list[str], + depth : int = 0, + exclude_patterns : list[str] | None = None, + include_patterns : list[str] | None = None, + data_format : str ="netcdf", + groupby_attrs : list[str] | None = None, + aggregations : list[dict] | None = None, + storage_options : 
dict | None = None, + joblib_parallel_kwargs : dict ={"n_jobs": multiprocessing.cpu_count()}, ): """ This method should be overwritten. The expection is that some of these arguments @@ -113,7 +113,7 @@ def parse(self): self._parse() return self - def _save(self, name, description, directory): + def _save(self, name : str, description : str, directory : str | None): super().save( name=name, path_column_name=PATH_COLUMN, @@ -128,7 +128,7 @@ def _save(self, name, description, directory): to_csv_kwargs={"compression": "gzip"}, ) - def save(self, name, description, directory=None): + def save(self, name : str, description : str , directory : str | None = None) -> None: """ Save datastore contents to a file. @@ -210,8 +210,8 @@ def parser(file): @classmethod def parse_access_filename( - cls, filename, patterns=None, frequencies=FREQUENCIES, redaction_fill: str = "X" - ): + cls, filename : str, patterns=None, frequencies=FREQUENCIES, redaction_fill: str = "X" + ) -> tuple[str, str | None, str | None]: """ Parse an ACCESS model filename and return a file id and any time information @@ -260,7 +260,7 @@ def parse_access_filename( return file_id, timestamp, frequency @classmethod - def parse_access_ncfile(cls, file, time_dim="time"): + def parse_access_ncfile(cls, file : str , time_dim : str ="time"): """ Get Intake-ESM datastore entry info from an ACCESS netcdf file @@ -273,13 +273,18 @@ def parse_access_ncfile(cls, file, time_dim="time"): Returns ------- + outputs: tuple + + Raises + ------ + EmptyFileError: If the file contains no variables """ - file = Path(file) - filename = file.name + file_path = Path(file) + filename = file_path.name file_id, filename_timestamp, filename_frequency = cls.parse_access_filename( - file.stem + file_path.stem ) with xr.open_dataset( diff --git a/src/access_nri_intake/source/utils.py b/src/access_nri_intake/source/utils.py index a3a8cfe9..8af4cacd 100644 --- a/src/access_nri_intake/source/utils.py +++ b/src/access_nri_intake/source/utils.py 
@@ -13,7 +13,7 @@ class EmptyFileError(Exception): pass -def _add_month_start(time, n): +def _add_month_start(time, n : int): """Add months to cftime datetime and truncate to start""" year = time.year + ((time.month + n - 1) // 12) month = (time.month + n - 1) % 12 + 1 @@ -22,7 +22,7 @@ def _add_month_start(time, n): ) -def _add_year_start(time, n): +def _add_year_start(time, n : int): """Add years to cftime datetime and truncate to start""" return time.replace( year=time.year + n, month=1, day=1, hour=0, minute=0, second=0, microsecond=0 @@ -59,7 +59,7 @@ def _guess_start_end_dates(ts, te, frequency): return ts, te -def get_timeinfo(ds, filename_frequency, time_dim): +def get_timeinfo(ds, filename_frequency, time_dim : str ) -> tuple[str, str, str]: """ Get start date, end date and frequency of a xarray dataset. Stolen and adapted from the cosima cookbook, see @@ -109,17 +109,17 @@ def _todate(t): # TODO: This is not a very good way to get the frequency if dt.days >= 365: years = round(dt.days / 365) - frequency = (years, "yr") + frequency = (years, "yr") # type: ignore elif dt.days >= 28: months = round(dt.days / 30) - frequency = (months, "mon") + frequency = (months, "mon") # type: ignore elif dt.days >= 1: - frequency = (dt.days, "day") + frequency = (dt.days, "day") # type: ignore elif dt.seconds >= 3600: hours = round(dt.seconds / 3600) - frequency = (hours, "hr") + frequency = (hours, "hr") # type: ignore else: - frequency = (None, "subhr") + frequency = (None, "subhr") # type: ignore if filename_frequency: if filename_frequency != frequency: diff --git a/src/access_nri_intake/utils.py b/src/access_nri_intake/utils.py index af2e6a61..bfb5a35c 100644 --- a/src/access_nri_intake/utils.py +++ b/src/access_nri_intake/utils.py @@ -11,7 +11,7 @@ import yaml -def get_jsonschema(url, known_hash, required): +def get_jsonschema(url : str , known_hash : str , required : list) -> tuple[dict, dict]: """ Download a jsonschema from a url. 
Returns the unaltered jsonschema and a version with the "required" key matching the properties provided. @@ -46,8 +46,7 @@ def get_jsonschema(url, known_hash, required): return schema, schema_required - -def load_metadata_yaml(path, jsonschema): +def load_metadata_yaml(path : str, jsonschema : dict) -> dict: """ Load a metadata.yaml file, leaving dates as strings, and validate against a jsonschema, allowing for tuples as arrays @@ -84,7 +83,7 @@ def remove_implicit_resolver(cls, tag_to_remove): return metadata -def validate_against_schema(instance, schema): +def validate_against_schema(instance : dict, schema : dict) -> None: """ Validate a dictionary against a jsonschema, allowing for tuples as arrays @@ -94,6 +93,11 @@ def validate_against_schema(instance, schema): The instance to validate schema: dict The jsonschema + + Raises + ------ + jsonschema.exceptions.ValidationError + If the instance does not match the schema """ Validator = jsonschema.validators.validator_for(schema) From f89ece163ffecb0102346a35eccf673c81f02269 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Fri, 13 Sep 2024 13:17:00 +0800 Subject: [PATCH 03/23] More type hints --- src/access_nri_intake/source/builders.py | 19 +++++++++++------ src/access_nri_intake/source/utils.py | 26 +++++++++++++++++++++++- 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/src/access_nri_intake/source/builders.py b/src/access_nri_intake/source/builders.py index cee340c6..7fac34f4 100644 --- a/src/access_nri_intake/source/builders.py +++ b/src/access_nri_intake/source/builders.py @@ -210,7 +210,8 @@ def parser(file): @classmethod def parse_access_filename( - cls, filename : str, patterns=None, frequencies=FREQUENCIES, redaction_fill: str = "X" + cls, filename : str, patterns : list[str] | None =None, + frequencies : dict = FREQUENCIES, redaction_fill: str = "X" ) -> tuple[str, str | None, str | None]: """ Parse an ACCESS model filename and return a file id and any time information @@ -219,16 
+220,22 @@ def parse_access_filename( ---------- filename: str The filename to parse with the extension removed + patterns: list of str, optional + A list of regex patterns to match against the filename. If None, use the class PATTERNS + frequencies: dict, optional + A dictionary of regex patterns to match against the filename to determine the frequency + redaction_fill: str, optional + The character to replace time information with. Defaults to "X" Returns ------- file_id: str The file id constructed by redacting time information and replacing non-python characters with underscores - timestamp: str - A string of the redacted time information (e.g. "1990-01") - frequency: str - The frequency of the file if available in the filename + timestamp: str | None + A string of the redacted time information (e.g. "1990-01") if available, otherwise None + frequency: str | None + The frequency of the file if available in the filename, otherwise None """ if patterns is None: patterns = cls.PATTERNS @@ -260,7 +267,7 @@ def parse_access_filename( return file_id, timestamp, frequency @classmethod - def parse_access_ncfile(cls, file : str , time_dim : str ="time"): + def parse_access_ncfile(cls, file : str , time_dim : str = "time") -> tuple: """ Get Intake-ESM datastore entry info from an ACCESS netcdf file diff --git a/src/access_nri_intake/source/utils.py b/src/access_nri_intake/source/utils.py index 8af4cacd..c51a5fde 100644 --- a/src/access_nri_intake/source/utils.py +++ b/src/access_nri_intake/source/utils.py @@ -5,9 +5,13 @@ import warnings from datetime import timedelta +from typing import TYPE_CHECKING import cftime +if TYPE_CHECKING: + import xarray as xr + class EmptyFileError(Exception): pass @@ -59,7 +63,11 @@ def _guess_start_end_dates(ts, te, frequency): return ts, te -def get_timeinfo(ds, filename_frequency, time_dim : str ) -> tuple[str, str, str]: +def get_timeinfo( + ds : xr.Dataset, + filename_frequency : str | None, + time_dim : str, + ) -> tuple[str, str, 
str]: """ Get start date, end date and frequency of a xarray dataset. Stolen and adapted from the cosima cookbook, see @@ -69,8 +77,24 @@ def get_timeinfo(ds, filename_frequency, time_dim : str ) -> tuple[str, str, str ---------- ds: :py:class:`xarray.Dataset` The dataset to parse the time info from + filename_frequency: str + Frequency as determined from the filename time_dim: str The name of the time dimension + + Returns + ------- + start_date: str + The start date of the dataset + end_date: str + The end date of the dataset + frequency: str + The frequency of the dataset + + Raises + ------ + EmptyFileError + If the dataset has a valid unlimited dimension, but no data """ def _todate(t): From aeaa9701e780867667227e99993a19e840915fab Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Wed, 18 Sep 2024 14:10:36 +0800 Subject: [PATCH 04/23] Type hint updates --- src/access_nri_intake/source/utils.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/access_nri_intake/source/utils.py b/src/access_nri_intake/source/utils.py index c51a5fde..3d4b7728 100644 --- a/src/access_nri_intake/source/utils.py +++ b/src/access_nri_intake/source/utils.py @@ -8,9 +8,7 @@ from typing import TYPE_CHECKING import cftime - -if TYPE_CHECKING: - import xarray as xr +import xarray as xr class EmptyFileError(Exception): @@ -103,7 +101,7 @@ def _todate(t): time_format = "%Y-%m-%d, %H:%M:%S" ts = None te = None - frequency = "fx" + frequency : str | tuple[int | None, str] = "fx" has_time = time_dim in ds if has_time: @@ -133,17 +131,17 @@ def _todate(t): # TODO: This is not a very good way to get the frequency if dt.days >= 365: years = round(dt.days / 365) - frequency = (years, "yr") # type: ignore + frequency = (years, "yr") elif dt.days >= 28: months = round(dt.days / 30) - frequency = (months, "mon") # type: ignore + frequency = (months, "mon") elif dt.days >= 1: - frequency = (dt.days, "day") # type: ignore + frequency = (dt.days, "day") elif 
dt.seconds >= 3600: hours = round(dt.seconds / 3600) - frequency = (hours, "hr") # type: ignore + frequency = (hours, "hr") else: - frequency = (None, "subhr") # type: ignore + frequency = (None, "subhr") if filename_frequency: if filename_frequency != frequency: From 6eb659ec30e3a25e71ab9920c00ebf4ba6f2e88f Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Mon, 23 Sep 2024 15:51:00 +1000 Subject: [PATCH 05/23] Updated data_vars => variables in parse_access_ncfile to allow coordinate variable searching & indexing --- src/access_nri_intake/source/builders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/access_nri_intake/source/builders.py b/src/access_nri_intake/source/builders.py index 13aa5249..b7c8c1b7 100644 --- a/src/access_nri_intake/source/builders.py +++ b/src/access_nri_intake/source/builders.py @@ -294,7 +294,7 @@ def parse_access_ncfile(cls, file, time_dim="time"): variable_standard_name_list = [] variable_cell_methods_list = [] variable_units_list = [] - for var in ds.data_vars: + for var in ds.variables: attrs = ds[var].attrs if "long_name" in attrs: variable_list.append(var) From ca1743e556afce7247ae7c23c886be902508e91f Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Tue, 24 Sep 2024 12:45:59 +1000 Subject: [PATCH 06/23] Updated all the builders to use a dataclass rather than a tuple: likely to be necessary for passing around coordinates as well as data variables as we begin to make coordinates indexable - I think we'll begin to get confused about what belongs where. 
--- src/access_nri_intake/catalog/manager.py | 24 +-- src/access_nri_intake/catalog/translators.py | 25 ++- src/access_nri_intake/source/builders.py | 215 ++++++------------- src/access_nri_intake/source/utils.py | 70 +++++- src/access_nri_intake/utils.py | 9 +- tests/test_builders.py | 39 ++-- 6 files changed, 180 insertions(+), 202 deletions(-) diff --git a/src/access_nri_intake/catalog/manager.py b/src/access_nri_intake/catalog/manager.py index ed954e36..f52e2126 100644 --- a/src/access_nri_intake/catalog/manager.py +++ b/src/access_nri_intake/catalog/manager.py @@ -30,7 +30,7 @@ class CatalogManager: Add/update intake sources in an intake-dataframe-catalog like the ACCESS-NRI catalog """ - def __init__(self, path : str): + def __init__(self, path: str): """ Initialise a CatalogManager instance to add/update intake sources in a intake-dataframe-catalog like the ACCESS-NRI catalog @@ -58,14 +58,14 @@ def __init__(self, path : str): def build_esm( self, - name : str, - description : str, + name: str, + description: str, builder, - path : list[str] | str, + path: list[str] | str, translator=DefaultTranslator, - metadata : dict | None = None, - directory : str | None = None, - overwrite : bool =False, + metadata: dict | None = None, + directory: str | None = None, + overwrite: bool = False, **kwargs, ): """ @@ -124,12 +124,12 @@ def build_esm( def load( self, - name : str, - description : str, - path : str, - driver : str ="esm_datastore", + name: str, + description: str, + path: str, + driver: str = "esm_datastore", translator=DefaultTranslator, - metadata : dict | None =None, + metadata: dict | None = None, **kwargs, ): """ diff --git a/src/access_nri_intake/catalog/translators.py b/src/access_nri_intake/catalog/translators.py index 18c4fb8c..8f8bad96 100644 --- a/src/access_nri_intake/catalog/translators.py +++ b/src/access_nri_intake/catalog/translators.py @@ -7,17 +7,16 @@ """ from __future__ import annotations + from functools import partial -from typing 
import Callable, TYPE_CHECKING -from intake import DataSource +from typing import Callable import pandas as pd import tlz +from intake import DataSource from . import COLUMNS_WITH_ITERABLES -if TYPE_CHECKING: - from intake import DataSource class TranslatorError(Exception): "Generic Exception for the Translator classes" @@ -30,7 +29,7 @@ class DefaultTranslator: of metadata for use in an intake-dataframe-catalog. """ - def __init__(self, source : DataSource, columns : list[str]): + def __init__(self, source: DataSource, columns: list[str]): """ Initialise a DefaultTranslator. This Translator works as follows: @@ -50,7 +49,7 @@ def __init__(self, source : DataSource, columns : list[str]): self.source = source self.columns = columns - self._dispatch : dict[str, Callable[[],pd.Series]] = { + self._dispatch: dict[str, Callable[[], pd.Series]] = { column: partial(self._default_translator, column=column) for column in columns } @@ -101,7 +100,7 @@ def _default_translator(self, column: str) -> pd.Series: return pd.Series([val] * len_df) - def translate(self, groupby : list[str] | None = None) -> pd.DataFrame: + def translate(self, groupby: list[str] | None = None) -> pd.DataFrame: """ Return the translated :py:class:`~pandas.DataFrame` of metadata and merge into set of set of rows with unique values of the columns specified. @@ -154,7 +153,7 @@ class Cmip6Translator(DefaultTranslator): CMIP6 Translator for translating metadata from the NCI CMIP6 intake datastores. """ - def __init__(self, source : DataSource, columns :list[str]): + def __init__(self, source: DataSource, columns: list[str]): """ Initialise a Cmip6Translator @@ -202,7 +201,7 @@ class Cmip5Translator(DefaultTranslator): CMIP5 Translator for translating metadata from the NCI CMIP5 intake datastores. 
""" - def __init__(self, source : DataSource, columns : list[str]): + def __init__(self, source: DataSource, columns: list[str]): """ Initialise a Cmip5Translator @@ -250,7 +249,7 @@ class EraiTranslator(DefaultTranslator): ERAI Translator for translating metadata from the NCI ERA-Interim intake datastore. """ - def __init__(self, source : DataSource, columns : list[str]): + def __init__(self, source: DataSource, columns: list[str]): """ Initialise a EraiTranslator @@ -272,7 +271,7 @@ def _variable_translator(self): return _to_tuple(self.source.df["variable"]) -def _cmip_frequency_translator(series : pd.Series) -> pd.Series: +def _cmip_frequency_translator(series: pd.Series) -> pd.Series: """ Return frequency from CMIP frequency metadata """ @@ -305,7 +304,7 @@ def _cmip_realm_translator(series) -> pd.Series: realms per cmip asset """ - def _translate(string : str) -> tuple[str, ...]: + def _translate(string: str) -> tuple[str, ...]: translations = { "na": "none", "landonly": "land", @@ -324,7 +323,7 @@ def _translate(string : str) -> tuple[str, ...]: return series.apply(lambda string: _translate(string)) -def _to_tuple(series : pd.Series) -> pd.Series: +def _to_tuple(series: pd.Series) -> pd.Series: """ Make each entry in the provided series a tuple diff --git a/src/access_nri_intake/source/builders.py b/src/access_nri_intake/source/builders.py index 28246f67..c43c9741 100644 --- a/src/access_nri_intake/source/builders.py +++ b/src/access_nri_intake/source/builders.py @@ -13,10 +13,10 @@ from ..utils import validate_against_schema from . 
import ESM_JSONSCHEMA, PATH_COLUMN, VARIABLE_COLUMN -from .utils import EmptyFileError, get_timeinfo +from .utils import AccessNCFileInfo, EmptyFileError, get_timeinfo # Frequency translations -FREQUENCIES : dict[str, tuple[int, str]] = { +FREQUENCIES: dict[str, tuple[int, str]] = { "daily": (1, "day"), "_dai$": (1, "day"), "month": (1, "mon"), @@ -47,19 +47,19 @@ class BaseBuilder(Builder): """ # Base class carries an empty set - PATTERNS : list = [] + PATTERNS: list = [] def __init__( self, - path : str | list[str], - depth : int = 0, - exclude_patterns : list[str] | None = None, - include_patterns : list[str] | None = None, - data_format : str ="netcdf", - groupby_attrs : list[str] | None = None, - aggregations : list[dict] | None = None, - storage_options : dict | None = None, - joblib_parallel_kwargs : dict ={"n_jobs": multiprocessing.cpu_count()}, + path: str | list[str], + depth: int = 0, + exclude_patterns: list[str] | None = None, + include_patterns: list[str] | None = None, + data_format: str = "netcdf", + groupby_attrs: list[str] | None = None, + aggregations: list[dict] | None = None, + storage_options: dict | None = None, + joblib_parallel_kwargs: dict = {"n_jobs": multiprocessing.cpu_count()}, ): """ This method should be overwritten. The expection is that some of these arguments @@ -113,7 +113,7 @@ def parse(self): self._parse() return self - def _save(self, name : str, description : str, directory : str | None): + def _save(self, name: str, description: str, directory: str | None): super().save( name=name, path_column_name=PATH_COLUMN, @@ -128,7 +128,7 @@ def _save(self, name : str, description : str, directory : str | None): to_csv_kwargs={"compression": "gzip"}, ) - def save(self, name : str, description : str , directory : str | None = None) -> None: + def save(self, name: str, description: str, directory: str | None = None) -> None: """ Save datastore contents to a file. 
@@ -210,8 +210,11 @@ def parser(file): @classmethod def parse_access_filename( - cls, filename : str, patterns : list[str] | None =None, - frequencies : dict = FREQUENCIES, redaction_fill: str = "X" + cls, + filename: str, + patterns: list[str] | None = None, + frequencies: dict = FREQUENCIES, + redaction_fill: str = "X", ) -> tuple[str, str | None, str | None]: """ Parse an ACCESS model filename and return a file id and any time information @@ -267,7 +270,7 @@ def parse_access_filename( return file_id, timestamp, frequency @classmethod - def parse_access_ncfile(cls, file : str , time_dim : str = "time") -> tuple: + def parse_access_ncfile(cls, file: str, time_dim: str = "time") -> AccessNCFileInfo: """ Get Intake-ESM datastore entry info from an ACCESS netcdf file @@ -280,14 +283,15 @@ def parse_access_ncfile(cls, file : str , time_dim : str = "time") -> tuple: Returns ------- - outputs: tuple + output_nc_info: AccessNCFileInfo + A dataclass containing the information parsed from the file Raises ------ EmptyFileError: If the file contains no variables """ - file_path = Path(file) + file_path = Path(file) filename = file_path.name file_id, filename_timestamp, filename_frequency = cls.parse_access_filename( @@ -306,23 +310,14 @@ def parse_access_ncfile(cls, file : str , time_dim : str = "time") -> tuple: variable_standard_name_list = [] variable_cell_methods_list = [] variable_units_list = [] - for var in ds.variables: + for var in ds.data_vars: attrs = ds[var].attrs if "long_name" in attrs: variable_list.append(var) variable_long_name_list.append(attrs["long_name"]) - if "standard_name" in attrs: - variable_standard_name_list.append(attrs["standard_name"]) - else: - variable_standard_name_list.append("") - if "cell_methods" in attrs: - variable_cell_methods_list.append(attrs["cell_methods"]) - else: - variable_cell_methods_list.append("") - if "units" in attrs: - variable_units_list.append(attrs["units"]) - else: - variable_units_list.append("") + 
variable_standard_name_list.append(attrs.get("standard_name", "")) + variable_cell_methods_list.append(attrs.get("cell_methods", "")) + variable_units_list.append(attrs.get("units", "")) start_date, end_date, frequency = get_timeinfo( ds, filename_frequency, time_dim @@ -331,21 +326,21 @@ def parse_access_ncfile(cls, file : str , time_dim : str = "time") -> tuple: if not variable_list: raise EmptyFileError("This file contains no variables") - outputs = ( - filename, - file_id, - filename_timestamp, - frequency, - start_date, - end_date, - variable_list, - variable_long_name_list, - variable_standard_name_list, - variable_cell_methods_list, - variable_units_list, + output_ncfile = AccessNCFileInfo( + filename=filename, + file_id=file_id, + filename_timestamp=filename_timestamp, + frequency=frequency, + start_date=start_date, + end_date=end_date, + variable=variable_list, # type: ignore + variable_long_name=variable_long_name_list, + variable_standard_name=variable_standard_name_list, + variable_cell_methods=variable_cell_methods_list, + variable_units=variable_units_list, ) - return outputs + return output_ncfile class AccessOm2Builder(BaseBuilder): @@ -390,44 +385,23 @@ def __init__(self, path): super().__init__(**kwargs) @classmethod - def parser(cls, file): + def parser(cls, file) -> dict: try: - match_groups = re.match(r".*/output\d+/([^/]*)/.*\.nc", file).groups() + # Need to check, but I think that the .groups() method that mypy is + # getting upset about is what the try/catch is for here - if the regex + # doesn't match, then it will throw an exception. 
+ match_groups = re.match(r".*/output\d+/([^/]*)/.*\.nc", file).groups() # type: ignore realm = match_groups[0] if realm == "ice": realm = "seaIce" - ( - filename, - file_id, - _, - frequency, - start_date, - end_date, - variable_list, - variable_long_name_list, - variable_standard_name_list, - variable_cell_methods_list, - variable_units_list, - ) = cls.parse_access_ncfile(file) - - info = { - "path": str(file), - "realm": realm, - "variable": variable_list, - "frequency": frequency, - "start_date": start_date, - "end_date": end_date, - "variable_long_name": variable_long_name_list, - "variable_standard_name": variable_standard_name_list, - "variable_cell_methods": variable_cell_methods_list, - "variable_units": variable_units_list, - "filename": filename, - "file_id": file_id, - } - - return info + nc_info = cls.parse_access_ncfile(file) + ncinfo_dict = nc_info.to_dict() + + ncinfo_dict["realm"] = realm + + return ncinfo_dict except Exception: return {INVALID_ASSET: file, TRACEBACK: traceback.format_exc()} @@ -478,47 +452,22 @@ def __init__(self, path): super().__init__(**kwargs) @classmethod - def parser(cls, file): + def parser(cls, file) -> dict: try: - ( - filename, - file_id, - _, - frequency, - start_date, - end_date, - variable_list, - variable_long_name_list, - variable_standard_name_list, - variable_cell_methods_list, - variable_units_list, - ) = cls.parse_access_ncfile(file) - - if "mom6" in filename: + output_nc_info = cls.parse_access_ncfile(file) + ncinfo_dict = output_nc_info.to_dict() + + if "mom6" in ncinfo_dict["filename"]: realm = "ocean" - elif "ww3" in filename: + elif "ww3" in ncinfo_dict["filename"]: realm = "wave" - elif "cice" in filename: + elif "cice" in ncinfo_dict["filename"]: realm = "seaIce" else: raise ParserError(f"Cannot determine realm for file {file}") + ncinfo_dict["realm"] = realm - info = { - "path": str(file), - "realm": realm, - "variable": variable_list, - "frequency": frequency, - "start_date": start_date, - "end_date": 
end_date, - "variable_long_name": variable_long_name_list, - "variable_standard_name": variable_standard_name_list, - "variable_cell_methods": variable_cell_methods_list, - "variable_units": variable_units_list, - "filename": filename, - "file_id": file_id, - } - - return info + return ncinfo_dict except Exception: return {INVALID_ASSET: file, TRACEBACK: traceback.format_exc()} @@ -582,42 +531,18 @@ def parser(cls, file): realm = match_groups[1] realm_mapping = {"atm": "atmos", "ocn": "ocean", "ice": "seaIce"} - realm = realm_mapping[realm] - - ( - filename, - file_id, - _, - frequency, - start_date, - end_date, - variable_list, - variable_long_name_list, - variable_standard_name_list, - variable_cell_methods_list, - variable_units_list, - ) = cls.parse_access_ncfile(file) + + nc_info = cls.parse_access_ncfile(file) + ncinfo_dict = nc_info.to_dict() # Remove exp_id from file id so that members can be part of the same dataset - file_id = re.sub(exp_id, "", file_id).strip("_") - - info = { - "path": str(file), - "realm": realm, - "variable": variable_list, - "frequency": frequency, - "start_date": start_date, - "end_date": end_date, - "member": exp_id, - "variable_long_name": variable_long_name_list, - "variable_standard_name": variable_standard_name_list, - "variable_cell_methods": variable_cell_methods_list, - "variable_units": variable_units_list, - "filename": filename, - "file_id": file_id, - } - - return info + ncinfo_dict["file_id"] = re.sub(exp_id, "", ncinfo_dict["file_id"]).strip( + "_" + ) + ncinfo_dict["realm"] = realm_mapping[realm] + ncinfo_dict["member"] = exp_id + + return ncinfo_dict except Exception: return {INVALID_ASSET: file, TRACEBACK: traceback.format_exc()} diff --git a/src/access_nri_intake/source/utils.py b/src/access_nri_intake/source/utils.py index 3d4b7728..f5c35240 100644 --- a/src/access_nri_intake/source/utils.py +++ b/src/access_nri_intake/source/utils.py @@ -4,8 +4,9 @@ """ Shared utilities for writing Intake-ESM builders and their 
parsers """ import warnings +from dataclasses import asdict, dataclass, field from datetime import timedelta -from typing import TYPE_CHECKING +from pathlib import Path import cftime import xarray as xr @@ -15,7 +16,58 @@ class EmptyFileError(Exception): pass -def _add_month_start(time, n : int): +@dataclass +class AccessNCFileInfo: + """ + Holds information about a NetCDF file that is used to create an intake-esm + catalog entry. + """ + + filename: str | Path + file_id: str + filename_timestamp: str | None + frequency: str + start_date: str + end_date: str + variable: list[str] + variable_long_name: list[str] + variable_standard_name: list[str] + variable_cell_methods: list[str] + variable_units: list[str] + path: str = field(init=False) + + def __post_init__(self): + self.path = str(self.filename) + + def to_dict(self) -> dict[str, str | list[str]]: + """ + Return a dictionary representation of the NcFileInfo object + """ + return asdict(self) + + def to_tuple( + self, + ) -> tuple[ + str, + str | None, + str, + str, + str, + list[str], + list[str], + list[str], + list[str], + list[str], + ]: + """ + Return a tuple representation of the NcFileInfo object. + + Returns an insanely long tuple: aiming to clean this up. 
+ """ + return tuple(asdict(self).values()) + + +def _add_month_start(time, n: int): """Add months to cftime datetime and truncate to start""" year = time.year + ((time.month + n - 1) // 12) month = (time.month + n - 1) % 12 + 1 @@ -24,7 +76,7 @@ def _add_month_start(time, n : int): ) -def _add_year_start(time, n : int): +def _add_year_start(time, n: int): """Add years to cftime datetime and truncate to start""" return time.replace( year=time.year + n, month=1, day=1, hour=0, minute=0, second=0, microsecond=0 @@ -62,10 +114,10 @@ def _guess_start_end_dates(ts, te, frequency): def get_timeinfo( - ds : xr.Dataset, - filename_frequency : str | None, - time_dim : str, - ) -> tuple[str, str, str]: + ds: xr.Dataset, + filename_frequency: str | None, + time_dim: str, +) -> tuple[str, str, str]: """ Get start date, end date and frequency of a xarray dataset. Stolen and adapted from the cosima cookbook, see @@ -101,7 +153,7 @@ def _todate(t): time_format = "%Y-%m-%d, %H:%M:%S" ts = None te = None - frequency : str | tuple[int | None, str] = "fx" + frequency: str | tuple[int | None, str] = "fx" has_time = time_dim in ds if has_time: @@ -134,7 +186,7 @@ def _todate(t): frequency = (years, "yr") elif dt.days >= 28: months = round(dt.days / 30) - frequency = (months, "mon") + frequency = (months, "mon") elif dt.days >= 1: frequency = (dt.days, "day") elif dt.seconds >= 3600: diff --git a/src/access_nri_intake/utils.py b/src/access_nri_intake/utils.py index dc0cf3f8..ceae573d 100644 --- a/src/access_nri_intake/utils.py +++ b/src/access_nri_intake/utils.py @@ -11,7 +11,7 @@ import yaml -def get_jsonschema(url : str , known_hash : str , required : list) -> tuple[dict, dict]: +def get_jsonschema(metadata_file: str, required: list) -> tuple[dict, dict]: """ Read in the required JSON schema, and annotate it with "required" fields. 
@@ -22,7 +22,7 @@ def get_jsonschema(url : str , known_hash : str , required : list) -> tuple[dict """ schema_file = rsr.files("access_nri_intake").joinpath(metadata_file) - with schema_file.open(mode="r") as fpath: + with schema_file.open(mode="r") as fpath: # type: ignore schema = json.load(fpath) schema_required = schema.copy() @@ -39,7 +39,8 @@ def get_jsonschema(url : str , known_hash : str , required : list) -> tuple[dict return schema, schema_required -def load_metadata_yaml(path : str, jsonschema : dict) -> dict: + +def load_metadata_yaml(path: str, jsonschema: dict) -> dict: """ Load a metadata.yaml file, leaving dates as strings, and validate against a jsonschema, allowing for tuples as arrays @@ -76,7 +77,7 @@ def remove_implicit_resolver(cls, tag_to_remove): return metadata -def validate_against_schema(instance : dict, schema : dict) -> None: +def validate_against_schema(instance: dict, schema: dict) -> None: """ Validate a dictionary against a jsonschema, allowing for tuples as arrays diff --git a/tests/test_builders.py b/tests/test_builders.py index 7b8dc5d3..d2283d61 100644 --- a/tests/test_builders.py +++ b/tests/test_builders.py @@ -8,6 +8,7 @@ import pytest from access_nri_intake.source import CORE_COLUMNS, builders +from access_nri_intake.source.utils import AccessNCFileInfo @pytest.mark.parametrize( @@ -364,7 +365,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm2Builder, "access-om2/output000/ocean/ocean_grid.nc", - ( + AccessNCFileInfo( "ocean_grid.nc", "ocean_grid", None, @@ -381,7 +382,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm2Builder, "access-om2/output000/ocean/ocean.nc", - ( + AccessNCFileInfo( "ocean.nc", "ocean", None, @@ -398,7 +399,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm2Builder, "access-om2/output000/ocean/ocean_month.nc", - ( + AccessNCFileInfo( "ocean_month.nc", "ocean_month", None, @@ -418,7 +419,7 @@ def 
test_parse_access_filename(builder, filename, expected): ( builders.AccessOm2Builder, "access-om2/output000/ocean/ocean_month_inst_nobounds.nc", - ( + AccessNCFileInfo( "ocean_month_inst_nobounds.nc", "ocean_month_inst_nobounds", None, @@ -435,7 +436,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm2Builder, "access-om2/output000/ice/OUTPUT/iceh.1900-01.nc", - ( + AccessNCFileInfo( "iceh.1900-01.nc", "iceh_XXXX_XX", "1900-01", @@ -464,7 +465,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessCm2Builder, "access-cm2/by578/history/atm/netCDF/by578a.pd201501_dai.nc", - ( + AccessNCFileInfo( "by578a.pd201501_dai.nc", "by578a_pdXXXXXX_dai", "201501", @@ -481,7 +482,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessCm2Builder, "access-cm2/by578/history/ice/iceh_d.2015-01.nc", - ( + AccessNCFileInfo( "iceh_d.2015-01.nc", "iceh_d_XXXX_XX", "2015-01", @@ -510,7 +511,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessCm2Builder, "access-cm2/by578/history/ocn/ocean_daily.nc-20150630", - ( + AccessNCFileInfo( "ocean_daily.nc-20150630", "ocean_daily", None, @@ -527,7 +528,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessCm2Builder, "access-cm2/by578/history/ocn/ocean_scalar.nc-20150630", - ( + AccessNCFileInfo( "ocean_scalar.nc-20150630", "ocean_scalar", None, @@ -544,7 +545,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessEsm15Builder, "access-esm1-5/history/atm/netCDF/HI-C-05-r1.pa-185001_mon.nc", - ( + AccessNCFileInfo( "HI-C-05-r1.pa-185001_mon.nc", "HI_C_05_r1_pa_XXXXXX_mon", "185001", @@ -561,7 +562,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessEsm15Builder, "access-esm1-5/history/ice/iceh.1850-01.nc", - ( + AccessNCFileInfo( "iceh.1850-01.nc", "iceh_XXXX_XX", "1850-01", @@ -590,7 +591,7 @@ def test_parse_access_filename(builder, 
filename, expected): ( builders.AccessEsm15Builder, "access-esm1-5/history/ocn/ocean_bgc_ann.nc-18501231", - ( + AccessNCFileInfo( "ocean_bgc_ann.nc-18501231", "ocean_bgc_ann", None, @@ -607,7 +608,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessEsm15Builder, "access-esm1-5/history/ocn/ocean_bgc.nc-18501231", - ( + AccessNCFileInfo( "ocean_bgc.nc-18501231", "ocean_bgc", None, @@ -624,7 +625,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.native_1900_01.nc", - ( + AccessNCFileInfo( "GMOM_JRA_WD.mom6.h.native_1900_01.nc", "GMOM_JRA_WD_mom6_h_native_XXXX_XX", "1900_01", @@ -653,7 +654,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.sfc_1900_01_02.nc", - ( + AccessNCFileInfo( "GMOM_JRA_WD.mom6.h.sfc_1900_01_02.nc", "GMOM_JRA_WD_mom6_h_sfc_XXXX_XX_XX", "1900_01_02", @@ -682,7 +683,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.static.nc", - ( + AccessNCFileInfo( "GMOM_JRA_WD.mom6.h.static.nc", "GMOM_JRA_WD_mom6_h_static", None, @@ -699,7 +700,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.z_1900_01.nc", - ( + AccessNCFileInfo( "GMOM_JRA_WD.mom6.h.z_1900_01.nc", "GMOM_JRA_WD_mom6_h_z_XXXX_XX", "1900_01", @@ -728,7 +729,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.cice.h.1900-01-01.nc", - ( + AccessNCFileInfo( "GMOM_JRA_WD.cice.h.1900-01-01.nc", "GMOM_JRA_WD_cice_h_XXXX_XX_XX", "1900-01-01", @@ -757,7 +758,7 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.ww3.hi.1900-01-02-00000.nc", - ( + AccessNCFileInfo( 
"GMOM_JRA_WD.ww3.hi.1900-01-02-00000.nc", "GMOM_JRA_WD_ww3_hi_XXXX_XX_XX_XXXXX", "1900-01-02-00000", From 1c5378e2034486ef45902d483ec0b1cee52c8973 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Wed, 25 Sep 2024 15:44:01 +0800 Subject: [PATCH 07/23] - Updated the builders to use separate coordinate & variables dataclasses that are fed into a dataclass holding all the coordinate and data variables from a netCDF file. - Updated tests so that they are all passing: tests now expect to find coordinate variables from the netCDF files as well as data variables. - Some minor changes to make code more readable - changed long tuples to dataclasses & dictionaries where possible. --- src/access_nri_intake/source/builders.py | 42 +- src/access_nri_intake/source/utils.py | 110 +++- src/access_nri_intake/utils.py | 2 +- tests/test_builders.py | 723 ++++++++++++++++------- 4 files changed, 609 insertions(+), 268 deletions(-) diff --git a/src/access_nri_intake/source/builders.py b/src/access_nri_intake/source/builders.py index c43c9741..ed41a098 100644 --- a/src/access_nri_intake/source/builders.py +++ b/src/access_nri_intake/source/builders.py @@ -13,7 +13,13 @@ from ..utils import validate_against_schema from . 
import ESM_JSONSCHEMA, PATH_COLUMN, VARIABLE_COLUMN -from .utils import AccessNCFileInfo, EmptyFileError, get_timeinfo +from .utils import ( + EmptyFileError, + _AccessNCFileInfo, + _CoordVarInfo, + _DataVarInfo, + get_timeinfo, +) # Frequency translations FREQUENCIES: dict[str, tuple[int, str]] = { @@ -270,7 +276,9 @@ def parse_access_filename( return file_id, timestamp, frequency @classmethod - def parse_access_ncfile(cls, file: str, time_dim: str = "time") -> AccessNCFileInfo: + def parse_access_ncfile( + cls, file: str, time_dim: str = "time" + ) -> _AccessNCFileInfo: """ Get Intake-ESM datastore entry info from an ACCESS netcdf file @@ -305,39 +313,33 @@ def parse_access_ncfile(cls, file: str, time_dim: str = "time") -> AccessNCFileI decode_times=False, decode_coords=False, ) as ds: - variable_list = [] - variable_long_name_list = [] - variable_standard_name_list = [] - variable_cell_methods_list = [] - variable_units_list = [] + dvars = _DataVarInfo() + cvars = _CoordVarInfo() + for var in ds.data_vars: attrs = ds[var].attrs - if "long_name" in attrs: - variable_list.append(var) - variable_long_name_list.append(attrs["long_name"]) - variable_standard_name_list.append(attrs.get("standard_name", "")) - variable_cell_methods_list.append(attrs.get("cell_methods", "")) - variable_units_list.append(attrs.get("units", "")) + dvars.append_attrs(var, attrs) # type: ignore + + for var in ds.coords: + attrs = ds[var].attrs + cvars.append_attrs(var, attrs) # type: ignore start_date, end_date, frequency = get_timeinfo( ds, filename_frequency, time_dim ) - if not variable_list: + if not dvars.variable_list: raise EmptyFileError("This file contains no variables") - output_ncfile = AccessNCFileInfo( + output_ncfile = _AccessNCFileInfo( filename=filename, file_id=file_id, filename_timestamp=filename_timestamp, frequency=frequency, start_date=start_date, end_date=end_date, - variable=variable_list, # type: ignore - variable_long_name=variable_long_name_list, - 
variable_standard_name=variable_standard_name_list, - variable_cell_methods=variable_cell_methods_list, - variable_units=variable_units_list, + **dvars.to_ncinfo_dict(), + **cvars.to_ncinfo_dict(), ) return output_ncfile diff --git a/src/access_nri_intake/source/utils.py b/src/access_nri_intake/source/utils.py index f5c35240..816bb3b4 100644 --- a/src/access_nri_intake/source/utils.py +++ b/src/access_nri_intake/source/utils.py @@ -17,7 +17,7 @@ class EmptyFileError(Exception): @dataclass -class AccessNCFileInfo: +class _AccessNCFileInfo: """ Holds information about a NetCDF file that is used to create an intake-esm catalog entry. @@ -34,6 +34,12 @@ class AccessNCFileInfo: variable_standard_name: list[str] variable_cell_methods: list[str] variable_units: list[str] + coords: list[str] + coord_long_name: list[str] + coord_cartesian_axes: list[str] + coord_calendar_types: list[str] + coord_bounds: list[str] + coord_units: list[str] path: str = field(init=False) def __post_init__(self): @@ -45,26 +51,94 @@ def to_dict(self) -> dict[str, str | list[str]]: """ return asdict(self) - def to_tuple( - self, - ) -> tuple[ - str, - str | None, - str, - str, - str, - list[str], - list[str], - list[str], - list[str], - list[str], - ]: + +@dataclass +class _DataVarInfo: + """ + Holds information about the data variables in a NetCDF file that is used to + create an intake-esm catalog entry. + """ + + variable_list: list[str] = field(default_factory=list) + long_name_list: list[str] = field(default_factory=list) + standard_name_list: list[str] = field(default_factory=list) + cell_methods_list: list[str] = field(default_factory=list) + units_list: list[str] = field(default_factory=list) + + def append_attrs(self, var: str, attrs: dict) -> None: + """ + Append attributes to the DataVarInfo object, if the attribute has a + 'long_name' key. + + TODO: Why do we need a long name key? 
seems important + """ + if "long_name" not in attrs: + return None + + self.variable_list.append(var) + self.long_name_list.append(attrs["long_name"]) + self.standard_name_list.append(attrs.get("standard_name", "")) + self.cell_methods_list.append(attrs.get("cell_methods", "")) + self.units_list.append(attrs.get("units", "")) + + def to_ncinfo_dict(self) -> dict[str, list[str]]: + """ + Return a dictionary representation of the DataVarInfo object. Fields are + defined explicitly for use in the _AccessNCFileInfo constructor. """ - Return a tuple representation of the NcFileInfo object. + return { + "variable": self.variable_list, + "variable_long_name": self.long_name_list, + "variable_standard_name": self.standard_name_list, + "variable_cell_methods": self.cell_methods_list, + "variable_units": self.units_list, + } - Returns an insanely long tuple: aiming to clean this up. + +@dataclass +class _CoordVarInfo: + """ + Holds information about the coordinate variables in a NetCDF file that is + used to create an intake-esm catalog entry. + """ + + coord_list: list[str] = field(default_factory=list) + long_name_list: list[str] = field(default_factory=list) + cartesian_axis_list: list[str] = field(default_factory=list) + calendar_type_list: list[str] = field(default_factory=list) + bounds_list: list[str] = field(default_factory=list) + units_list: list[str] = field(default_factory=list) + + def append_attrs(self, var: str, attrs: dict) -> None: + """ + Append attributes to the CoordVarInfo object, if the attribute has a + 'long_name' key. + + TODO: Why do we need a long name key? 
seems important + """ + if "long_name" not in attrs: + return None + + self.coord_list.append(var) + self.long_name_list.append(attrs["long_name"]) + self.cartesian_axis_list.append(attrs.get("cartesian_axis", "")) + self.calendar_type_list.append(attrs.get("calendar_type", "")) + self.bounds_list.append(attrs.get("bounds", "")) + self.units_list.append(attrs.get("units", "")) + + def to_ncinfo_dict(self) -> dict[str, list[str]]: + """ + Return a dictionary representation of the CoordVarInfo object. Fields are + defined explicitly for use in the _AccessNCFileInfo constructor. """ - return tuple(asdict(self).values()) + return { + "coords": self.coord_list, + "coord_long_name": self.long_name_list, + "coord_cartesian_axes": self.cartesian_axis_list, + "coord_calendar_types": self.calendar_type_list, + "coord_bounds": self.bounds_list, + "coord_units": self.units_list, + } def _add_month_start(time, n: int): diff --git a/src/access_nri_intake/utils.py b/src/access_nri_intake/utils.py index ceae573d..aed413dc 100644 --- a/src/access_nri_intake/utils.py +++ b/src/access_nri_intake/utils.py @@ -22,7 +22,7 @@ def get_jsonschema(metadata_file: str, required: list) -> tuple[dict, dict]: """ schema_file = rsr.files("access_nri_intake").joinpath(metadata_file) - with schema_file.open(mode="r") as fpath: # type: ignore + with schema_file.open(mode="r") as fpath: # type: ignore schema = json.load(fpath) schema_required = schema.copy() diff --git a/tests/test_builders.py b/tests/test_builders.py index d2283d61..2cf523eb 100644 --- a/tests/test_builders.py +++ b/tests/test_builders.py @@ -8,7 +8,7 @@ import pytest from access_nri_intake.source import CORE_COLUMNS, builders -from access_nri_intake.source.utils import AccessNCFileInfo +from access_nri_intake.source.utils import _AccessNCFileInfo @pytest.mark.parametrize( @@ -365,411 +365,676 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm2Builder, "access-om2/output000/ocean/ocean_grid.nc", - 
AccessNCFileInfo( - "ocean_grid.nc", - "ocean_grid", - None, - "fx", - "none", - "none", - ["geolat_t", "geolon_t"], - ["tracer latitude", "tracer longitude"], - ["", ""], - ["time: point", "time: point"], - ["degrees_N", "degrees_E"], + _AccessNCFileInfo( + filename="ocean_grid.nc", + file_id="ocean_grid", + filename_timestamp=None, + frequency="fx", + start_date="none", + end_date="none", + variable=["geolat_t", "geolon_t"], + variable_long_name=["tracer latitude", "tracer longitude"], + variable_standard_name=["", ""], + variable_cell_methods=["time: point", "time: point"], + variable_units=["degrees_N", "degrees_E"], + coord_long_name=["tcell longitude", "tcell latitude"], + coords=["xt_ocean", "yt_ocean"], + coord_cartesian_axes=["X", "Y"], + coord_calendar_types=["", ""], + coord_bounds=["", ""], + coord_units=["degrees_E", "degrees_N"], ), ), ( builders.AccessOm2Builder, "access-om2/output000/ocean/ocean.nc", - AccessNCFileInfo( - "ocean.nc", - "ocean", - None, - "1yr", - "1900-01-01, 00:00:00", - "1910-01-01, 00:00:00", - ["temp", "time_bounds"], - ["Conservative temperature", "time axis boundaries"], - ["sea_water_conservative_temperature", ""], - ["time: mean", ""], - ["K", "days"], + _AccessNCFileInfo( + filename="ocean.nc", + file_id="ocean", + filename_timestamp=None, + frequency="1yr", + start_date="1900-01-01, 00:00:00", + end_date="1910-01-01, 00:00:00", + variable=["temp", "time_bounds"], + variable_long_name=["Conservative temperature", "time axis boundaries"], + variable_standard_name=["sea_water_conservative_temperature", ""], + variable_cell_methods=["time: mean", ""], + variable_units=["K", "days"], + coords=["nv", "st_ocean", "time", "xt_ocean", "yt_ocean"], + coord_long_name=[ + "vertex number", + "tcell zstar depth", + "time", + "tcell longitude", + "tcell latitude", + ], + coord_cartesian_axes=["N", "Z", "T", "X", "Y"], + coord_calendar_types=["", "", "NOLEAP", "", ""], + coord_bounds=["", "", "time_bounds", "", ""], + coord_units=[ + 
"none", + "meters", + "days since 1900-01-01 00:00:00", + "degrees_E", + "degrees_N", + ], ), ), ( builders.AccessOm2Builder, "access-om2/output000/ocean/ocean_month.nc", - AccessNCFileInfo( - "ocean_month.nc", - "ocean_month", - None, - "1mon", - "1900-01-01, 00:00:00", - "1910-01-01, 00:00:00", - ["mld", "time_bounds"], - [ + _AccessNCFileInfo( + filename="ocean_month.nc", + file_id="ocean_month", + filename_timestamp=None, + frequency="1mon", + start_date="1900-01-01, 00:00:00", + end_date="1910-01-01, 00:00:00", + variable=["mld", "time_bounds"], + variable_long_name=[ "mixed layer depth determined by density criteria", "time axis boundaries", ], - ["ocean_mixed_layer_thickness_defined_by_sigma_t", ""], - ["time: mean", ""], - ["m", "days"], + variable_standard_name=[ + "ocean_mixed_layer_thickness_defined_by_sigma_t", + "", + ], + variable_cell_methods=["time: mean", ""], + variable_units=["m", "days"], + coords=["nv", "time", "xt_ocean", "yt_ocean"], + coord_long_name=[ + "vertex number", + "time", + "tcell longitude", + "tcell latitude", + ], + coord_cartesian_axes=["N", "T", "X", "Y"], + coord_calendar_types=["", "NOLEAP", "", ""], + coord_bounds=["", "time_bounds", "", ""], + coord_units=[ + "none", + "days since 1900-01-01 00:00:00", + "degrees_E", + "degrees_N", + ], ), ), ( builders.AccessOm2Builder, "access-om2/output000/ocean/ocean_month_inst_nobounds.nc", - AccessNCFileInfo( - "ocean_month_inst_nobounds.nc", - "ocean_month_inst_nobounds", - None, - "1mon", - "1900-01-01, 00:00:00", - "1900-02-01, 00:00:00", - ["mld"], - ["mixed layer depth determined by density criteria"], - ["ocean_mixed_layer_thickness_defined_by_sigma_t"], - ["time: mean"], - ["m"], + _AccessNCFileInfo( + filename="ocean_month_inst_nobounds.nc", + file_id="ocean_month_inst_nobounds", + filename_timestamp=None, + frequency="1mon", + start_date="1900-01-01, 00:00:00", + end_date="1900-02-01, 00:00:00", + variable=["mld"], + variable_long_name=["mixed layer depth determined by 
density criteria"], + variable_standard_name=[ + "ocean_mixed_layer_thickness_defined_by_sigma_t" + ], + variable_cell_methods=["time: mean"], + variable_units=["m"], + coords=["time", "xt_ocean", "yt_ocean"], + coord_long_name=["time", "tcell longitude", "tcell latitude"], + coord_cartesian_axes=["T", "X", "Y"], + coord_calendar_types=["NOLEAP", "", ""], + coord_bounds=["time_bounds", "", ""], + coord_units=[ + "days since 1900-01-01 00:00:00", + "degrees_E", + "degrees_N", + ], ), ), ( builders.AccessOm2Builder, "access-om2/output000/ice/OUTPUT/iceh.1900-01.nc", - AccessNCFileInfo( - "iceh.1900-01.nc", - "iceh_XXXX_XX", - "1900-01", - "1mon", - "1900-01-01, 00:00:00", - "1900-02-01, 00:00:00", - ["TLAT", "TLON", "aice_m", "tarea", "time_bounds"], - [ + _AccessNCFileInfo( + filename="iceh.1900-01.nc", + file_id="iceh_XXXX_XX", + filename_timestamp="1900-01", + frequency="1mon", + start_date="1900-01-01, 00:00:00", + end_date="1900-02-01, 00:00:00", + variable=["TLAT", "TLON", "aice_m", "tarea", "time_bounds"], + variable_long_name=[ "T grid center latitude", "T grid center longitude", "ice area (aggregate)", "area of T grid cells", "boundaries for time-averaging interval", ], - ["", "", "", "", ""], - ["", "", "time: mean", "", ""], - [ + variable_standard_name=["", "", "", "", ""], + variable_cell_methods=["", "", "time: mean", "", ""], + variable_units=[ "degrees_north", "degrees_east", "1", "m^2", "days since 1900-01-01 00:00:00", ], + coords=["time"], + coord_long_name=["model time"], + coord_cartesian_axes=[""], + coord_calendar_types=[""], + coord_bounds=["time_bounds"], + coord_units=["days since 1900-01-01 00:00:00"], ), ), ( builders.AccessCm2Builder, "access-cm2/by578/history/atm/netCDF/by578a.pd201501_dai.nc", - AccessNCFileInfo( - "by578a.pd201501_dai.nc", - "by578a_pdXXXXXX_dai", - "201501", - "1day", - "2015-01-01, 00:00:00", - "2015-02-01, 00:00:00", - ["fld_s03i236"], - ["TEMPERATURE AT 1.5M"], - ["air_temperature"], - ["time: mean"], - ["K"], + 
_AccessNCFileInfo( + filename="by578a.pd201501_dai.nc", + file_id="by578a_pdXXXXXX_dai", + filename_timestamp="201501", + frequency="1day", + start_date="2015-01-01, 00:00:00", + end_date="2015-02-01, 00:00:00", + variable=["fld_s03i236"], + variable_long_name=["TEMPERATURE AT 1.5M"], + variable_standard_name=["air_temperature"], + variable_cell_methods=["time: mean"], + variable_units=["K"], + coords=[], + coord_long_name=[], + coord_cartesian_axes=[], + coord_calendar_types=[], + coord_bounds=[], + coord_units=[], ), ), ( builders.AccessCm2Builder, "access-cm2/by578/history/ice/iceh_d.2015-01.nc", - AccessNCFileInfo( - "iceh_d.2015-01.nc", - "iceh_d_XXXX_XX", - "2015-01", - "1day", - "2015-01-01, 00:00:00", - "2015-02-01, 00:00:00", - ["TLAT", "TLON", "aice", "tarea", "time_bounds"], - [ + _AccessNCFileInfo( + filename="iceh_d.2015-01.nc", + file_id="iceh_d_XXXX_XX", + filename_timestamp="2015-01", + frequency="1day", + start_date="2015-01-01, 00:00:00", + end_date="2015-02-01, 00:00:00", + variable=["TLAT", "TLON", "aice", "tarea", "time_bounds"], + variable_long_name=[ "T grid center latitude", "T grid center longitude", "ice area (aggregate)", "area of T grid cells", "boundaries for time-averaging interval", ], - ["", "", "", "", ""], - ["", "", "time: mean", "", ""], - [ + variable_standard_name=["", "", "", "", ""], + variable_cell_methods=["", "", "time: mean", "", ""], + variable_units=[ "degrees_north", "degrees_east", "1", "m^2", "days since 1850-01-01 00:00:00", ], + coords=["time"], + coord_long_name=["model time"], + coord_cartesian_axes=[""], + coord_calendar_types=[""], + coord_bounds=["time_bounds"], + coord_units=["days since 1850-01-01 00:00:00"], ), ), ( builders.AccessCm2Builder, "access-cm2/by578/history/ocn/ocean_daily.nc-20150630", - AccessNCFileInfo( - "ocean_daily.nc-20150630", - "ocean_daily", - None, - "1day", - "2015-01-01, 00:00:00", - "2015-07-01, 00:00:00", - ["sst", "time_bounds"], - ["Potential temperature", "time axis 
boundaries"], - ["sea_surface_temperature", ""], - ["time: mean", ""], - ["K", "days"], + _AccessNCFileInfo( + filename="ocean_daily.nc-20150630", + file_id="ocean_daily", + filename_timestamp=None, + frequency="1day", + start_date="2015-01-01, 00:00:00", + end_date="2015-07-01, 00:00:00", + variable=["sst", "time_bounds"], + variable_long_name=["Potential temperature", "time axis boundaries"], + variable_standard_name=["sea_surface_temperature", ""], + variable_cell_methods=["time: mean", ""], + variable_units=["K", "days"], + coords=["nv", "time", "xt_ocean", "yt_ocean"], + coord_long_name=[ + "vertex number", + "time", + "tcell longitude", + "tcell latitude", + ], + coord_cartesian_axes=["N", "T", "X", "Y"], + coord_calendar_types=["", "GREGORIAN", "", ""], + coord_bounds=["", "time_bounds", "", ""], + coord_units=[ + "none", + "days since 1850-01-01 00:00:00", + "degrees_E", + "degrees_N", + ], ), ), ( builders.AccessCm2Builder, "access-cm2/by578/history/ocn/ocean_scalar.nc-20150630", - AccessNCFileInfo( - "ocean_scalar.nc-20150630", - "ocean_scalar", - None, - "1mon", - "2015-01-01, 00:00:00", - "2015-07-01, 00:00:00", - ["temp_global_ave", "time_bounds"], - ["Global mean temp in liquid seawater", "time axis boundaries"], - ["sea_water_potential_temperature", ""], - ["time: mean", ""], - ["deg_C", "days"], + _AccessNCFileInfo( + filename="ocean_scalar.nc-20150630", + file_id="ocean_scalar", + filename_timestamp=None, + frequency="1mon", + start_date="2015-01-01, 00:00:00", + end_date="2015-07-01, 00:00:00", + variable=["temp_global_ave", "time_bounds"], + variable_long_name=[ + "Global mean temp in liquid seawater", + "time axis boundaries", + ], + variable_standard_name=["sea_water_potential_temperature", ""], + variable_cell_methods=["time: mean", ""], + variable_units=["deg_C", "days"], + coords=["nv", "scalar_axis", "time"], + coord_long_name=["vertex number", "none", "time"], + coord_cartesian_axes=["N", "X", "T"], + coord_calendar_types=["", "", 
"GREGORIAN"], + coord_bounds=["", "", "time_bounds"], + coord_units=["none", "none", "days since 1850-01-01 00:00:00"], ), ), ( builders.AccessEsm15Builder, "access-esm1-5/history/atm/netCDF/HI-C-05-r1.pa-185001_mon.nc", - AccessNCFileInfo( - "HI-C-05-r1.pa-185001_mon.nc", - "HI_C_05_r1_pa_XXXXXX_mon", - "185001", - "1mon", - "1850-01-01, 00:00:00", - "1850-02-01, 00:00:00", - ["fld_s03i236"], - ["TEMPERATURE AT 1.5M"], - ["air_temperature"], - ["time: mean"], - ["K"], + _AccessNCFileInfo( + filename="HI-C-05-r1.pa-185001_mon.nc", + file_id="HI_C_05_r1_pa_XXXXXX_mon", + filename_timestamp="185001", + frequency="1mon", + start_date="1850-01-01, 00:00:00", + end_date="1850-02-01, 00:00:00", + variable=["fld_s03i236"], + variable_long_name=["TEMPERATURE AT 1.5M"], + variable_standard_name=["air_temperature"], + variable_cell_methods=["time: mean"], + variable_units=["K"], + coords=[], + coord_long_name=[], + coord_cartesian_axes=[], + coord_calendar_types=[], + coord_bounds=[], + coord_units=[], ), ), ( builders.AccessEsm15Builder, "access-esm1-5/history/ice/iceh.1850-01.nc", - AccessNCFileInfo( - "iceh.1850-01.nc", - "iceh_XXXX_XX", - "1850-01", - "1mon", - "1850-01-01, 00:00:00", - "1850-02-01, 00:00:00", - ["TLAT", "TLON", "aice", "tarea", "time_bounds"], - [ + _AccessNCFileInfo( + filename="iceh.1850-01.nc", + file_id="iceh_XXXX_XX", + filename_timestamp="1850-01", + frequency="1mon", + start_date="1850-01-01, 00:00:00", + end_date="1850-02-01, 00:00:00", + variable=["TLAT", "TLON", "aice", "tarea", "time_bounds"], + variable_long_name=[ "T grid center latitude", "T grid center longitude", "ice area (aggregate)", "area of T grid cells", "boundaries for time-averaging interval", ], - ["", "", "", "", ""], - ["", "", "time: mean", "", ""], - [ + variable_standard_name=["", "", "", "", ""], + variable_cell_methods=["", "", "time: mean", "", ""], + variable_units=[ "degrees_north", "degrees_east", "1", "m^2", "days since 0001-01-01 00:00:00", ], + coords=["time"], + 
coord_long_name=["model time"], + coord_cartesian_axes=[""], + coord_calendar_types=[""], + coord_bounds=["time_bounds"], + coord_units=["days since 0001-01-01 00:00:00"], ), ), ( builders.AccessEsm15Builder, "access-esm1-5/history/ocn/ocean_bgc_ann.nc-18501231", - AccessNCFileInfo( - "ocean_bgc_ann.nc-18501231", - "ocean_bgc_ann", - None, - "1yr", - "1849-12-30, 00:00:00", - "1850-12-30, 00:00:00", - ["fgco2_raw", "time_bounds"], - ["Flux into ocean - DIC, inc. anth.", "time axis boundaries"], - ["", ""], - ["time: mean", ""], - ["mmol/m^2/s", "days"], + _AccessNCFileInfo( + filename="ocean_bgc_ann.nc-18501231", + file_id="ocean_bgc_ann", + filename_timestamp=None, + frequency="1yr", + start_date="1849-12-30, 00:00:00", + end_date="1850-12-30, 00:00:00", + variable=["fgco2_raw", "time_bounds"], + variable_long_name=[ + "Flux into ocean - DIC, inc. anth.", + "time axis boundaries", + ], + variable_standard_name=["", ""], + variable_cell_methods=["time: mean", ""], + variable_units=["mmol/m^2/s", "days"], + coords=["nv", "time", "xt_ocean", "yt_ocean"], + coord_long_name=[ + "vertex number", + "time", + "tcell longitude", + "tcell latitude", + ], + coord_cartesian_axes=["N", "T", "X", "Y"], + coord_calendar_types=["", "GREGORIAN", "", ""], + coord_bounds=["", "time_bounds", "", ""], + coord_units=[ + "none", + "days since 0001-01-01 00:00:00", + "degrees_E", + "degrees_N", + ], ), ), ( builders.AccessEsm15Builder, "access-esm1-5/history/ocn/ocean_bgc.nc-18501231", - AccessNCFileInfo( - "ocean_bgc.nc-18501231", - "ocean_bgc", - None, - "1mon", - "1849-12-30, 00:00:00", - "1850-12-30, 00:00:00", - ["o2", "time_bounds"], - ["o2", "time axis boundaries"], - ["", ""], - ["time: mean", ""], - ["mmol/m^3", "days"], + _AccessNCFileInfo( + filename="ocean_bgc.nc-18501231", + file_id="ocean_bgc", + filename_timestamp=None, + frequency="1mon", + start_date="1849-12-30, 00:00:00", + end_date="1850-12-30, 00:00:00", + variable=["o2", "time_bounds"], + variable_long_name=["o2", 
"time axis boundaries"], + variable_standard_name=["", ""], + variable_cell_methods=["time: mean", ""], + variable_units=["mmol/m^3", "days"], + coords=["nv", "st_ocean", "time", "xt_ocean", "yt_ocean"], + coord_long_name=[ + "vertex number", + "tcell zstar depth", + "time", + "tcell longitude", + "tcell latitude", + ], + coord_cartesian_axes=["N", "Z", "T", "X", "Y"], + coord_calendar_types=["", "", "GREGORIAN", "", ""], + coord_bounds=["", "", "time_bounds", "", ""], + coord_units=[ + "none", + "meters", + "days since 0001-01-01 00:00:00", + "degrees_E", + "degrees_N", + ], ), ), ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.native_1900_01.nc", - AccessNCFileInfo( - "GMOM_JRA_WD.mom6.h.native_1900_01.nc", - "GMOM_JRA_WD_mom6_h_native_XXXX_XX", - "1900_01", - "1mon", - "1900-01-01, 00:00:00", - "1900-02-01, 00:00:00", - ["average_DT", "average_T1", "average_T2", "thetao", "time_bnds"], - [ + _AccessNCFileInfo( + filename="GMOM_JRA_WD.mom6.h.native_1900_01.nc", + file_id="GMOM_JRA_WD_mom6_h_native_XXXX_XX", + filename_timestamp="1900_01", + frequency="1mon", + start_date="1900-01-01, 00:00:00", + end_date="1900-02-01, 00:00:00", + variable=[ + "average_DT", + "average_T1", + "average_T2", + "thetao", + "time_bnds", + ], + variable_long_name=[ "Length of average period", "Start time for average period", "End time for average period", "Sea Water Potential Temperature", "time axis boundaries", ], - ["", "", "", "sea_water_potential_temperature", ""], - ["", "", "", "area:mean zl:mean yh:mean xh:mean time: mean", ""], - [ + variable_standard_name=[ + "", + "", + "", + "sea_water_potential_temperature", + "", + ], + variable_cell_methods=[ + "", + "", + "", + "area:mean zl:mean yh:mean xh:mean time: mean", + "", + ], + variable_units=[ "days", "days since 0001-01-01 00:00:00", "days since 0001-01-01 00:00:00", "degC", "days since 0001-01-01 00:00:00", ], + coords=["nv", "time", "xh", "yh", "zl"], + coord_long_name=[ + "vertex number", + "time", + 
"h point nominal longitude", + "h point nominal latitude", + "Layer pseudo-depth, -z*", + ], + coord_cartesian_axes=["", "", "", "", ""], + coord_calendar_types=["", "NOLEAP", "", "", ""], + coord_bounds=["", "time_bnds", "", "", ""], + coord_units=[ + "", + "days since 0001-01-01 00:00:00", + "degrees_east", + "degrees_north", + "meter", + ], ), ), ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.sfc_1900_01_02.nc", - AccessNCFileInfo( - "GMOM_JRA_WD.mom6.h.sfc_1900_01_02.nc", - "GMOM_JRA_WD_mom6_h_sfc_XXXX_XX_XX", - "1900_01_02", - "1day", - "1900-01-01, 00:00:00", - "1900-01-02, 00:00:00", - ["average_DT", "average_T1", "average_T2", "time_bnds", "tos"], - [ + _AccessNCFileInfo( + filename="GMOM_JRA_WD.mom6.h.sfc_1900_01_02.nc", + file_id="GMOM_JRA_WD_mom6_h_sfc_XXXX_XX_XX", + filename_timestamp="1900_01_02", + frequency="1day", + start_date="1900-01-01, 00:00:00", + end_date="1900-01-02, 00:00:00", + variable=["average_DT", "average_T1", "average_T2", "time_bnds", "tos"], + variable_long_name=[ "Length of average period", "Start time for average period", "End time for average period", "time axis boundaries", "Sea Surface Temperature", ], - ["", "", "", "", "sea_surface_temperature"], - ["", "", "", "", "area:mean yh:mean xh:mean time: mean"], - [ + variable_standard_name=["", "", "", "", "sea_surface_temperature"], + variable_cell_methods=[ + "", + "", + "", + "", + "area:mean yh:mean xh:mean time: mean", + ], + variable_units=[ "days", "days since 0001-01-01 00:00:00", "days since 0001-01-01 00:00:00", "days since 0001-01-01 00:00:00", "degC", ], + coords=["nv", "time", "xh", "yh"], + coord_long_name=[ + "vertex number", + "time", + "h point nominal longitude", + "h point nominal latitude", + ], + coord_cartesian_axes=["", "", "", ""], + coord_calendar_types=["", "NOLEAP", "", ""], + coord_bounds=["", "time_bnds", "", ""], + coord_units=[ + "", + "days since 0001-01-01 00:00:00", + "degrees_east", + "degrees_north", + ], ), ), ( 
builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.static.nc", - AccessNCFileInfo( - "GMOM_JRA_WD.mom6.h.static.nc", - "GMOM_JRA_WD_mom6_h_static", - None, - "fx", - "none", - "none", - ["geolat", "geolon"], - ["Latitude of tracer (T) points", "Longitude of tracer (T) points"], - ["", ""], - ["time: point", "time: point"], - ["degrees_north", "degrees_east"], + _AccessNCFileInfo( + filename="GMOM_JRA_WD.mom6.h.static.nc", + file_id="GMOM_JRA_WD_mom6_h_static", + filename_timestamp=None, + frequency="fx", + start_date="none", + end_date="none", + variable=["geolat", "geolon"], + variable_long_name=[ + "Latitude of tracer (T) points", + "Longitude of tracer (T) points", + ], + variable_standard_name=["", ""], + variable_cell_methods=["time: point", "time: point"], + variable_units=["degrees_north", "degrees_east"], + coords=["xh", "yh"], + coord_long_name=[ + "h point nominal longitude", + "h point nominal latitude", + ], + coord_cartesian_axes=["", ""], + coord_calendar_types=["", ""], + coord_bounds=["", ""], + coord_units=["degrees_east", "degrees_north"], ), ), ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.z_1900_01.nc", - AccessNCFileInfo( - "GMOM_JRA_WD.mom6.h.z_1900_01.nc", - "GMOM_JRA_WD_mom6_h_z_XXXX_XX", - "1900_01", - "1mon", - "1900-01-01, 00:00:00", - "1900-02-01, 00:00:00", - ["average_DT", "average_T1", "average_T2", "thetao", "time_bnds"], - [ + _AccessNCFileInfo( + filename="GMOM_JRA_WD.mom6.h.z_1900_01.nc", + file_id="GMOM_JRA_WD_mom6_h_z_XXXX_XX", + filename_timestamp="1900_01", + frequency="1mon", + start_date="1900-01-01, 00:00:00", + end_date="1900-02-01, 00:00:00", + variable=[ + "average_DT", + "average_T1", + "average_T2", + "thetao", + "time_bnds", + ], + variable_long_name=[ "Length of average period", "Start time for average period", "End time for average period", "Sea Water Potential Temperature", "time axis boundaries", ], - ["", "", "", "sea_water_potential_temperature", ""], - ["", "", "", 
"area:mean z_l:mean yh:mean xh:mean time: mean", ""], - [ + variable_standard_name=[ + "", + "", + "", + "sea_water_potential_temperature", + "", + ], + variable_cell_methods=[ + "", + "", + "", + "area:mean z_l:mean yh:mean xh:mean time: mean", + "", + ], + variable_units=[ "days", "days since 0001-01-01 00:00:00", "days since 0001-01-01 00:00:00", "degC", "days since 0001-01-01 00:00:00", ], + coords=["nv", "time", "xh", "yh", "z_l"], + coord_long_name=[ + "vertex number", + "time", + "h point nominal longitude", + "h point nominal latitude", + "Depth at cell center", + ], + coord_cartesian_axes=["", "", "", "", ""], + coord_calendar_types=["", "NOLEAP", "", "", ""], + coord_bounds=["", "time_bnds", "", "", ""], + coord_units=[ + "", + "days since 0001-01-01 00:00:00", + "degrees_east", + "degrees_north", + "meters", + ], ), ), ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.cice.h.1900-01-01.nc", - AccessNCFileInfo( - "GMOM_JRA_WD.cice.h.1900-01-01.nc", - "GMOM_JRA_WD_cice_h_XXXX_XX_XX", - "1900-01-01", - "1day", - "1900-01-01, 00:00:00", - "1900-01-02, 00:00:00", - ["TLAT", "TLON", "aice", "tarea", "time_bounds"], - [ + _AccessNCFileInfo( + filename="GMOM_JRA_WD.cice.h.1900-01-01.nc", + file_id="GMOM_JRA_WD_cice_h_XXXX_XX_XX", + filename_timestamp="1900-01-01", + frequency="1day", + start_date="1900-01-01, 00:00:00", + end_date="1900-01-02, 00:00:00", + variable=["TLAT", "TLON", "aice", "tarea", "time_bounds"], + variable_long_name=[ "T grid center latitude", "T grid center longitude", "ice area (aggregate)", "area of T grid cells", "time interval endpoints", ], - ["", "", "", "", ""], - ["", "", "time: mean", "", ""], - [ + variable_standard_name=["", "", "", "", ""], + variable_cell_methods=["", "", "time: mean", "", ""], + variable_units=[ "degrees_north", "degrees_east", "1", "m^2", "days since 0000-01-01 00:00:00", ], + coords=["time"], + coord_long_name=["time"], + coord_cartesian_axes=[""], + coord_calendar_types=[""], + 
coord_bounds=["time_bounds"], + coord_units=["days since 0000-01-01 00:00:00"], ), ), ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.ww3.hi.1900-01-02-00000.nc", - AccessNCFileInfo( - "GMOM_JRA_WD.ww3.hi.1900-01-02-00000.nc", - "GMOM_JRA_WD_ww3_hi_XXXX_XX_XX_XXXXX", - "1900-01-02-00000", - "fx", # WW3 provides no time bounds - "1900-01-02, 00:00:00", - "1900-01-02, 00:00:00", - ["EF", "mapsta"], - ["1D spectral density", "map status"], - ["", ""], - ["", ""], - ["m2 s", "unitless"], + _AccessNCFileInfo( + filename="GMOM_JRA_WD.ww3.hi.1900-01-02-00000.nc", + file_id="GMOM_JRA_WD_ww3_hi_XXXX_XX_XX_XXXXX", + filename_timestamp="1900-01-02-00000", + frequency="fx", # WW3 provides no time bounds + start_date="1900-01-02, 00:00:00", + end_date="1900-01-02, 00:00:00", + variable=["EF", "mapsta"], + variable_long_name=["1D spectral density", "map status"], + variable_standard_name=["", ""], + variable_cell_methods=["", ""], + variable_units=["m2 s", "unitless"], + coords=[], + coord_long_name=[], + coord_cartesian_axes=[], + coord_calendar_types=[], + coord_bounds=[], + coord_units=[], ), ), ], From 7f34102f6341054b793d6159161ad39318b18101 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Wed, 25 Sep 2024 16:18:50 +0800 Subject: [PATCH 08/23] - Added mypy checks for 3.9..3.12. Note type hints fail on Python3.9 due to use of '|' type union syntax - might be worth consideration? 
- Removed a couple of unused imports, cleaned up some comments --- mypy/mypy_3.10.ini | 3 +++ mypy/mypy_3.11.ini | 3 +++ mypy.ini => mypy/mypy_3.12.ini | 0 mypy/mypy_3.9.ini | 3 +++ src/access_nri_intake/catalog/translators.py | 2 -- src/access_nri_intake/source/builders.py | 6 +++--- 6 files changed, 12 insertions(+), 5 deletions(-) create mode 100644 mypy/mypy_3.10.ini create mode 100644 mypy/mypy_3.11.ini rename mypy.ini => mypy/mypy_3.12.ini (100%) create mode 100644 mypy/mypy_3.9.ini diff --git a/mypy/mypy_3.10.ini b/mypy/mypy_3.10.ini new file mode 100644 index 00000000..99c9e230 --- /dev/null +++ b/mypy/mypy_3.10.ini @@ -0,0 +1,3 @@ +[mypy] +python_version = 3.10 +ignore_missing_imports = True \ No newline at end of file diff --git a/mypy/mypy_3.11.ini b/mypy/mypy_3.11.ini new file mode 100644 index 00000000..0413b9fc --- /dev/null +++ b/mypy/mypy_3.11.ini @@ -0,0 +1,3 @@ +[mypy] +python_version = 3.11 +ignore_missing_imports = True \ No newline at end of file diff --git a/mypy.ini b/mypy/mypy_3.12.ini similarity index 100% rename from mypy.ini rename to mypy/mypy_3.12.ini diff --git a/mypy/mypy_3.9.ini b/mypy/mypy_3.9.ini new file mode 100644 index 00000000..28f27a56 --- /dev/null +++ b/mypy/mypy_3.9.ini @@ -0,0 +1,3 @@ +[mypy] +python_version = 3.9 +ignore_missing_imports = True \ No newline at end of file diff --git a/src/access_nri_intake/catalog/translators.py b/src/access_nri_intake/catalog/translators.py index 8f8bad96..40231e2a 100644 --- a/src/access_nri_intake/catalog/translators.py +++ b/src/access_nri_intake/catalog/translators.py @@ -6,8 +6,6 @@ like the ACCESS-NRI catalog """ -from __future__ import annotations - from functools import partial from typing import Callable diff --git a/src/access_nri_intake/source/builders.py b/src/access_nri_intake/source/builders.py index ed41a098..d43ffb6b 100644 --- a/src/access_nri_intake/source/builders.py +++ b/src/access_nri_intake/source/builders.py @@ -389,9 +389,9 @@ def __init__(self, path): 
@classmethod def parser(cls, file) -> dict: try: - # Need to check, but I think that the .groups() method that mypy is - # getting upset about is what the try/catch is for here - if the regex - # doesn't match, then it will throw an exception. + # mypy gets upset as match can return None. I assume this is why we + # have try/except block in the first place? If so, we might be able + # to make this more explicit? match_groups = re.match(r".*/output\d+/([^/]*)/.*\.nc", file).groups() # type: ignore realm = match_groups[0] From bc0265ca8eb5d98c9dcd226a80aefd846fcfe153 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Thu, 26 Sep 2024 09:32:10 +0800 Subject: [PATCH 09/23] Rewrote type hints to be compatible with Python3.9 (T | None => Optional[T], etc) --- src/access_nri_intake/catalog/manager.py | 9 ++++---- src/access_nri_intake/catalog/translators.py | 4 ++-- src/access_nri_intake/source/builders.py | 23 +++++++++++--------- src/access_nri_intake/source/utils.py | 11 +++++----- 4 files changed, 26 insertions(+), 21 deletions(-) diff --git a/src/access_nri_intake/catalog/manager.py b/src/access_nri_intake/catalog/manager.py index f52e2126..37b6cf01 100644 --- a/src/access_nri_intake/catalog/manager.py +++ b/src/access_nri_intake/catalog/manager.py @@ -4,6 +4,7 @@ """ Manager for adding/updating intake sources in an intake-dataframe-catalog like the ACCESS-NRI catalog """ import os +from typing import Optional, Union import intake from intake_dataframe_catalog.core import DfFileCatalog @@ -61,10 +62,10 @@ def build_esm( name: str, description: str, builder, - path: list[str] | str, + path: Union[str, list[str]], translator=DefaultTranslator, - metadata: dict | None = None, - directory: str | None = None, + metadata: Optional[dict] = None, + directory: Optional[str] = None, overwrite: bool = False, **kwargs, ): @@ -129,7 +130,7 @@ def load( path: str, driver: str = "esm_datastore", translator=DefaultTranslator, - metadata: dict | None = None, + metadata: 
Optional[dict] = None, **kwargs, ): """ diff --git a/src/access_nri_intake/catalog/translators.py b/src/access_nri_intake/catalog/translators.py index 40231e2a..f237c165 100644 --- a/src/access_nri_intake/catalog/translators.py +++ b/src/access_nri_intake/catalog/translators.py @@ -7,7 +7,7 @@ """ from functools import partial -from typing import Callable +from typing import Callable, Optional import pandas as pd import tlz @@ -98,7 +98,7 @@ def _default_translator(self, column: str) -> pd.Series: return pd.Series([val] * len_df) - def translate(self, groupby: list[str] | None = None) -> pd.DataFrame: + def translate(self, groupby: Optional[list[str]] = None) -> pd.DataFrame: """ Return the translated :py:class:`~pandas.DataFrame` of metadata and merge into set of set of rows with unique values of the columns specified. diff --git a/src/access_nri_intake/source/builders.py b/src/access_nri_intake/source/builders.py index d43ffb6b..57480a91 100644 --- a/src/access_nri_intake/source/builders.py +++ b/src/access_nri_intake/source/builders.py @@ -7,6 +7,7 @@ import re import traceback from pathlib import Path +from typing import Optional, Union import xarray as xr from ecgtools.builder import INVALID_ASSET, TRACEBACK, Builder @@ -57,14 +58,14 @@ class BaseBuilder(Builder): def __init__( self, - path: str | list[str], + path: Union[str, list[str]], depth: int = 0, - exclude_patterns: list[str] | None = None, - include_patterns: list[str] | None = None, + exclude_patterns: Optional[list[str]] = None, + include_patterns: Optional[list[str]] = None, data_format: str = "netcdf", - groupby_attrs: list[str] | None = None, - aggregations: list[dict] | None = None, - storage_options: dict | None = None, + groupby_attrs: Optional[list[str]] = None, + aggregations: Optional[list[dict]] = None, + storage_options: Optional[dict] = None, joblib_parallel_kwargs: dict = {"n_jobs": multiprocessing.cpu_count()}, ): """ @@ -119,7 +120,7 @@ def parse(self): self._parse() return self - def 
_save(self, name: str, description: str, directory: str | None): + def _save(self, name: str, description: str, directory: Union[str, None]): super().save( name=name, path_column_name=PATH_COLUMN, @@ -134,7 +135,9 @@ def _save(self, name: str, description: str, directory: str | None): to_csv_kwargs={"compression": "gzip"}, ) - def save(self, name: str, description: str, directory: str | None = None) -> None: + def save( + self, name: str, description: str, directory: Optional[str] = None + ) -> None: """ Save datastore contents to a file. @@ -218,10 +221,10 @@ def parser(file): def parse_access_filename( cls, filename: str, - patterns: list[str] | None = None, + patterns: Optional[list[str]] = None, frequencies: dict = FREQUENCIES, redaction_fill: str = "X", - ) -> tuple[str, str | None, str | None]: + ) -> tuple[str, Union[str, None], Union[str, None]]: """ Parse an ACCESS model filename and return a file id and any time information diff --git a/src/access_nri_intake/source/utils.py b/src/access_nri_intake/source/utils.py index 816bb3b4..06e82827 100644 --- a/src/access_nri_intake/source/utils.py +++ b/src/access_nri_intake/source/utils.py @@ -7,6 +7,7 @@ from dataclasses import asdict, dataclass, field from datetime import timedelta from pathlib import Path +from typing import Union import cftime import xarray as xr @@ -23,9 +24,9 @@ class _AccessNCFileInfo: catalog entry. 
""" - filename: str | Path + filename: Union[str, Path] file_id: str - filename_timestamp: str | None + filename_timestamp: Union[str, None] frequency: str start_date: str end_date: str @@ -45,7 +46,7 @@ class _AccessNCFileInfo: def __post_init__(self): self.path = str(self.filename) - def to_dict(self) -> dict[str, str | list[str]]: + def to_dict(self) -> dict[str, Union[str, list[str]]]: """ Return a dictionary representation of the NcFileInfo object """ @@ -189,7 +190,7 @@ def _guess_start_end_dates(ts, te, frequency): def get_timeinfo( ds: xr.Dataset, - filename_frequency: str | None, + filename_frequency: Union[str, None], time_dim: str, ) -> tuple[str, str, str]: """ @@ -227,7 +228,7 @@ def _todate(t): time_format = "%Y-%m-%d, %H:%M:%S" ts = None te = None - frequency: str | tuple[int | None, str] = "fx" + frequency: Union[str, tuple[Union[int, None], str]] = "fx" has_time = time_dim in ds if has_time: From f84d383f2c48214d114cb7fa070d4943caf13c47 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Fri, 27 Sep 2024 09:56:49 +0800 Subject: [PATCH 10/23] Fixed some issues tests weren't catching --- src/access_nri_intake/source/builders.py | 4 ++-- src/access_nri_intake/source/utils.py | 5 +---- tests/test_builders.py | 22 ++++++++++++++++++++++ 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/src/access_nri_intake/source/builders.py b/src/access_nri_intake/source/builders.py index 57480a91..27a2235c 100644 --- a/src/access_nri_intake/source/builders.py +++ b/src/access_nri_intake/source/builders.py @@ -303,7 +303,6 @@ def parse_access_ncfile( """ file_path = Path(file) - filename = file_path.name file_id, filename_timestamp, filename_frequency = cls.parse_access_filename( file_path.stem @@ -335,7 +334,8 @@ def parse_access_ncfile( raise EmptyFileError("This file contains no variables") output_ncfile = _AccessNCFileInfo( - filename=filename, + filename=file_path.name, + path=file, file_id=file_id, filename_timestamp=filename_timestamp, 
frequency=frequency, diff --git a/src/access_nri_intake/source/utils.py b/src/access_nri_intake/source/utils.py index 06e82827..7d74e7d4 100644 --- a/src/access_nri_intake/source/utils.py +++ b/src/access_nri_intake/source/utils.py @@ -26,6 +26,7 @@ class _AccessNCFileInfo: filename: Union[str, Path] file_id: str + path: str filename_timestamp: Union[str, None] frequency: str start_date: str @@ -41,10 +42,6 @@ class _AccessNCFileInfo: coord_calendar_types: list[str] coord_bounds: list[str] coord_units: list[str] - path: str = field(init=False) - - def __post_init__(self): - self.path = str(self.filename) def to_dict(self) -> dict[str, Union[str, list[str]]]: """ diff --git a/tests/test_builders.py b/tests/test_builders.py index 2cf523eb..032ad94d 100644 --- a/tests/test_builders.py +++ b/tests/test_builders.py @@ -366,6 +366,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessOm2Builder, "access-om2/output000/ocean/ocean_grid.nc", _AccessNCFileInfo( + path=None, # type: ignore filename="ocean_grid.nc", file_id="ocean_grid", filename_timestamp=None, @@ -389,6 +390,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessOm2Builder, "access-om2/output000/ocean/ocean.nc", _AccessNCFileInfo( + path=None, # type: ignore filename="ocean.nc", file_id="ocean", filename_timestamp=None, @@ -424,6 +426,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessOm2Builder, "access-om2/output000/ocean/ocean_month.nc", _AccessNCFileInfo( + path=None, # type: ignore filename="ocean_month.nc", file_id="ocean_month", filename_timestamp=None, @@ -463,6 +466,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessOm2Builder, "access-om2/output000/ocean/ocean_month_inst_nobounds.nc", _AccessNCFileInfo( + path=None, # type: ignore filename="ocean_month_inst_nobounds.nc", file_id="ocean_month_inst_nobounds", filename_timestamp=None, @@ -492,6 +496,7 @@ def 
test_parse_access_filename(builder, filename, expected): builders.AccessOm2Builder, "access-om2/output000/ice/OUTPUT/iceh.1900-01.nc", _AccessNCFileInfo( + path=None, # type: ignore filename="iceh.1900-01.nc", file_id="iceh_XXXX_XX", filename_timestamp="1900-01", @@ -527,6 +532,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessCm2Builder, "access-cm2/by578/history/atm/netCDF/by578a.pd201501_dai.nc", _AccessNCFileInfo( + path=None, # type: ignore filename="by578a.pd201501_dai.nc", file_id="by578a_pdXXXXXX_dai", filename_timestamp="201501", @@ -550,6 +556,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessCm2Builder, "access-cm2/by578/history/ice/iceh_d.2015-01.nc", _AccessNCFileInfo( + path=None, # type: ignore filename="iceh_d.2015-01.nc", file_id="iceh_d_XXXX_XX", filename_timestamp="2015-01", @@ -585,6 +592,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessCm2Builder, "access-cm2/by578/history/ocn/ocean_daily.nc-20150630", _AccessNCFileInfo( + path=None, # type: ignore filename="ocean_daily.nc-20150630", file_id="ocean_daily", filename_timestamp=None, @@ -618,6 +626,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessCm2Builder, "access-cm2/by578/history/ocn/ocean_scalar.nc-20150630", _AccessNCFileInfo( + path=None, # type: ignore filename="ocean_scalar.nc-20150630", file_id="ocean_scalar", filename_timestamp=None, @@ -644,6 +653,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessEsm15Builder, "access-esm1-5/history/atm/netCDF/HI-C-05-r1.pa-185001_mon.nc", _AccessNCFileInfo( + path=None, # type: ignore filename="HI-C-05-r1.pa-185001_mon.nc", file_id="HI_C_05_r1_pa_XXXXXX_mon", filename_timestamp="185001", @@ -667,6 +677,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessEsm15Builder, "access-esm1-5/history/ice/iceh.1850-01.nc", _AccessNCFileInfo( + path=None, # type: ignore 
filename="iceh.1850-01.nc", file_id="iceh_XXXX_XX", filename_timestamp="1850-01", @@ -702,6 +713,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessEsm15Builder, "access-esm1-5/history/ocn/ocean_bgc_ann.nc-18501231", _AccessNCFileInfo( + path=None, # type: ignore filename="ocean_bgc_ann.nc-18501231", file_id="ocean_bgc_ann", filename_timestamp=None, @@ -738,6 +750,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessEsm15Builder, "access-esm1-5/history/ocn/ocean_bgc.nc-18501231", _AccessNCFileInfo( + path=None, # type: ignore filename="ocean_bgc.nc-18501231", file_id="ocean_bgc", filename_timestamp=None, @@ -773,6 +786,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.native_1900_01.nc", _AccessNCFileInfo( + path=None, # type: ignore filename="GMOM_JRA_WD.mom6.h.native_1900_01.nc", file_id="GMOM_JRA_WD_mom6_h_native_XXXX_XX", filename_timestamp="1900_01", @@ -838,6 +852,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.sfc_1900_01_02.nc", _AccessNCFileInfo( + path=None, # type: ignore filename="GMOM_JRA_WD.mom6.h.sfc_1900_01_02.nc", file_id="GMOM_JRA_WD_mom6_h_sfc_XXXX_XX_XX", filename_timestamp="1900_01_02", @@ -889,6 +904,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.static.nc", _AccessNCFileInfo( + path=None, # type: ignore filename="GMOM_JRA_WD.mom6.h.static.nc", file_id="GMOM_JRA_WD_mom6_h_static", filename_timestamp=None, @@ -918,6 +934,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.z_1900_01.nc", _AccessNCFileInfo( + path=None, # type: ignore filename="GMOM_JRA_WD.mom6.h.z_1900_01.nc", file_id="GMOM_JRA_WD_mom6_h_z_XXXX_XX", filename_timestamp="1900_01", @@ -983,6 
+1000,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.cice.h.1900-01-01.nc", _AccessNCFileInfo( + path=None, # type: ignore filename="GMOM_JRA_WD.cice.h.1900-01-01.nc", file_id="GMOM_JRA_WD_cice_h_XXXX_XX_XX", filename_timestamp="1900-01-01", @@ -1018,6 +1036,7 @@ def test_parse_access_filename(builder, filename, expected): builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.ww3.hi.1900-01-02-00000.nc", _AccessNCFileInfo( + path=None, # type: ignore filename="GMOM_JRA_WD.ww3.hi.1900-01-02-00000.nc", file_id="GMOM_JRA_WD_ww3_hi_XXXX_XX_XX_XXXXX", filename_timestamp="1900-01-02-00000", @@ -1042,4 +1061,7 @@ def test_parse_access_filename(builder, filename, expected): def test_parse_access_ncfile(test_data, builder, filename, expected): file = str(test_data / Path(filename)) + # Set the path to the test data directory + expected.path = file + assert builder.parse_access_ncfile(file) == expected From 7799c8db7ead1671930ef5cf49800776bc07b379 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Fri, 27 Sep 2024 12:51:53 +1000 Subject: [PATCH 11/23] Renamed => - makes indexing more consistent --- src/access_nri_intake/source/utils.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/access_nri_intake/source/utils.py b/src/access_nri_intake/source/utils.py index 7d74e7d4..a3a057e4 100644 --- a/src/access_nri_intake/source/utils.py +++ b/src/access_nri_intake/source/utils.py @@ -22,6 +22,12 @@ class _AccessNCFileInfo: """ Holds information about a NetCDF file that is used to create an intake-esm catalog entry. + + ______ + Notes: + Use of both path and filename seems redundant, but constructing filename from + the path using a __post_init__ method makes testing more difficult. On balance, + more explicit tests are probably more important than the slight redundancy. 
""" filename: Union[str, Path] @@ -130,7 +136,7 @@ def to_ncinfo_dict(self) -> dict[str, list[str]]: defined explicitly for use in the _AccessNCFileInfo constructor. """ return { - "coords": self.coord_list, + "coord": self.coord_list, "coord_long_name": self.long_name_list, "coord_cartesian_axes": self.cartesian_axis_list, "coord_calendar_types": self.calendar_type_list, From e6590c98db0b3309035ea9677f61da2a5ee1a858 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Fri, 27 Sep 2024 13:03:23 +1000 Subject: [PATCH 12/23] Fixed a couple of issues relating to changing from coords => coord in search (forgot to test) --- src/access_nri_intake/source/utils.py | 2 +- tests/test_builders.py | 38 +++++++++++++-------------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/access_nri_intake/source/utils.py b/src/access_nri_intake/source/utils.py index a3a057e4..3ec01e6d 100644 --- a/src/access_nri_intake/source/utils.py +++ b/src/access_nri_intake/source/utils.py @@ -42,7 +42,7 @@ class _AccessNCFileInfo: variable_standard_name: list[str] variable_cell_methods: list[str] variable_units: list[str] - coords: list[str] + coord: list[str] coord_long_name: list[str] coord_cartesian_axes: list[str] coord_calendar_types: list[str] diff --git a/tests/test_builders.py b/tests/test_builders.py index 032ad94d..d21857f0 100644 --- a/tests/test_builders.py +++ b/tests/test_builders.py @@ -379,7 +379,7 @@ def test_parse_access_filename(builder, filename, expected): variable_cell_methods=["time: point", "time: point"], variable_units=["degrees_N", "degrees_E"], coord_long_name=["tcell longitude", "tcell latitude"], - coords=["xt_ocean", "yt_ocean"], + coord=["xt_ocean", "yt_ocean"], coord_cartesian_axes=["X", "Y"], coord_calendar_types=["", ""], coord_bounds=["", ""], @@ -402,7 +402,7 @@ def test_parse_access_filename(builder, filename, expected): variable_standard_name=["sea_water_conservative_temperature", ""], variable_cell_methods=["time: mean", ""], 
variable_units=["K", "days"], - coords=["nv", "st_ocean", "time", "xt_ocean", "yt_ocean"], + coord=["nv", "st_ocean", "time", "xt_ocean", "yt_ocean"], coord_long_name=[ "vertex number", "tcell zstar depth", @@ -444,7 +444,7 @@ def test_parse_access_filename(builder, filename, expected): ], variable_cell_methods=["time: mean", ""], variable_units=["m", "days"], - coords=["nv", "time", "xt_ocean", "yt_ocean"], + coord=["nv", "time", "xt_ocean", "yt_ocean"], coord_long_name=[ "vertex number", "time", @@ -480,7 +480,7 @@ def test_parse_access_filename(builder, filename, expected): ], variable_cell_methods=["time: mean"], variable_units=["m"], - coords=["time", "xt_ocean", "yt_ocean"], + coord=["time", "xt_ocean", "yt_ocean"], coord_long_name=["time", "tcell longitude", "tcell latitude"], coord_cartesian_axes=["T", "X", "Y"], coord_calendar_types=["NOLEAP", "", ""], @@ -520,7 +520,7 @@ def test_parse_access_filename(builder, filename, expected): "m^2", "days since 1900-01-01 00:00:00", ], - coords=["time"], + coord=["time"], coord_long_name=["model time"], coord_cartesian_axes=[""], coord_calendar_types=[""], @@ -544,7 +544,7 @@ def test_parse_access_filename(builder, filename, expected): variable_standard_name=["air_temperature"], variable_cell_methods=["time: mean"], variable_units=["K"], - coords=[], + coord=[], coord_long_name=[], coord_cartesian_axes=[], coord_calendar_types=[], @@ -580,7 +580,7 @@ def test_parse_access_filename(builder, filename, expected): "m^2", "days since 1850-01-01 00:00:00", ], - coords=["time"], + coord=["time"], coord_long_name=["model time"], coord_cartesian_axes=[""], coord_calendar_types=[""], @@ -604,7 +604,7 @@ def test_parse_access_filename(builder, filename, expected): variable_standard_name=["sea_surface_temperature", ""], variable_cell_methods=["time: mean", ""], variable_units=["K", "days"], - coords=["nv", "time", "xt_ocean", "yt_ocean"], + coord=["nv", "time", "xt_ocean", "yt_ocean"], coord_long_name=[ "vertex number", "time", 
@@ -641,7 +641,7 @@ def test_parse_access_filename(builder, filename, expected): variable_standard_name=["sea_water_potential_temperature", ""], variable_cell_methods=["time: mean", ""], variable_units=["deg_C", "days"], - coords=["nv", "scalar_axis", "time"], + coord=["nv", "scalar_axis", "time"], coord_long_name=["vertex number", "none", "time"], coord_cartesian_axes=["N", "X", "T"], coord_calendar_types=["", "", "GREGORIAN"], @@ -665,7 +665,7 @@ def test_parse_access_filename(builder, filename, expected): variable_standard_name=["air_temperature"], variable_cell_methods=["time: mean"], variable_units=["K"], - coords=[], + coord=[], coord_long_name=[], coord_cartesian_axes=[], coord_calendar_types=[], @@ -701,7 +701,7 @@ def test_parse_access_filename(builder, filename, expected): "m^2", "days since 0001-01-01 00:00:00", ], - coords=["time"], + coord=["time"], coord_long_name=["model time"], coord_cartesian_axes=[""], coord_calendar_types=[""], @@ -728,7 +728,7 @@ def test_parse_access_filename(builder, filename, expected): variable_standard_name=["", ""], variable_cell_methods=["time: mean", ""], variable_units=["mmol/m^2/s", "days"], - coords=["nv", "time", "xt_ocean", "yt_ocean"], + coord=["nv", "time", "xt_ocean", "yt_ocean"], coord_long_name=[ "vertex number", "time", @@ -762,7 +762,7 @@ def test_parse_access_filename(builder, filename, expected): variable_standard_name=["", ""], variable_cell_methods=["time: mean", ""], variable_units=["mmol/m^3", "days"], - coords=["nv", "st_ocean", "time", "xt_ocean", "yt_ocean"], + coord=["nv", "st_ocean", "time", "xt_ocean", "yt_ocean"], coord_long_name=[ "vertex number", "tcell zstar depth", @@ -828,7 +828,7 @@ def test_parse_access_filename(builder, filename, expected): "degC", "days since 0001-01-01 00:00:00", ], - coords=["nv", "time", "xh", "yh", "zl"], + coord=["nv", "time", "xh", "yh", "zl"], coord_long_name=[ "vertex number", "time", @@ -882,7 +882,7 @@ def test_parse_access_filename(builder, filename, 
expected): "days since 0001-01-01 00:00:00", "degC", ], - coords=["nv", "time", "xh", "yh"], + coord=["nv", "time", "xh", "yh"], coord_long_name=[ "vertex number", "time", @@ -919,7 +919,7 @@ def test_parse_access_filename(builder, filename, expected): variable_standard_name=["", ""], variable_cell_methods=["time: point", "time: point"], variable_units=["degrees_north", "degrees_east"], - coords=["xh", "yh"], + coord=["xh", "yh"], coord_long_name=[ "h point nominal longitude", "h point nominal latitude", @@ -976,7 +976,7 @@ def test_parse_access_filename(builder, filename, expected): "degC", "days since 0001-01-01 00:00:00", ], - coords=["nv", "time", "xh", "yh", "z_l"], + coord=["nv", "time", "xh", "yh", "z_l"], coord_long_name=[ "vertex number", "time", @@ -1024,7 +1024,7 @@ def test_parse_access_filename(builder, filename, expected): "m^2", "days since 0000-01-01 00:00:00", ], - coords=["time"], + coord=["time"], coord_long_name=["time"], coord_cartesian_axes=[""], coord_calendar_types=[""], @@ -1048,7 +1048,7 @@ def test_parse_access_filename(builder, filename, expected): variable_standard_name=["", ""], variable_cell_methods=["", ""], variable_units=["m2 s", "unitless"], - coords=[], + coord=[], coord_long_name=[], coord_cartesian_axes=[], coord_calendar_types=[], From 0c96e28f347c553afdb22c335f89ab5fd6f19b0b Mon Sep 17 00:00:00 2001 From: Marc White Date: Mon, 30 Sep 2024 16:27:21 +1000 Subject: [PATCH 13/23] Add cmip6_ig45 to catalog --- config/cmip6.yaml | 4 ++++ config/experiments/cmip6_ig45/metadata.yaml | 26 +++++++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 config/experiments/cmip6_ig45/metadata.yaml diff --git a/config/cmip6.yaml b/config/cmip6.yaml index d50b0215..b446475b 100644 --- a/config/cmip6.yaml +++ b/config/cmip6.yaml @@ -11,3 +11,7 @@ sources: - metadata_yaml: /g/data/xp65/admin/intake/metadata/cmip6_oi10/metadata.yaml path: - /g/data/oi10/catalog/v2/esm/catalog.json + + - metadata_yaml: 
/g/data/xp65/admin/intake/metadata/cmip6_ig45/metadata.yaml + path: + - /g/data/ig45/catalog/v2/esm/catalog.json \ No newline at end of file diff --git a/config/experiments/cmip6_ig45/metadata.yaml b/config/experiments/cmip6_ig45/metadata.yaml new file mode 100644 index 00000000..8046f731 --- /dev/null +++ b/config/experiments/cmip6_ig45/metadata.yaml @@ -0,0 +1,26 @@ +name: cmip6_ig45 +experiment_uuid: c7021d1e-7ba2-11ef-beb5-000007d3fe80 +description: 20km regional projections for CORDEX-CMIP6 from the Queensland Future Climate Science Program +long_description: >- + This dataset includes projections at 20km, formatted to meet the CORDEX-CMIP6 data standards. + The 20km projections were derived from the 10km projections. +model: +- CMIP6 +frequency: +- +variable: +- +nominal_resolution: +- +version: +contact: NCI +email: help@nci.org.au +reference: +license: +url: https://geonetwork.nci.org.au/geonetwork/srv/eng/catalog.search#/metadata/f7465_8388_5100_7022 +parent_experiment: +related_experiments: +- +notes: +keywords: +- cmip \ No newline at end of file From 1d9f429cdc119ce9a1e68bb579ffd0d2123acf61 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Thu, 3 Oct 2024 11:06:43 +1000 Subject: [PATCH 14/23] - Moved coordinate variables back into variables (simpler interface) - Updated tests to better respect moving coordinate variables back into variables - Moved mypy setup stuff into to 'pre-commit-config.yaml' --- .pre-commit-config.yaml | 7 + mypy/mypy_3.10.ini | 3 - mypy/mypy_3.11.ini | 3 - mypy/mypy_3.12.ini | 3 - mypy/mypy_3.9.ini | 3 - src/access_nri_intake/source/builders.py | 9 +- src/access_nri_intake/source/utils.py | 52 --- tests/test_builders.py | 430 ++++++++++++----------- 8 files changed, 238 insertions(+), 272 deletions(-) delete mode 100644 mypy/mypy_3.10.ini delete mode 100644 mypy/mypy_3.11.ini delete mode 100644 mypy/mypy_3.12.ini delete mode 100644 mypy/mypy_3.9.ini diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 
13106eeb..2843edd9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,3 +10,10 @@ repos: hooks: - id: black language_version: python3 +# Mypy + - repo: https://github.com/pre-commit/mirrors-mypy + rev: 'v1.11.2' + hooks: + - id: mypy + name: mypy + additional_dependencies: [types-PyYAML==6.0.12.20240808] diff --git a/mypy/mypy_3.10.ini b/mypy/mypy_3.10.ini deleted file mode 100644 index 99c9e230..00000000 --- a/mypy/mypy_3.10.ini +++ /dev/null @@ -1,3 +0,0 @@ -[mypy] -python_version = 3.10 -ignore_missing_imports = True \ No newline at end of file diff --git a/mypy/mypy_3.11.ini b/mypy/mypy_3.11.ini deleted file mode 100644 index 0413b9fc..00000000 --- a/mypy/mypy_3.11.ini +++ /dev/null @@ -1,3 +0,0 @@ -[mypy] -python_version = 3.11 -ignore_missing_imports = True \ No newline at end of file diff --git a/mypy/mypy_3.12.ini b/mypy/mypy_3.12.ini deleted file mode 100644 index a47639ed..00000000 --- a/mypy/mypy_3.12.ini +++ /dev/null @@ -1,3 +0,0 @@ -[mypy] -python_version = 3.12 -ignore_missing_imports = True \ No newline at end of file diff --git a/mypy/mypy_3.9.ini b/mypy/mypy_3.9.ini deleted file mode 100644 index 28f27a56..00000000 --- a/mypy/mypy_3.9.ini +++ /dev/null @@ -1,3 +0,0 @@ -[mypy] -python_version = 3.9 -ignore_missing_imports = True \ No newline at end of file diff --git a/src/access_nri_intake/source/builders.py b/src/access_nri_intake/source/builders.py index 27a2235c..a05e8fed 100644 --- a/src/access_nri_intake/source/builders.py +++ b/src/access_nri_intake/source/builders.py @@ -17,7 +17,6 @@ from .utils import ( EmptyFileError, _AccessNCFileInfo, - _CoordVarInfo, _DataVarInfo, get_timeinfo, ) @@ -316,16 +315,11 @@ def parse_access_ncfile( decode_coords=False, ) as ds: dvars = _DataVarInfo() - cvars = _CoordVarInfo() - for var in ds.data_vars: + for var in ds.variables: attrs = ds[var].attrs dvars.append_attrs(var, attrs) # type: ignore - for var in ds.coords: - attrs = ds[var].attrs - cvars.append_attrs(var, attrs) # type: 
ignore - start_date, end_date, frequency = get_timeinfo( ds, filename_frequency, time_dim ) @@ -342,7 +336,6 @@ def parse_access_ncfile( start_date=start_date, end_date=end_date, **dvars.to_ncinfo_dict(), - **cvars.to_ncinfo_dict(), ) return output_ncfile diff --git a/src/access_nri_intake/source/utils.py b/src/access_nri_intake/source/utils.py index 3ec01e6d..c9082f32 100644 --- a/src/access_nri_intake/source/utils.py +++ b/src/access_nri_intake/source/utils.py @@ -42,12 +42,6 @@ class _AccessNCFileInfo: variable_standard_name: list[str] variable_cell_methods: list[str] variable_units: list[str] - coord: list[str] - coord_long_name: list[str] - coord_cartesian_axes: list[str] - coord_calendar_types: list[str] - coord_bounds: list[str] - coord_units: list[str] def to_dict(self) -> dict[str, Union[str, list[str]]]: """ @@ -99,52 +93,6 @@ def to_ncinfo_dict(self) -> dict[str, list[str]]: } -@dataclass -class _CoordVarInfo: - """ - Holds information about the coordinate variables in a NetCDF file that is - used to create an intake-esm catalog entry. - """ - - coord_list: list[str] = field(default_factory=list) - long_name_list: list[str] = field(default_factory=list) - cartesian_axis_list: list[str] = field(default_factory=list) - calendar_type_list: list[str] = field(default_factory=list) - bounds_list: list[str] = field(default_factory=list) - units_list: list[str] = field(default_factory=list) - - def append_attrs(self, var: str, attrs: dict) -> None: - """ - Append attributes to the CoordVarInfo object, if the attribute has a - 'long_name' key. - - TODO: Why do we need a long name key? 
seems important - """ - if "long_name" not in attrs: - return None - - self.coord_list.append(var) - self.long_name_list.append(attrs["long_name"]) - self.cartesian_axis_list.append(attrs.get("cartesian_axis", "")) - self.calendar_type_list.append(attrs.get("calendar_type", "")) - self.bounds_list.append(attrs.get("bounds", "")) - self.units_list.append(attrs.get("units", "")) - - def to_ncinfo_dict(self) -> dict[str, list[str]]: - """ - Return a dictionary representation of the CoordVarInfo object. Fields are - defined explicitly for use in the _AccessNCFileInfo constructor. - """ - return { - "coord": self.coord_list, - "coord_long_name": self.long_name_list, - "coord_cartesian_axes": self.cartesian_axis_list, - "coord_calendar_types": self.calendar_type_list, - "coord_bounds": self.bounds_list, - "coord_units": self.units_list, - } - - def _add_month_start(time, n: int): """Add months to cftime datetime and truncate to start""" year = time.year + ((time.month + n - 1) // 12) diff --git a/tests/test_builders.py b/tests/test_builders.py index d21857f0..c28f9eac 100644 --- a/tests/test_builders.py +++ b/tests/test_builders.py @@ -373,17 +373,16 @@ def test_parse_access_filename(builder, filename, expected): frequency="fx", start_date="none", end_date="none", - variable=["geolat_t", "geolon_t"], - variable_long_name=["tracer latitude", "tracer longitude"], - variable_standard_name=["", ""], - variable_cell_methods=["time: point", "time: point"], - variable_units=["degrees_N", "degrees_E"], - coord_long_name=["tcell longitude", "tcell latitude"], - coord=["xt_ocean", "yt_ocean"], - coord_cartesian_axes=["X", "Y"], - coord_calendar_types=["", ""], - coord_bounds=["", ""], - coord_units=["degrees_E", "degrees_N"], + variable=["geolat_t", "geolon_t", "xt_ocean", "yt_ocean"], + variable_long_name=[ + "tracer latitude", + "tracer longitude", + "tcell longitude", + "tcell latitude", + ], + variable_standard_name=["", "", "", ""], + variable_cell_methods=["time: point", 
"time: point", "", ""], + variable_units=["degrees_N", "degrees_E", "degrees_E", "degrees_N"], ), ), ( @@ -397,26 +396,40 @@ def test_parse_access_filename(builder, filename, expected): frequency="1yr", start_date="1900-01-01, 00:00:00", end_date="1910-01-01, 00:00:00", - variable=["temp", "time_bounds"], - variable_long_name=["Conservative temperature", "time axis boundaries"], - variable_standard_name=["sea_water_conservative_temperature", ""], - variable_cell_methods=["time: mean", ""], - variable_units=["K", "days"], - coord=["nv", "st_ocean", "time", "xt_ocean", "yt_ocean"], - coord_long_name=[ + variable=[ + "nv", + "st_ocean", + "temp", + "time", + "time_bounds", + "xt_ocean", + "yt_ocean", + ], + variable_long_name=[ "vertex number", "tcell zstar depth", + "Conservative temperature", "time", + "time axis boundaries", "tcell longitude", "tcell latitude", ], - coord_cartesian_axes=["N", "Z", "T", "X", "Y"], - coord_calendar_types=["", "", "NOLEAP", "", ""], - coord_bounds=["", "", "time_bounds", "", ""], - coord_units=[ + variable_standard_name=[ + "", + "", + "sea_water_conservative_temperature", + "", + "", + "", + "", + ], + variable_cell_methods=["", "", "time: mean", "", "", "", ""], + variable_units=[ "none", "meters", + "K", "days since 1900-01-01 00:00:00", + "days", "degrees_E", "degrees_N", ], @@ -433,30 +446,29 @@ def test_parse_access_filename(builder, filename, expected): frequency="1mon", start_date="1900-01-01, 00:00:00", end_date="1910-01-01, 00:00:00", - variable=["mld", "time_bounds"], + variable=["mld", "nv", "time", "time_bounds", "xt_ocean", "yt_ocean"], variable_long_name=[ "mixed layer depth determined by density criteria", + "vertex number", + "time", "time axis boundaries", + "tcell longitude", + "tcell latitude", ], variable_standard_name=[ "ocean_mixed_layer_thickness_defined_by_sigma_t", "", + "", + "", + "", + "", ], - variable_cell_methods=["time: mean", ""], - variable_units=["m", "days"], - coord=["nv", "time", "xt_ocean", 
"yt_ocean"], - coord_long_name=[ - "vertex number", - "time", - "tcell longitude", - "tcell latitude", - ], - coord_cartesian_axes=["N", "T", "X", "Y"], - coord_calendar_types=["", "NOLEAP", "", ""], - coord_bounds=["", "time_bounds", "", ""], - coord_units=[ + variable_cell_methods=["time: mean", "", "", "", "", ""], + variable_units=[ + "m", "none", "days since 1900-01-01 00:00:00", + "days", "degrees_E", "degrees_N", ], @@ -473,19 +485,22 @@ def test_parse_access_filename(builder, filename, expected): frequency="1mon", start_date="1900-01-01, 00:00:00", end_date="1900-02-01, 00:00:00", - variable=["mld"], - variable_long_name=["mixed layer depth determined by density criteria"], + variable=["mld", "time", "xt_ocean", "yt_ocean"], + variable_long_name=[ + "mixed layer depth determined by density criteria", + "time", + "tcell longitude", + "tcell latitude", + ], variable_standard_name=[ - "ocean_mixed_layer_thickness_defined_by_sigma_t" + "ocean_mixed_layer_thickness_defined_by_sigma_t", + "", + "", + "", ], - variable_cell_methods=["time: mean"], - variable_units=["m"], - coord=["time", "xt_ocean", "yt_ocean"], - coord_long_name=["time", "tcell longitude", "tcell latitude"], - coord_cartesian_axes=["T", "X", "Y"], - coord_calendar_types=["NOLEAP", "", ""], - coord_bounds=["time_bounds", "", ""], - coord_units=[ + variable_cell_methods=["time: mean", "", "", ""], + variable_units=[ + "m", "days since 1900-01-01 00:00:00", "degrees_E", "degrees_N", @@ -503,29 +518,25 @@ def test_parse_access_filename(builder, filename, expected): frequency="1mon", start_date="1900-01-01, 00:00:00", end_date="1900-02-01, 00:00:00", - variable=["TLAT", "TLON", "aice_m", "tarea", "time_bounds"], + variable=["TLAT", "TLON", "aice_m", "tarea", "time", "time_bounds"], variable_long_name=[ "T grid center latitude", "T grid center longitude", "ice area (aggregate)", "area of T grid cells", + "model time", "boundaries for time-averaging interval", ], - variable_standard_name=["", "", "", 
"", ""], - variable_cell_methods=["", "", "time: mean", "", ""], + variable_standard_name=["", "", "", "", "", ""], + variable_cell_methods=["", "", "time: mean", "", "", ""], variable_units=[ "degrees_north", "degrees_east", "1", "m^2", "days since 1900-01-01 00:00:00", + "days since 1900-01-01 00:00:00", ], - coord=["time"], - coord_long_name=["model time"], - coord_cartesian_axes=[""], - coord_calendar_types=[""], - coord_bounds=["time_bounds"], - coord_units=["days since 1900-01-01 00:00:00"], ), ), ( @@ -544,12 +555,6 @@ def test_parse_access_filename(builder, filename, expected): variable_standard_name=["air_temperature"], variable_cell_methods=["time: mean"], variable_units=["K"], - coord=[], - coord_long_name=[], - coord_cartesian_axes=[], - coord_calendar_types=[], - coord_bounds=[], - coord_units=[], ), ), ( @@ -563,29 +568,25 @@ def test_parse_access_filename(builder, filename, expected): frequency="1day", start_date="2015-01-01, 00:00:00", end_date="2015-02-01, 00:00:00", - variable=["TLAT", "TLON", "aice", "tarea", "time_bounds"], + variable=["TLAT", "TLON", "aice", "tarea", "time", "time_bounds"], variable_long_name=[ "T grid center latitude", "T grid center longitude", "ice area (aggregate)", "area of T grid cells", + "model time", "boundaries for time-averaging interval", ], - variable_standard_name=["", "", "", "", ""], - variable_cell_methods=["", "", "time: mean", "", ""], + variable_standard_name=["", "", "", "", "", ""], + variable_cell_methods=["", "", "time: mean", "", "", ""], variable_units=[ "degrees_north", "degrees_east", "1", "m^2", "days since 1850-01-01 00:00:00", + "days since 1850-01-01 00:00:00", ], - coord=["time"], - coord_long_name=["model time"], - coord_cartesian_axes=[""], - coord_calendar_types=[""], - coord_bounds=["time_bounds"], - coord_units=["days since 1850-01-01 00:00:00"], ), ), ( @@ -599,24 +600,22 @@ def test_parse_access_filename(builder, filename, expected): frequency="1day", start_date="2015-01-01, 00:00:00", 
end_date="2015-07-01, 00:00:00", - variable=["sst", "time_bounds"], - variable_long_name=["Potential temperature", "time axis boundaries"], - variable_standard_name=["sea_surface_temperature", ""], - variable_cell_methods=["time: mean", ""], - variable_units=["K", "days"], - coord=["nv", "time", "xt_ocean", "yt_ocean"], - coord_long_name=[ + variable=["nv", "sst", "time", "time_bounds", "xt_ocean", "yt_ocean"], + variable_long_name=[ "vertex number", + "Potential temperature", "time", + "time axis boundaries", "tcell longitude", "tcell latitude", ], - coord_cartesian_axes=["N", "T", "X", "Y"], - coord_calendar_types=["", "GREGORIAN", "", ""], - coord_bounds=["", "time_bounds", "", ""], - coord_units=[ + variable_standard_name=["", "sea_surface_temperature", "", "", "", ""], + variable_cell_methods=["", "time: mean", "", "", "", ""], + variable_units=[ "none", + "K", "days since 1850-01-01 00:00:00", + "days", "degrees_E", "degrees_N", ], @@ -633,20 +632,35 @@ def test_parse_access_filename(builder, filename, expected): frequency="1mon", start_date="2015-01-01, 00:00:00", end_date="2015-07-01, 00:00:00", - variable=["temp_global_ave", "time_bounds"], + variable=[ + "nv", + "scalar_axis", + "temp_global_ave", + "time", + "time_bounds", + ], variable_long_name=[ + "vertex number", + "none", "Global mean temp in liquid seawater", + "time", "time axis boundaries", ], - variable_standard_name=["sea_water_potential_temperature", ""], - variable_cell_methods=["time: mean", ""], - variable_units=["deg_C", "days"], - coord=["nv", "scalar_axis", "time"], - coord_long_name=["vertex number", "none", "time"], - coord_cartesian_axes=["N", "X", "T"], - coord_calendar_types=["", "", "GREGORIAN"], - coord_bounds=["", "", "time_bounds"], - coord_units=["none", "none", "days since 1850-01-01 00:00:00"], + variable_standard_name=[ + "", + "", + "sea_water_potential_temperature", + "", + "", + ], + variable_cell_methods=["", "", "time: mean", "", ""], + variable_units=[ + "none", + 
"none", + "deg_C", + "days since 1850-01-01 00:00:00", + "days", + ], ), ), ( @@ -665,12 +679,6 @@ def test_parse_access_filename(builder, filename, expected): variable_standard_name=["air_temperature"], variable_cell_methods=["time: mean"], variable_units=["K"], - coord=[], - coord_long_name=[], - coord_cartesian_axes=[], - coord_calendar_types=[], - coord_bounds=[], - coord_units=[], ), ), ( @@ -684,29 +692,25 @@ def test_parse_access_filename(builder, filename, expected): frequency="1mon", start_date="1850-01-01, 00:00:00", end_date="1850-02-01, 00:00:00", - variable=["TLAT", "TLON", "aice", "tarea", "time_bounds"], + variable=["TLAT", "TLON", "aice", "tarea", "time", "time_bounds"], variable_long_name=[ "T grid center latitude", "T grid center longitude", "ice area (aggregate)", "area of T grid cells", + "model time", "boundaries for time-averaging interval", ], - variable_standard_name=["", "", "", "", ""], - variable_cell_methods=["", "", "time: mean", "", ""], + variable_standard_name=["", "", "", "", "", ""], + variable_cell_methods=["", "", "time: mean", "", "", ""], variable_units=[ "degrees_north", "degrees_east", "1", "m^2", "days since 0001-01-01 00:00:00", + "days since 0001-01-01 00:00:00", ], - coord=["time"], - coord_long_name=["model time"], - coord_cartesian_axes=[""], - coord_calendar_types=[""], - coord_bounds=["time_bounds"], - coord_units=["days since 0001-01-01 00:00:00"], ), ), ( @@ -720,27 +724,29 @@ def test_parse_access_filename(builder, filename, expected): frequency="1yr", start_date="1849-12-30, 00:00:00", end_date="1850-12-30, 00:00:00", - variable=["fgco2_raw", "time_bounds"], + variable=[ + "fgco2_raw", + "nv", + "time", + "time_bounds", + "xt_ocean", + "yt_ocean", + ], variable_long_name=[ "Flux into ocean - DIC, inc. 
anth.", - "time axis boundaries", - ], - variable_standard_name=["", ""], - variable_cell_methods=["time: mean", ""], - variable_units=["mmol/m^2/s", "days"], - coord=["nv", "time", "xt_ocean", "yt_ocean"], - coord_long_name=[ "vertex number", "time", + "time axis boundaries", "tcell longitude", "tcell latitude", ], - coord_cartesian_axes=["N", "T", "X", "Y"], - coord_calendar_types=["", "GREGORIAN", "", ""], - coord_bounds=["", "time_bounds", "", ""], - coord_units=[ + variable_standard_name=["", "", "", "", "", ""], + variable_cell_methods=["time: mean", "", "", "", "", ""], + variable_units=[ + "mmol/m^2/s", "none", "days since 0001-01-01 00:00:00", + "days", "degrees_E", "degrees_N", ], @@ -757,26 +763,32 @@ def test_parse_access_filename(builder, filename, expected): frequency="1mon", start_date="1849-12-30, 00:00:00", end_date="1850-12-30, 00:00:00", - variable=["o2", "time_bounds"], - variable_long_name=["o2", "time axis boundaries"], - variable_standard_name=["", ""], - variable_cell_methods=["time: mean", ""], - variable_units=["mmol/m^3", "days"], - coord=["nv", "st_ocean", "time", "xt_ocean", "yt_ocean"], - coord_long_name=[ + variable=[ + "nv", + "o2", + "st_ocean", + "time", + "time_bounds", + "xt_ocean", + "yt_ocean", + ], + variable_long_name=[ "vertex number", + "o2", "tcell zstar depth", "time", + "time axis boundaries", "tcell longitude", "tcell latitude", ], - coord_cartesian_axes=["N", "Z", "T", "X", "Y"], - coord_calendar_types=["", "", "GREGORIAN", "", ""], - coord_bounds=["", "", "time_bounds", "", ""], - coord_units=[ + variable_standard_name=["", "", "", "", "", "", ""], + variable_cell_methods=["", "time: mean", "", "", "", "", ""], + variable_units=[ "none", + "mmol/m^3", "meters", "days since 0001-01-01 00:00:00", + "days", "degrees_E", "degrees_N", ], @@ -797,50 +809,57 @@ def test_parse_access_filename(builder, filename, expected): "average_DT", "average_T1", "average_T2", + "nv", "thetao", + "time", "time_bnds", + "xh", + "yh", + 
"zl", ], variable_long_name=[ "Length of average period", "Start time for average period", "End time for average period", + "vertex number", "Sea Water Potential Temperature", + "time", "time axis boundaries", + "h point nominal longitude", + "h point nominal latitude", + "Layer pseudo-depth, -z*", ], variable_standard_name=[ "", "", "", + "", "sea_water_potential_temperature", "", + "", + "", + "", + "", ], variable_cell_methods=[ "", "", "", + "", "area:mean zl:mean yh:mean xh:mean time: mean", "", + "", + "", + "", + "", ], variable_units=[ "days", "days since 0001-01-01 00:00:00", "days since 0001-01-01 00:00:00", + "", "degC", "days since 0001-01-01 00:00:00", - ], - coord=["nv", "time", "xh", "yh", "zl"], - coord_long_name=[ - "vertex number", - "time", - "h point nominal longitude", - "h point nominal latitude", - "Layer pseudo-depth, -z*", - ], - coord_cartesian_axes=["", "", "", "", ""], - coord_calendar_types=["", "NOLEAP", "", "", ""], - coord_bounds=["", "time_bnds", "", "", ""], - coord_units=[ - "", "days since 0001-01-01 00:00:00", "degrees_east", "degrees_north", @@ -859,42 +878,58 @@ def test_parse_access_filename(builder, filename, expected): frequency="1day", start_date="1900-01-01, 00:00:00", end_date="1900-01-02, 00:00:00", - variable=["average_DT", "average_T1", "average_T2", "time_bnds", "tos"], + variable=[ + "average_DT", + "average_T1", + "average_T2", + "nv", + "time", + "time_bnds", + "tos", + "xh", + "yh", + ], variable_long_name=[ "Length of average period", "Start time for average period", "End time for average period", + "vertex number", + "time", "time axis boundaries", "Sea Surface Temperature", + "h point nominal longitude", + "h point nominal latitude", + ], + variable_standard_name=[ + "", + "", + "", + "", + "", + "", + "sea_surface_temperature", + "", + "", ], - variable_standard_name=["", "", "", "", "sea_surface_temperature"], variable_cell_methods=[ "", "", "", "", + "", + "", "area:mean yh:mean xh:mean time: mean", + "", + 
"", ], variable_units=[ "days", "days since 0001-01-01 00:00:00", "days since 0001-01-01 00:00:00", - "days since 0001-01-01 00:00:00", - "degC", - ], - coord=["nv", "time", "xh", "yh"], - coord_long_name=[ - "vertex number", - "time", - "h point nominal longitude", - "h point nominal latitude", - ], - coord_cartesian_axes=["", "", "", ""], - coord_calendar_types=["", "NOLEAP", "", ""], - coord_bounds=["", "time_bnds", "", ""], - coord_units=[ "", "days since 0001-01-01 00:00:00", + "days since 0001-01-01 00:00:00", + "degC", "degrees_east", "degrees_north", ], @@ -911,23 +946,21 @@ def test_parse_access_filename(builder, filename, expected): frequency="fx", start_date="none", end_date="none", - variable=["geolat", "geolon"], + variable=["geolat", "geolon", "xh", "yh"], variable_long_name=[ "Latitude of tracer (T) points", "Longitude of tracer (T) points", - ], - variable_standard_name=["", ""], - variable_cell_methods=["time: point", "time: point"], - variable_units=["degrees_north", "degrees_east"], - coord=["xh", "yh"], - coord_long_name=[ "h point nominal longitude", "h point nominal latitude", ], - coord_cartesian_axes=["", ""], - coord_calendar_types=["", ""], - coord_bounds=["", ""], - coord_units=["degrees_east", "degrees_north"], + variable_standard_name=["", "", "", ""], + variable_cell_methods=["time: point", "time: point", "", ""], + variable_units=[ + "degrees_north", + "degrees_east", + "degrees_east", + "degrees_north", + ], ), ), ( @@ -945,50 +978,57 @@ def test_parse_access_filename(builder, filename, expected): "average_DT", "average_T1", "average_T2", + "nv", "thetao", + "time", "time_bnds", + "xh", + "yh", + "z_l", ], variable_long_name=[ "Length of average period", "Start time for average period", "End time for average period", + "vertex number", "Sea Water Potential Temperature", + "time", "time axis boundaries", + "h point nominal longitude", + "h point nominal latitude", + "Depth at cell center", ], variable_standard_name=[ "", "", "", + "", 
"sea_water_potential_temperature", "", + "", + "", + "", + "", ], variable_cell_methods=[ "", "", "", + "", "area:mean z_l:mean yh:mean xh:mean time: mean", "", + "", + "", + "", + "", ], variable_units=[ "days", "days since 0001-01-01 00:00:00", "days since 0001-01-01 00:00:00", + "", "degC", "days since 0001-01-01 00:00:00", - ], - coord=["nv", "time", "xh", "yh", "z_l"], - coord_long_name=[ - "vertex number", - "time", - "h point nominal longitude", - "h point nominal latitude", - "Depth at cell center", - ], - coord_cartesian_axes=["", "", "", "", ""], - coord_calendar_types=["", "NOLEAP", "", "", ""], - coord_bounds=["", "time_bnds", "", "", ""], - coord_units=[ - "", "days since 0001-01-01 00:00:00", "degrees_east", "degrees_north", @@ -1007,29 +1047,25 @@ def test_parse_access_filename(builder, filename, expected): frequency="1day", start_date="1900-01-01, 00:00:00", end_date="1900-01-02, 00:00:00", - variable=["TLAT", "TLON", "aice", "tarea", "time_bounds"], + variable=["TLAT", "TLON", "aice", "tarea", "time", "time_bounds"], variable_long_name=[ "T grid center latitude", "T grid center longitude", "ice area (aggregate)", "area of T grid cells", + "time", "time interval endpoints", ], - variable_standard_name=["", "", "", "", ""], - variable_cell_methods=["", "", "time: mean", "", ""], + variable_standard_name=["", "", "", "", "", ""], + variable_cell_methods=["", "", "time: mean", "", "", ""], variable_units=[ "degrees_north", "degrees_east", "1", "m^2", "days since 0000-01-01 00:00:00", + "days since 0000-01-01 00:00:00", ], - coord=["time"], - coord_long_name=["time"], - coord_cartesian_axes=[""], - coord_calendar_types=[""], - coord_bounds=["time_bounds"], - coord_units=["days since 0000-01-01 00:00:00"], ), ), ( @@ -1048,12 +1084,6 @@ def test_parse_access_filename(builder, filename, expected): variable_standard_name=["", ""], variable_cell_methods=["", ""], variable_units=["m2 s", "unitless"], - coord=[], - coord_long_name=[], - 
coord_cartesian_axes=[], - coord_calendar_types=[], - coord_bounds=[], - coord_units=[], ), ), ], From 396df8e8b0edaadfe742baeda6a1f1fc926ed8b2 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Thu, 10 Oct 2024 07:54:56 +0800 Subject: [PATCH 15/23] Cleaned up _cmip_realm_translator & added sorting to test (order unimportant, causing unnecessary test failures) --- src/access_nri_intake/catalog/translators.py | 6 +++--- tests/test_translators.py | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/access_nri_intake/catalog/translators.py b/src/access_nri_intake/catalog/translators.py index f237c165..21d34ec5 100644 --- a/src/access_nri_intake/catalog/translators.py +++ b/src/access_nri_intake/catalog/translators.py @@ -18,6 +18,7 @@ class TranslatorError(Exception): "Generic Exception for the Translator classes" + pass @@ -311,11 +312,10 @@ def _translate(string: str) -> tuple[str, ...]: } raw_realms = string.split(" ") - realms = [] + realms = set() for realm in raw_realms: realm = translations.get(realm, realm) - if realm not in realms: - realms.append(realm) + realms |= {realm} return tuple(realms) return series.apply(lambda string: _translate(string)) diff --git a/tests/test_translators.py b/tests/test_translators.py index 8b2b174f..baa4d73f 100644 --- a/tests/test_translators.py +++ b/tests/test_translators.py @@ -120,6 +120,9 @@ def test_cmip_realm_translator(input, expected): """Test translation of entries in the CMIP realm column""" series = pd.Series(input) translated = _cmip_realm_translator(series) + # Sort expected & translated to make the test less brittle + translated = translated.apply(lambda x: tuple(sorted(x))) + expected = [tuple(sorted(x)) for x in expected] assert list(translated) == expected From babc3da4a1a5e5aaf35443c95cf8af5826f3afeb Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Thu, 10 Oct 2024 14:48:58 +0800 Subject: [PATCH 16/23] Added _DispatchKeys dataclass to hold tagnames --- 
src/access_nri_intake/catalog/translators.py | 57 +++++++++++++++----- 1 file changed, 45 insertions(+), 12 deletions(-) diff --git a/src/access_nri_intake/catalog/translators.py b/src/access_nri_intake/catalog/translators.py index 3f01b9cf..3f40d72b 100644 --- a/src/access_nri_intake/catalog/translators.py +++ b/src/access_nri_intake/catalog/translators.py @@ -6,6 +6,7 @@ like the ACCESS-NRI catalog """ +from dataclasses import dataclass from functools import partial from typing import Callable, Optional @@ -67,6 +68,7 @@ def __init__(self, source: DataSource, columns: list[str]): column: partial(self._default_translator, column=column) for column in columns } + self._dispatch_keys = _DispatchKeys() def _default_translator(self, column: str) -> pd.Series: """ @@ -185,24 +187,31 @@ def __init__(self, source: DataSource, columns: list[str]): self._dispatch["frequency"] = self._frequency_translator self._dispatch["variable"] = self._variable_translator + self._dispatch_keys = _DispatchKeys( + model="source_id", + realm="realm", + frequency="frequency", + variable="variable_id", + ) + def _model_translator(self): """ Return model from source_id """ - return _to_tuple(self.source.df["source_id"]) + return _to_tuple(self.source.df[self._dispatch_keys.model]) def _realm_translator(self): """ Return realm, fixing a few issues """ - return _cmip_realm_translator(self.source.df["realm"]) + return _cmip_realm_translator(self.source.df[self._dispatch_keys.realm]) def _frequency_translator(self): """ Return frequency, fixing a few issues """ return _to_tuple( - self.source.df["frequency"].apply( + self.source.df[self._dispatch_keys.frequency].apply( lambda x: frequency_translations.get(x, x) ) ) @@ -211,7 +220,7 @@ def _variable_translator(self): """ Return variable as a tuple """ - return _to_tuple(self.source.df["variable_id"]) + return _to_tuple(self.source.df[self._dispatch_keys.variable]) class Cmip5Translator(DefaultTranslator): @@ -237,24 +246,31 @@ def __init__(self, 
source: DataSource, columns: list[str]): self._dispatch["frequency"] = self._frequency_translator self._dispatch["variable"] = self._variable_translator + self._dispatch_keys = _DispatchKeys( + model="model", + realm="realm", + frequency="frequency", + variable="variable", + ) + def _model_translator(self): """ Return variable as a tuple """ - return _to_tuple(self.source.df["model"]) + return _to_tuple(self.source.df[self._dispatch_keys.model]) def _realm_translator(self): """ Return realm, fixing a few issues """ - return _cmip_realm_translator(self.source.df["realm"]) + return _cmip_realm_translator(self.source.df[self._dispatch_keys.realm]) def _frequency_translator(self): """ Return frequency, fixing a few issues """ return _to_tuple( - self.source.df["frequency"].apply( + self.source.df[self._dispatch_keys.frequency].apply( lambda x: frequency_translations.get(x, x) ) ) @@ -263,7 +279,7 @@ def _variable_translator(self): """ Return variable as a tuple """ - return _to_tuple(self.source.df["variable"]) + return _to_tuple(self.source.df[self._dispatch_keys.variable]) class EraiTranslator(DefaultTranslator): @@ -285,12 +301,13 @@ def __init__(self, source: DataSource, columns: list[str]): super().__init__(source, columns) self._dispatch["variable"] = self._variable_translator + self._dispatch_keys = _DispatchKeys(variable="variable") def _variable_translator(self): """ Return variable as a tuple """ - return _to_tuple(self.source.df["variable"]) + return _to_tuple(self.source.df[self._dispatch_keys.variable]) class BarpaTranslator(DefaultTranslator): @@ -315,12 +332,18 @@ def __init__(self, source, columns): self._dispatch["realm"] = self._realm_translator self._dispatch["frequency"] = self._frequency_translator self._dispatch["variable"] = self._variable_translator + self._dispatch_keys = _DispatchKeys( + model="source_id", + realm="realm", + variable="variable_id", + frequency="freq", + ) def _model_translator(self): """ Return model from source_id """ - 
return _to_tuple(self.source.df["source_id"]) + return _to_tuple(self.source.df[self._dispatch_keys.model]) def _realm_translator(self): """ @@ -333,14 +356,24 @@ def _frequency_translator(self): Return frequency, fixing a few issues """ return _to_tuple( - self.source.df["freq"].apply(lambda x: frequency_translations.get(x, x)) + self.source.df[self._dispatch_keys.frequency].apply( + lambda x: frequency_translations.get(x, x) + ) ) def _variable_translator(self): """ Return variable as a tuple """ - return _to_tuple(self.source.df["variable_id"]) + return _to_tuple(self.source.df[self._dispatch_keys.variable]) + + +@dataclass +class _DispatchKeys: + model: Optional[str] = None + realm: Optional[str] = None + frequency: Optional[str] = None + variable: Optional[str] = None def _cmip_realm_translator(series) -> pd.Series: From 0e2d6698c130bebc6f451ece669cde8358a13210 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Thu, 10 Oct 2024 15:31:51 +0800 Subject: [PATCH 17/23] Refactored a bunch of the translations out to the DefaultTranslator --- src/access_nri_intake/catalog/translators.py | 161 ++++++++----------- tests/test_translators.py | 4 +- 2 files changed, 67 insertions(+), 98 deletions(-) diff --git a/src/access_nri_intake/catalog/translators.py b/src/access_nri_intake/catalog/translators.py index 3f40d72b..8027f906 100644 --- a/src/access_nri_intake/catalog/translators.py +++ b/src/access_nri_intake/catalog/translators.py @@ -16,7 +16,7 @@ from . 
import COLUMNS_WITH_ITERABLES -frequency_translations = { +FREQUENCY_TRANSLATIONS = { "3hrPt": "3hr", "6hrPt": "6hr", "daily": "1day", @@ -32,6 +32,36 @@ } +def _to_tuple(series: pd.Series) -> pd.Series: + """ + Make each entry in the provided series a tuple + + Parameters + ---------- + series: :py:class:`~pandas.Series` + A pandas Series or another object with an `apply` method + """ + return series.apply(lambda x: (x,)) + + +def tuplify_series(func: Callable) -> Callable: + """ + Decorator that wraps a function that returns a pandas Series and converts + each entry in the series to a tuple + """ + + def wrapper(*args, **kwargs): + # Check if the first argument is 'self' + if len(args) > 0 and hasattr(args[0], "__class__"): + self = args[0] + series = func(self, *args[1:], **kwargs) + else: + series = func(*args, **kwargs) + return _to_tuple(series) + + return wrapper + + class TranslatorError(Exception): "Generic Exception for the Translator classes" @@ -163,6 +193,35 @@ def _unique_values(series): return df[self.columns] # Preserve ordering + def _realm_translator(self) -> pd.Series: + """ + Return realm, fixing a few issues + """ + return _cmip_realm_translator(self.source.df[self._dispatch_keys.realm]) + + @tuplify_series + def _model_translator(self) -> pd.Series: + """ + Return model from dispatch_keys.model + """ + return self.source.df[self._dispatch_keys.model] + + @tuplify_series + def _frequency_translator(self) -> pd.Series: + """ + Return frequency, fixing a few issues + """ + return self.source.df[self._dispatch_keys.frequency].apply( + lambda x: FREQUENCY_TRANSLATIONS.get(x, x) + ) + + @tuplify_series + def _variable_translator(self) -> pd.Series: + """ + Return variable as a tuple + """ + return self.source.df[self._dispatch_keys.variable] + class Cmip6Translator(DefaultTranslator): """ @@ -194,34 +253,6 @@ def __init__(self, source: DataSource, columns: list[str]): variable="variable_id", ) - def _model_translator(self): - """ - Return model from 
source_id - """ - return _to_tuple(self.source.df[self._dispatch_keys.model]) - - def _realm_translator(self): - """ - Return realm, fixing a few issues - """ - return _cmip_realm_translator(self.source.df[self._dispatch_keys.realm]) - - def _frequency_translator(self): - """ - Return frequency, fixing a few issues - """ - return _to_tuple( - self.source.df[self._dispatch_keys.frequency].apply( - lambda x: frequency_translations.get(x, x) - ) - ) - - def _variable_translator(self): - """ - Return variable as a tuple - """ - return _to_tuple(self.source.df[self._dispatch_keys.variable]) - class Cmip5Translator(DefaultTranslator): """ @@ -253,34 +284,6 @@ def __init__(self, source: DataSource, columns: list[str]): variable="variable", ) - def _model_translator(self): - """ - Return variable as a tuple - """ - return _to_tuple(self.source.df[self._dispatch_keys.model]) - - def _realm_translator(self): - """ - Return realm, fixing a few issues - """ - return _cmip_realm_translator(self.source.df[self._dispatch_keys.realm]) - - def _frequency_translator(self): - """ - Return frequency, fixing a few issues - """ - return _to_tuple( - self.source.df[self._dispatch_keys.frequency].apply( - lambda x: frequency_translations.get(x, x) - ) - ) - - def _variable_translator(self): - """ - Return variable as a tuple - """ - return _to_tuple(self.source.df[self._dispatch_keys.variable]) - class EraiTranslator(DefaultTranslator): """ @@ -303,11 +306,11 @@ def __init__(self, source: DataSource, columns: list[str]): self._dispatch["variable"] = self._variable_translator self._dispatch_keys = _DispatchKeys(variable="variable") - def _variable_translator(self): - """ - Return variable as a tuple - """ - return _to_tuple(self.source.df[self._dispatch_keys.variable]) + def _realm_translator(self) -> pd.Series: + raise AttributeError("ERAI data does not have a realm column") + + def _frequency_translator(self) -> pd.Series: + raise AttributeError("ERAI data does not have a frequency 
column") class BarpaTranslator(DefaultTranslator): @@ -339,34 +342,12 @@ def __init__(self, source, columns): frequency="freq", ) - def _model_translator(self): - """ - Return model from source_id - """ - return _to_tuple(self.source.df[self._dispatch_keys.model]) - def _realm_translator(self): """ Return realm, fixing a few issues """ return self.source.df.apply(lambda x: ("none",), 1) - def _frequency_translator(self): - """ - Return frequency, fixing a few issues - """ - return _to_tuple( - self.source.df[self._dispatch_keys.frequency].apply( - lambda x: frequency_translations.get(x, x) - ) - ) - - def _variable_translator(self): - """ - Return variable as a tuple - """ - return _to_tuple(self.source.df[self._dispatch_keys.variable]) - @dataclass class _DispatchKeys: @@ -400,15 +381,3 @@ def _translate(string: str) -> tuple[str, ...]: return tuple(realms) return series.apply(lambda string: _translate(string)) - - -def _to_tuple(series: pd.Series) -> pd.Series: - """ - Make each entry in the provided series a tuple - - Parameters - ---------- - series: :py:class:`~pandas.Series` - A pandas Series or another object with an `apply` method - """ - return series.apply(lambda x: (x,)) diff --git a/tests/test_translators.py b/tests/test_translators.py index deea65da..825e7a5f 100644 --- a/tests/test_translators.py +++ b/tests/test_translators.py @@ -7,6 +7,7 @@ from access_nri_intake.catalog import CORE_COLUMNS, TRANSLATOR_GROUPBY_COLUMNS from access_nri_intake.catalog.translators import ( + FREQUENCY_TRANSLATIONS, BarpaTranslator, Cmip5Translator, Cmip6Translator, @@ -15,7 +16,6 @@ TranslatorError, _cmip_realm_translator, _to_tuple, - frequency_translations, ) @@ -68,7 +68,7 @@ def test_cmip_frequency_translator(input, expected): """Test translation of entries in the CMIP frequency column""" series = pd.Series(input) - translated = series.apply(lambda x: frequency_translations.get(x, x)) + translated = series.apply(lambda x: FREQUENCY_TRANSLATIONS.get(x, x)) assert 
list(translated) == expected From 28a8a460183178b841a11205de280692139e2aaa Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Fri, 11 Oct 2024 09:29:46 +0800 Subject: [PATCH 18/23] Updated translators.py to include CordexTranslator, added config/cordex.yaml and config/metadata_sources/cordex-ig45/metadata.yaml --- config/cordex.yaml | 9 ++ .../cordex-ig45/metadata.yaml | 127 ++++++++++++++++++ src/access_nri_intake/catalog/translators.py | 47 ++++++- 3 files changed, 181 insertions(+), 2 deletions(-) create mode 100644 config/cordex.yaml create mode 100644 config/metadata_sources/cordex-ig45/metadata.yaml diff --git a/config/cordex.yaml b/config/cordex.yaml new file mode 100644 index 00000000..da5338f4 --- /dev/null +++ b/config/cordex.yaml @@ -0,0 +1,9 @@ +builder: null + +translator: CordexTranslator + +sources: + + - metadata_yaml: /g/data/xp65/admin/access-nri-intake-catalog/config/metadata_sources/cordex-ig45/metadata.yaml + path: + - /g/data/ig45/catalog/v2/esm/catalog.json diff --git a/config/metadata_sources/cordex-ig45/metadata.yaml b/config/metadata_sources/cordex-ig45/metadata.yaml new file mode 100644 index 00000000..c118c402 --- /dev/null +++ b/config/metadata_sources/cordex-ig45/metadata.yaml @@ -0,0 +1,127 @@ +name: cmip6_ig45 +experiment_uuid: c7021d1e-7ba2-11ef-beb5-000007d3fe80 +description: 20km regional projections for CORDEX-CMIP6 from the Queensland Future Climate Science Program +long_description: >- + This dataset includes projections at 20km and 10km, formatted to meet the CORDEX-CMIP6 data standards. + The 20km projections were derived from the 10km projections. 
+model: +- CMIP6 +frequency: +- day +- mon +- 1hr +- fx +variable: +- clt +- tauv +- clh +- clwvi +- ua850 +- sund +- ua100m +- va250 +- uas +- prc +- vas +- mrfso +- rlds +- ta200 +- hus1000 +- hus600 +- prw +- hus850 +- va200 +- tas +- clivi +- zg200 +- rsut +- va600 +- rsdt +- tasmax +- sfcWindmax +- va850 +- mrso +- ps +- hus400 +- ta1000 +- ua250 +- tauu +- pr +- va925 +- snc +- hus200 +- clm +- zg500 +- hurs +- rlut +- hus300 +- rsds +- ua200 +- psl +- ta850 +- va400 +- zg400 +- snm +- ta925 +- prsn +- hus250 +- zg1000 +- ta600 +- zg925 +- huss +- ta500 +- va1000 +- zg700 +- zmla +- hfss +- zg850 +- ua925 +- zg600 +- ua300 +- rsus +- hus500 +- sfcWind +- ts +- va500 +- va100m +- ua500 +- ua700 +- va700 +- soilt +- snd +- ua1000 +- ta700 +- hfls +- tasmin +- zg250 +- cll +- hus700 +- rlus +- va300 +- ua600 +- hus925 +- ta250 +- ua400 +- prhmax +- sftlf +- ta400 +- ta300 +- snw +- zg300 +- orog +- sftlaf +nominal_resolution: +- 20km +- 10km +version: +contact: NCI +email: help@nci.org.au +reference: +license: +url: https://geonetwork.nci.org.au/geonetwork/srv/eng/catalog.search#/metadata/f7465_8388_5100_7022 +parent_experiment: +related_experiments: +- +notes: +keywords: +- cmip \ No newline at end of file diff --git a/src/access_nri_intake/catalog/translators.py b/src/access_nri_intake/catalog/translators.py index 8027f906..51398ca0 100644 --- a/src/access_nri_intake/catalog/translators.py +++ b/src/access_nri_intake/catalog/translators.py @@ -307,10 +307,14 @@ def __init__(self, source: DataSource, columns: list[str]): self._dispatch_keys = _DispatchKeys(variable="variable") def _realm_translator(self) -> pd.Series: - raise AttributeError("ERAI data does not have a realm column") + raise AttributeError( + f"{self.__class__.__name__}: data does not have a realm column" + ) def _frequency_translator(self) -> pd.Series: - raise AttributeError("ERAI data does not have a frequency column") + raise AttributeError( + f"{self.__class__.__name__}: data does not have a 
frequency column" + ) class BarpaTranslator(DefaultTranslator): @@ -349,8 +353,47 @@ def _realm_translator(self): return self.source.df.apply(lambda x: ("none",), 1) +class CordexTranslator(DefaultTranslator): + """ + Cordex Translator for translating metadata from the NCI CORDEX intake datastores. + """ + + def __init__(self, source, columns): + """ + Initialise a CordexTranslator + + Parameters + ---------- + source: :py:class:`~intake.DataSource` + The NCI CORDEX intake-esm datastore + columns: list of str + The columns to translate to (these are the core columns in the intake-dataframe-catalog) + """ + + super().__init__(source, columns) + self._dispatch["model"] = self._model_translator + self._dispatch["realm"] = self._realm_translator + self._dispatch["frequency"] = self._frequency_translator + self._dispatch["variable"] = self._variable_translator + + self._dispatch_keys = _DispatchKeys( + model="source_id", + frequency="frequency", + variable="variable_id", + ) + + def _realm_translator(self) -> pd.Series: + raise AttributeError( + f"{self.__class__.__name__}: data does not have a realm column" + ) + + @dataclass class _DispatchKeys: + """ + Data class to store the keys for the dispatch dictionary in the Translator classes + """ + model: Optional[str] = None realm: Optional[str] = None frequency: Optional[str] = None From 8dc5c2a5d39514b0b35ba977b52ccea32ba7fa8f Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Fri, 11 Oct 2024 11:29:50 +0800 Subject: [PATCH 19/23] Working cordex translator --- src/access_nri_intake/catalog/translators.py | 16 +++-- tests/data/esm_datastore/cordex-ig45.csv | 6 ++ tests/data/esm_datastore/cordex-ig45.json | 70 ++++++++++++++++++++ tests/test_translators.py | 18 +++++ 4 files changed, 103 insertions(+), 7 deletions(-) create mode 100644 tests/data/esm_datastore/cordex-ig45.csv create mode 100644 tests/data/esm_datastore/cordex-ig45.json diff --git a/src/access_nri_intake/catalog/translators.py 
b/src/access_nri_intake/catalog/translators.py index 51398ca0..32749411 100644 --- a/src/access_nri_intake/catalog/translators.py +++ b/src/access_nri_intake/catalog/translators.py @@ -308,12 +308,12 @@ def __init__(self, source: DataSource, columns: list[str]): def _realm_translator(self) -> pd.Series: raise AttributeError( - f"{self.__class__.__name__}: data does not have a realm column" + f"{self.__class__.__name__}: 'realm' does not require translation" ) def _frequency_translator(self) -> pd.Series: raise AttributeError( - f"{self.__class__.__name__}: data does not have a frequency column" + f"{self.__class__.__name__}: 'data' does not require translation" ) @@ -372,20 +372,22 @@ def __init__(self, source, columns): super().__init__(source, columns) self._dispatch["model"] = self._model_translator - self._dispatch["realm"] = self._realm_translator self._dispatch["frequency"] = self._frequency_translator self._dispatch["variable"] = self._variable_translator + self._dispatch["realm"] = self._variable_translator self._dispatch_keys = _DispatchKeys( model="source_id", frequency="frequency", variable="variable_id", + realm="realm", ) - def _realm_translator(self) -> pd.Series: - raise AttributeError( - f"{self.__class__.__name__}: data does not have a realm column" - ) + def _realm_translator(self): + """ + Return realm, fixing a few issues + """ + return self.source.df.apply(lambda x: ("none",), 1) @dataclass diff --git a/tests/data/esm_datastore/cordex-ig45.csv b/tests/data/esm_datastore/cordex-ig45.csv new file mode 100644 index 00000000..06f6a2fd --- /dev/null +++ b/tests/data/esm_datastore/cordex-ig45.csv @@ -0,0 +1,6 @@ +path,file_type,project_id,resolution,institution_id,source_id,experiment_id,member_id,frequency,variable_id,version,time_range 
+/g/data/ig45/QldFCP-2/output/CMIP6/DD/AUS-10i/UQ-DEC/ACCESS-CM2/ssp126/r2i1p1f1/CCAMoc-v2112/v1-r1/day/hus200/v20240709/hus200_AUS-10i_ACCESS-CM2_ssp126_r2i1p1f1_UQ-DEC_CCAMoc-v2112_v1-r1_day_20580101-20581231.nc,f,output,AUS-10i,UQ-DEC,ACCESS-CM2,ssp126,r2i1p1f1,day,hus200,v20240709,20580101-20581231 +/g/data/ig45/QldFCP-2/CORDEX/CMIP6/DD/AUS-20i/UQ-DEC/ACCESS-ESM1-5/ssp126/r20i1p1f1/CCAMoc-v2112/v1-r1/mon/va925/v20240722/va925_AUS-20i_ACCESS-ESM1-5_ssp126_r20i1p1f1_UQ-DEC_CCAMoc-v2112_v1-r1_mon_208101-209012.nc,f,CORDEX,AUS-20i,UQ-DEC,ACCESS-ESM1-5,ssp126,r20i1p1f1,mon,va925,v20240722,208101-209012 +/g/data/ig45/QldFCP-2/CORDEX/CMIP6/DD/AUS-20i/UQ-DEC/ACCESS-ESM1-5/ssp370/r6i1p1f1/CCAM-v2105/v1-r1/mon/clh/v20240722/clh_AUS-20i_ACCESS-ESM1-5_ssp370_r6i1p1f1_UQ-DEC_CCAM-v2105_v1-r1_mon_201501-202012.nc,f,CORDEX,AUS-20i,UQ-DEC,ACCESS-ESM1-5,ssp370,r6i1p1f1,mon,clh,v20240722,201501-202012 +/g/data/ig45/QldFCP-2/output/CMIP6/DD/AUS-10i/UQ-DEC/ACCESS-CM2/ssp126/r2i1p1f1/CCAMoc-v2112/v1-r1/day/ta850/v20240709/ta850_AUS-10i_ACCESS-CM2_ssp126_r2i1p1f1_UQ-DEC_CCAMoc-v2112_v1-r1_day_20340101-20341231.nc,f,output,AUS-10i,UQ-DEC,ACCESS-CM2,ssp126,r2i1p1f1,day,ta850,v20240709,20340101-20341231 +/g/data/ig45/QldFCP-2/CORDEX/CMIP6/DD/AUS-20i/UQ-DEC/NorESM2-MM/ssp126/r1i1p1f1/CCAMoc-v2112/v1-r1/mon/hus200/v20240722/hus200_AUS-20i_NorESM2-MM_ssp126_r1i1p1f1_UQ-DEC_CCAMoc-v2112_v1-r1_mon_201501-202012.nc,f,CORDEX,AUS-20i,UQ-DEC,NorESM2-MM,ssp126,r1i1p1f1,mon,hus200,v20240722,201501-202012 diff --git a/tests/data/esm_datastore/cordex-ig45.json b/tests/data/esm_datastore/cordex-ig45.json new file mode 100644 index 00000000..5fc783b9 --- /dev/null +++ b/tests/data/esm_datastore/cordex-ig45.json @@ -0,0 +1,70 @@ +{ + "id": "qldfcp-2-ig45", + "title": "qldfcp-2-ig45", + "description": "Datasets on Gadi, both publised and replicated. 
All file versions present are in the listing\nMaintained By: NCI\nContact: help@nci.org.au", + "assets": { + "column_name": "path", + "format": "netcdf" + }, + "aggregation_control": { + "variable_column_name": "variable_id", + "groupby_attrs": [ + "file_type", + "project_id", + "resolution", + "institution_id", + "source_id", + "experiment_id", + "member_id", + "frequency", + "variable_id", + "version" + ], + "aggregations": [ + { + "type": "join_existing", + "attribute_name": "time_range", + "options": { + "dim": "time" + } + } + ] + }, + "esmcat_version": "0.1.0", + "catalog_file": "cordex-ig45.csv", + "attributes": [ + { + "column_name": "file_type" + }, + { + "column_name": "project_id" + }, + { + "column_name": "resolution" + }, + { + "column_name": "institution_id" + }, + { + "column_name": "source_id" + }, + { + "column_name": "experiment_id" + }, + { + "column_name": "member_id" + }, + { + "column_name": "frequency" + }, + { + "column_name": "variable_id" + }, + { + "column_name": "version" + }, + { + "column_name": "time_range" + } + ] +} \ No newline at end of file diff --git a/tests/test_translators.py b/tests/test_translators.py index 825e7a5f..bf3d6708 100644 --- a/tests/test_translators.py +++ b/tests/test_translators.py @@ -11,6 +11,7 @@ BarpaTranslator, Cmip5Translator, Cmip6Translator, + CordexTranslator, DefaultTranslator, EraiTranslator, TranslatorError, @@ -263,3 +264,20 @@ def test_BarpaTranslator(test_data, groupby, n_entries): esmds.description = "description" df = BarpaTranslator(esmds, CORE_COLUMNS).translate(groupby) assert len(df) == n_entries + + +@pytest.mark.parametrize( + "groupby, n_entries", + [ + (None, 5), + (["variable"], 4), + (["frequency"], 2), + ], +) +def test_CordexTranslator(test_data, groupby, n_entries): + """Test CORDEX datastore translator""" + esmds = intake.open_esm_datastore(test_data / "esm_datastore/cordex-ig45.json") + esmds.name = "name" + esmds.description = "description" + df = CordexTranslator(esmds, 
CORE_COLUMNS).translate(groupby) + assert len(df) == n_entries From b3da5772a245b7eb2ec1086b3c77905727e646ec Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Mon, 14 Oct 2024 11:20:24 +0800 Subject: [PATCH 20/23] Revert "Merge branch '660-coordinate-variables' into 199-data-request-20km-regional-projections-for-cordex-cmip6-queensland-future-climate-science" This reverts commit 27e460f7d5b632a7ca7acb14cee8aae9ec942791, reversing changes made to 0c96e28f347c553afdb22c335f89ab5fd6f19b0b. --- .gitignore | 3 - .pre-commit-config.yaml | 7 - src/access_nri_intake/catalog/manager.py | 25 +- src/access_nri_intake/catalog/translators.py | 24 +- src/access_nri_intake/source/builders.py | 253 +++--- src/access_nri_intake/source/utils.py | 108 +-- src/access_nri_intake/utils.py | 13 +- tests/test_builders.py | 774 ++++++------------- 8 files changed, 418 insertions(+), 789 deletions(-) diff --git a/.gitignore b/.gitignore index b18b515f..da4e917f 100644 --- a/.gitignore +++ b/.gitignore @@ -131,6 +131,3 @@ dmypy.json sandpit.ipynb *.DS_Store bin/build_all.sh.o* - -# Vs Code -.vscode/ \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2843edd9..13106eeb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,10 +10,3 @@ repos: hooks: - id: black language_version: python3 -# Mypy - - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v1.11.2' - hooks: - - id: mypy - name: mypy - additional_dependencies: [types-PyYAML==6.0.12.20240808] diff --git a/src/access_nri_intake/catalog/manager.py b/src/access_nri_intake/catalog/manager.py index 37b6cf01..f3d03243 100644 --- a/src/access_nri_intake/catalog/manager.py +++ b/src/access_nri_intake/catalog/manager.py @@ -4,7 +4,6 @@ """ Manager for adding/updating intake sources in an intake-dataframe-catalog like the ACCESS-NRI catalog """ import os -from typing import Optional, Union import intake from intake_dataframe_catalog.core import DfFileCatalog @@ -31,7 +30,7 
@@ class CatalogManager: Add/update intake sources in an intake-dataframe-catalog like the ACCESS-NRI catalog """ - def __init__(self, path: str): + def __init__(self, path): """ Initialise a CatalogManager instance to add/update intake sources in a intake-dataframe-catalog like the ACCESS-NRI catalog @@ -59,14 +58,14 @@ def __init__(self, path: str): def build_esm( self, - name: str, - description: str, + name, + description, builder, - path: Union[str, list[str]], + path, translator=DefaultTranslator, - metadata: Optional[dict] = None, - directory: Optional[str] = None, - overwrite: bool = False, + metadata=None, + directory=None, + overwrite=False, **kwargs, ): """ @@ -125,12 +124,12 @@ def build_esm( def load( self, - name: str, - description: str, - path: str, - driver: str = "esm_datastore", + name, + description, + path, + driver="esm_datastore", translator=DefaultTranslator, - metadata: Optional[dict] = None, + metadata=None, **kwargs, ): """ diff --git a/src/access_nri_intake/catalog/translators.py b/src/access_nri_intake/catalog/translators.py index 59cf678a..c048d0a7 100644 --- a/src/access_nri_intake/catalog/translators.py +++ b/src/access_nri_intake/catalog/translators.py @@ -12,7 +12,6 @@ import pandas as pd import tlz -from intake import DataSource from . import COLUMNS_WITH_ITERABLES @@ -74,7 +73,7 @@ class DefaultTranslator: of metadata for use in an intake-dataframe-catalog. """ - def __init__(self, source: DataSource, columns: list[str]): + def __init__(self, source, columns): """ Initialise a DefaultTranslator. 
This Translator works as follows: @@ -94,13 +93,13 @@ def __init__(self, source: DataSource, columns: list[str]): self.source = source self.columns = columns - self._dispatch: dict[str, Callable[[], pd.Series]] = { + self._dispatch = { column: partial(self._default_translator, column=column) for column in columns } self._dispatch_keys = _DispatchKeys() - def _default_translator(self, column: str) -> pd.Series: + def _default_translator(self, column): """ Try to translate a column from a source using the default translator. This translator works as follows: - If the input source is an intake-esm datastore, the translator will first look for the column in the @@ -146,7 +145,7 @@ def _default_translator(self, column: str) -> pd.Series: return pd.Series([val] * len_df) - def translate(self, groupby: Optional[list[str]] = None) -> pd.DataFrame: + def translate(self, groupby=None): """ Return the translated :py:class:`~pandas.DataFrame` of metadata and merge into set of set of rows with unique values of the columns specified. @@ -228,7 +227,7 @@ class Cmip6Translator(DefaultTranslator): CMIP6 Translator for translating metadata from the NCI CMIP6 intake datastores. """ - def __init__(self, source: DataSource, columns: list[str]): + def __init__(self, source, columns): """ Initialise a Cmip6Translator @@ -259,7 +258,7 @@ class Cmip5Translator(DefaultTranslator): CMIP5 Translator for translating metadata from the NCI CMIP5 intake datastores. """ - def __init__(self, source: DataSource, columns: list[str]): + def __init__(self, source, columns): """ Initialise a Cmip5Translator @@ -290,7 +289,7 @@ class EraiTranslator(DefaultTranslator): ERAI Translator for translating metadata from the NCI ERA-Interim intake datastore. 
""" - def __init__(self, source: DataSource, columns: list[str]): + def __init__(self, source, columns): """ Initialise a EraiTranslator @@ -402,14 +401,13 @@ class _DispatchKeys: variable: Optional[str] = None -def _cmip_realm_translator(series) -> pd.Series: +def _cmip_realm_translator(series): """ - Return realm from CMIP realm metadata, fixing some issues. This function takes - a series of strings and returns a series of tuples as there are sometimes multiple - realms per cmip asset + Return realm from CMIP realm metadata, fixing some issues. This function returns + a tuple as there are sometimes multiple realms per cmip asset """ - def _translate(string: str) -> tuple[str, ...]: + def _translate(string): translations = { "na": "none", "landonly": "land", diff --git a/src/access_nri_intake/source/builders.py b/src/access_nri_intake/source/builders.py index a05e8fed..13aa5249 100644 --- a/src/access_nri_intake/source/builders.py +++ b/src/access_nri_intake/source/builders.py @@ -7,22 +7,16 @@ import re import traceback from pathlib import Path -from typing import Optional, Union import xarray as xr from ecgtools.builder import INVALID_ASSET, TRACEBACK, Builder from ..utils import validate_against_schema from . 
import ESM_JSONSCHEMA, PATH_COLUMN, VARIABLE_COLUMN -from .utils import ( - EmptyFileError, - _AccessNCFileInfo, - _DataVarInfo, - get_timeinfo, -) +from .utils import EmptyFileError, get_timeinfo # Frequency translations -FREQUENCIES: dict[str, tuple[int, str]] = { +FREQUENCIES = { "daily": (1, "day"), "_dai$": (1, "day"), "month": (1, "mon"), @@ -53,19 +47,19 @@ class BaseBuilder(Builder): """ # Base class carries an empty set - PATTERNS: list = [] + PATTERNS = [] def __init__( self, - path: Union[str, list[str]], - depth: int = 0, - exclude_patterns: Optional[list[str]] = None, - include_patterns: Optional[list[str]] = None, - data_format: str = "netcdf", - groupby_attrs: Optional[list[str]] = None, - aggregations: Optional[list[dict]] = None, - storage_options: Optional[dict] = None, - joblib_parallel_kwargs: dict = {"n_jobs": multiprocessing.cpu_count()}, + path, + depth=0, + exclude_patterns=None, + include_patterns=None, + data_format="netcdf", + groupby_attrs=None, + aggregations=None, + storage_options=None, + joblib_parallel_kwargs={"n_jobs": multiprocessing.cpu_count()}, ): """ This method should be overwritten. The expection is that some of these arguments @@ -119,7 +113,7 @@ def parse(self): self._parse() return self - def _save(self, name: str, description: str, directory: Union[str, None]): + def _save(self, name, description, directory): super().save( name=name, path_column_name=PATH_COLUMN, @@ -134,9 +128,7 @@ def _save(self, name: str, description: str, directory: Union[str, None]): to_csv_kwargs={"compression": "gzip"}, ) - def save( - self, name: str, description: str, directory: Optional[str] = None - ) -> None: + def save(self, name, description, directory=None): """ Save datastore contents to a file. 
@@ -218,12 +210,8 @@ def parser(file): @classmethod def parse_access_filename( - cls, - filename: str, - patterns: Optional[list[str]] = None, - frequencies: dict = FREQUENCIES, - redaction_fill: str = "X", - ) -> tuple[str, Union[str, None], Union[str, None]]: + cls, filename, patterns=None, frequencies=FREQUENCIES, redaction_fill: str = "X" + ): """ Parse an ACCESS model filename and return a file id and any time information @@ -231,22 +219,16 @@ def parse_access_filename( ---------- filename: str The filename to parse with the extension removed - patterns: list of str, optional - A list of regex patterns to match against the filename. If None, use the class PATTERNS - frequencies: dict, optional - A dictionary of regex patterns to match against the filename to determine the frequency - redaction_fill: str, optional - The character to replace time information with. Defaults to "X" Returns ------- file_id: str The file id constructed by redacting time information and replacing non-python characters with underscores - timestamp: str | None - A string of the redacted time information (e.g. "1990-01") if available, otherwise None - frequency: str | None - The frequency of the file if available in the filename, otherwise None + timestamp: str + A string of the redacted time information (e.g. 
"1990-01") + frequency: str + The frequency of the file if available in the filename """ if patterns is None: patterns = cls.PATTERNS @@ -278,9 +260,7 @@ def parse_access_filename( return file_id, timestamp, frequency @classmethod - def parse_access_ncfile( - cls, file: str, time_dim: str = "time" - ) -> _AccessNCFileInfo: + def parse_access_ncfile(cls, file, time_dim="time"): """ Get Intake-ESM datastore entry info from an ACCESS netcdf file @@ -293,18 +273,13 @@ def parse_access_ncfile( Returns ------- - output_nc_info: AccessNCFileInfo - A dataclass containing the information parsed from the file - - Raises - ------ - EmptyFileError: If the file contains no variables """ - file_path = Path(file) + file = Path(file) + filename = file.name file_id, filename_timestamp, filename_frequency = cls.parse_access_filename( - file_path.stem + file.stem ) with xr.open_dataset( @@ -314,31 +289,51 @@ def parse_access_ncfile( decode_times=False, decode_coords=False, ) as ds: - dvars = _DataVarInfo() - - for var in ds.variables: + variable_list = [] + variable_long_name_list = [] + variable_standard_name_list = [] + variable_cell_methods_list = [] + variable_units_list = [] + for var in ds.data_vars: attrs = ds[var].attrs - dvars.append_attrs(var, attrs) # type: ignore + if "long_name" in attrs: + variable_list.append(var) + variable_long_name_list.append(attrs["long_name"]) + if "standard_name" in attrs: + variable_standard_name_list.append(attrs["standard_name"]) + else: + variable_standard_name_list.append("") + if "cell_methods" in attrs: + variable_cell_methods_list.append(attrs["cell_methods"]) + else: + variable_cell_methods_list.append("") + if "units" in attrs: + variable_units_list.append(attrs["units"]) + else: + variable_units_list.append("") start_date, end_date, frequency = get_timeinfo( ds, filename_frequency, time_dim ) - if not dvars.variable_list: + if not variable_list: raise EmptyFileError("This file contains no variables") - output_ncfile = 
_AccessNCFileInfo( - filename=file_path.name, - path=file, - file_id=file_id, - filename_timestamp=filename_timestamp, - frequency=frequency, - start_date=start_date, - end_date=end_date, - **dvars.to_ncinfo_dict(), + outputs = ( + filename, + file_id, + filename_timestamp, + frequency, + start_date, + end_date, + variable_list, + variable_long_name_list, + variable_standard_name_list, + variable_cell_methods_list, + variable_units_list, ) - return output_ncfile + return outputs class AccessOm2Builder(BaseBuilder): @@ -383,23 +378,44 @@ def __init__(self, path): super().__init__(**kwargs) @classmethod - def parser(cls, file) -> dict: + def parser(cls, file): try: - # mypy gets upset as match can return None. I assume this is why we - # have try/except block in the first place? If so, we might be able - # to make this more explicit? - match_groups = re.match(r".*/output\d+/([^/]*)/.*\.nc", file).groups() # type: ignore + match_groups = re.match(r".*/output\d+/([^/]*)/.*\.nc", file).groups() realm = match_groups[0] if realm == "ice": realm = "seaIce" - nc_info = cls.parse_access_ncfile(file) - ncinfo_dict = nc_info.to_dict() - - ncinfo_dict["realm"] = realm - - return ncinfo_dict + ( + filename, + file_id, + _, + frequency, + start_date, + end_date, + variable_list, + variable_long_name_list, + variable_standard_name_list, + variable_cell_methods_list, + variable_units_list, + ) = cls.parse_access_ncfile(file) + + info = { + "path": str(file), + "realm": realm, + "variable": variable_list, + "frequency": frequency, + "start_date": start_date, + "end_date": end_date, + "variable_long_name": variable_long_name_list, + "variable_standard_name": variable_standard_name_list, + "variable_cell_methods": variable_cell_methods_list, + "variable_units": variable_units_list, + "filename": filename, + "file_id": file_id, + } + + return info except Exception: return {INVALID_ASSET: file, TRACEBACK: traceback.format_exc()} @@ -450,22 +466,47 @@ def __init__(self, path): 
super().__init__(**kwargs) @classmethod - def parser(cls, file) -> dict: + def parser(cls, file): try: - output_nc_info = cls.parse_access_ncfile(file) - ncinfo_dict = output_nc_info.to_dict() - - if "mom6" in ncinfo_dict["filename"]: + ( + filename, + file_id, + _, + frequency, + start_date, + end_date, + variable_list, + variable_long_name_list, + variable_standard_name_list, + variable_cell_methods_list, + variable_units_list, + ) = cls.parse_access_ncfile(file) + + if "mom6" in filename: realm = "ocean" - elif "ww3" in ncinfo_dict["filename"]: + elif "ww3" in filename: realm = "wave" - elif "cice" in ncinfo_dict["filename"]: + elif "cice" in filename: realm = "seaIce" else: raise ParserError(f"Cannot determine realm for file {file}") - ncinfo_dict["realm"] = realm - return ncinfo_dict + info = { + "path": str(file), + "realm": realm, + "variable": variable_list, + "frequency": frequency, + "start_date": start_date, + "end_date": end_date, + "variable_long_name": variable_long_name_list, + "variable_standard_name": variable_standard_name_list, + "variable_cell_methods": variable_cell_methods_list, + "variable_units": variable_units_list, + "filename": filename, + "file_id": file_id, + } + + return info except Exception: return {INVALID_ASSET: file, TRACEBACK: traceback.format_exc()} @@ -529,18 +570,42 @@ def parser(cls, file): realm = match_groups[1] realm_mapping = {"atm": "atmos", "ocn": "ocean", "ice": "seaIce"} - - nc_info = cls.parse_access_ncfile(file) - ncinfo_dict = nc_info.to_dict() + realm = realm_mapping[realm] + + ( + filename, + file_id, + _, + frequency, + start_date, + end_date, + variable_list, + variable_long_name_list, + variable_standard_name_list, + variable_cell_methods_list, + variable_units_list, + ) = cls.parse_access_ncfile(file) # Remove exp_id from file id so that members can be part of the same dataset - ncinfo_dict["file_id"] = re.sub(exp_id, "", ncinfo_dict["file_id"]).strip( - "_" - ) - ncinfo_dict["realm"] = realm_mapping[realm] - 
ncinfo_dict["member"] = exp_id - - return ncinfo_dict + file_id = re.sub(exp_id, "", file_id).strip("_") + + info = { + "path": str(file), + "realm": realm, + "variable": variable_list, + "frequency": frequency, + "start_date": start_date, + "end_date": end_date, + "member": exp_id, + "variable_long_name": variable_long_name_list, + "variable_standard_name": variable_standard_name_list, + "variable_cell_methods": variable_cell_methods_list, + "variable_units": variable_units_list, + "filename": filename, + "file_id": file_id, + } + + return info except Exception: return {INVALID_ASSET: file, TRACEBACK: traceback.format_exc()} diff --git a/src/access_nri_intake/source/utils.py b/src/access_nri_intake/source/utils.py index c9082f32..a3a8cfe9 100644 --- a/src/access_nri_intake/source/utils.py +++ b/src/access_nri_intake/source/utils.py @@ -4,96 +4,16 @@ """ Shared utilities for writing Intake-ESM builders and their parsers """ import warnings -from dataclasses import asdict, dataclass, field from datetime import timedelta -from pathlib import Path -from typing import Union import cftime -import xarray as xr class EmptyFileError(Exception): pass -@dataclass -class _AccessNCFileInfo: - """ - Holds information about a NetCDF file that is used to create an intake-esm - catalog entry. - - ______ - Notes: - Use of both path and filename seems redundant, but constructing filename from - the path using a __post_init__ method makes testing more difficult. On balance, - more explicit tests are probably more important than the slight redundancy. 
- """ - - filename: Union[str, Path] - file_id: str - path: str - filename_timestamp: Union[str, None] - frequency: str - start_date: str - end_date: str - variable: list[str] - variable_long_name: list[str] - variable_standard_name: list[str] - variable_cell_methods: list[str] - variable_units: list[str] - - def to_dict(self) -> dict[str, Union[str, list[str]]]: - """ - Return a dictionary representation of the NcFileInfo object - """ - return asdict(self) - - -@dataclass -class _DataVarInfo: - """ - Holds information about the data variables in a NetCDF file that is used to - create an intake-esm catalog entry. - """ - - variable_list: list[str] = field(default_factory=list) - long_name_list: list[str] = field(default_factory=list) - standard_name_list: list[str] = field(default_factory=list) - cell_methods_list: list[str] = field(default_factory=list) - units_list: list[str] = field(default_factory=list) - - def append_attrs(self, var: str, attrs: dict) -> None: - """ - Append attributes to the DataVarInfo object, if the attribute has a - 'long_name' key. - - TODO: Why do we need a long name key? seems important - """ - if "long_name" not in attrs: - return None - - self.variable_list.append(var) - self.long_name_list.append(attrs["long_name"]) - self.standard_name_list.append(attrs.get("standard_name", "")) - self.cell_methods_list.append(attrs.get("cell_methods", "")) - self.units_list.append(attrs.get("units", "")) - - def to_ncinfo_dict(self) -> dict[str, list[str]]: - """ - Return a dictionary representation of the DataVarInfo object. Fields are - defined explicitly for use in the _AccessNCFileInfo constructor. 
- """ - return { - "variable": self.variable_list, - "variable_long_name": self.long_name_list, - "variable_standard_name": self.standard_name_list, - "variable_cell_methods": self.cell_methods_list, - "variable_units": self.units_list, - } - - -def _add_month_start(time, n: int): +def _add_month_start(time, n): """Add months to cftime datetime and truncate to start""" year = time.year + ((time.month + n - 1) // 12) month = (time.month + n - 1) % 12 + 1 @@ -102,7 +22,7 @@ def _add_month_start(time, n: int): ) -def _add_year_start(time, n: int): +def _add_year_start(time, n): """Add years to cftime datetime and truncate to start""" return time.replace( year=time.year + n, month=1, day=1, hour=0, minute=0, second=0, microsecond=0 @@ -139,11 +59,7 @@ def _guess_start_end_dates(ts, te, frequency): return ts, te -def get_timeinfo( - ds: xr.Dataset, - filename_frequency: Union[str, None], - time_dim: str, -) -> tuple[str, str, str]: +def get_timeinfo(ds, filename_frequency, time_dim): """ Get start date, end date and frequency of a xarray dataset. 
Stolen and adapted from the cosima cookbook, see @@ -153,24 +69,8 @@ def get_timeinfo( ---------- ds: :py:class:`xarray.Dataset` The dataset to parse the time info from - filename_frequency: str - Frequency as determined from the filename time_dim: str The name of the time dimension - - Returns - ------- - start_date: str - The start date of the dataset - end_date: str - The end date of the dataset - frequency: str - The frequency of the dataset - - Raises - ------ - EmptyFileError - If the dataset has a valid unlimited dimension, but no data """ def _todate(t): @@ -179,7 +79,7 @@ def _todate(t): time_format = "%Y-%m-%d, %H:%M:%S" ts = None te = None - frequency: Union[str, tuple[Union[int, None], str]] = "fx" + frequency = "fx" has_time = time_dim in ds if has_time: diff --git a/src/access_nri_intake/utils.py b/src/access_nri_intake/utils.py index aed413dc..b2895f62 100644 --- a/src/access_nri_intake/utils.py +++ b/src/access_nri_intake/utils.py @@ -11,7 +11,7 @@ import yaml -def get_jsonschema(metadata_file: str, required: list) -> tuple[dict, dict]: +def get_jsonschema(metadata_file, required): """ Read in the required JSON schema, and annotate it with "required" fields. 
@@ -22,7 +22,7 @@ def get_jsonschema(metadata_file: str, required: list) -> tuple[dict, dict]: """ schema_file = rsr.files("access_nri_intake").joinpath(metadata_file) - with schema_file.open(mode="r") as fpath: # type: ignore + with schema_file.open(mode="r") as fpath: schema = json.load(fpath) schema_required = schema.copy() @@ -40,7 +40,7 @@ def get_jsonschema(metadata_file: str, required: list) -> tuple[dict, dict]: return schema, schema_required -def load_metadata_yaml(path: str, jsonschema: dict) -> dict: +def load_metadata_yaml(path, jsonschema): """ Load a metadata.yaml file, leaving dates as strings, and validate against a jsonschema, allowing for tuples as arrays @@ -77,7 +77,7 @@ def remove_implicit_resolver(cls, tag_to_remove): return metadata -def validate_against_schema(instance: dict, schema: dict) -> None: +def validate_against_schema(instance, schema): """ Validate a dictionary against a jsonschema, allowing for tuples as arrays @@ -87,11 +87,6 @@ def validate_against_schema(instance: dict, schema: dict) -> None: The instance to validate schema: dict The jsonschema - - Raises - ------ - jsonschema.exceptions.ValidationError - If the instance does not match the schema """ Validator = jsonschema.validators.validator_for(schema) diff --git a/tests/test_builders.py b/tests/test_builders.py index c28f9eac..7b8dc5d3 100644 --- a/tests/test_builders.py +++ b/tests/test_builders.py @@ -8,7 +8,6 @@ import pytest from access_nri_intake.source import CORE_COLUMNS, builders -from access_nri_intake.source.utils import _AccessNCFileInfo @pytest.mark.parametrize( @@ -365,725 +364,411 @@ def test_parse_access_filename(builder, filename, expected): ( builders.AccessOm2Builder, "access-om2/output000/ocean/ocean_grid.nc", - _AccessNCFileInfo( - path=None, # type: ignore - filename="ocean_grid.nc", - file_id="ocean_grid", - filename_timestamp=None, - frequency="fx", - start_date="none", - end_date="none", - variable=["geolat_t", "geolon_t", "xt_ocean", "yt_ocean"], - 
variable_long_name=[ - "tracer latitude", - "tracer longitude", - "tcell longitude", - "tcell latitude", - ], - variable_standard_name=["", "", "", ""], - variable_cell_methods=["time: point", "time: point", "", ""], - variable_units=["degrees_N", "degrees_E", "degrees_E", "degrees_N"], + ( + "ocean_grid.nc", + "ocean_grid", + None, + "fx", + "none", + "none", + ["geolat_t", "geolon_t"], + ["tracer latitude", "tracer longitude"], + ["", ""], + ["time: point", "time: point"], + ["degrees_N", "degrees_E"], ), ), ( builders.AccessOm2Builder, "access-om2/output000/ocean/ocean.nc", - _AccessNCFileInfo( - path=None, # type: ignore - filename="ocean.nc", - file_id="ocean", - filename_timestamp=None, - frequency="1yr", - start_date="1900-01-01, 00:00:00", - end_date="1910-01-01, 00:00:00", - variable=[ - "nv", - "st_ocean", - "temp", - "time", - "time_bounds", - "xt_ocean", - "yt_ocean", - ], - variable_long_name=[ - "vertex number", - "tcell zstar depth", - "Conservative temperature", - "time", - "time axis boundaries", - "tcell longitude", - "tcell latitude", - ], - variable_standard_name=[ - "", - "", - "sea_water_conservative_temperature", - "", - "", - "", - "", - ], - variable_cell_methods=["", "", "time: mean", "", "", "", ""], - variable_units=[ - "none", - "meters", - "K", - "days since 1900-01-01 00:00:00", - "days", - "degrees_E", - "degrees_N", - ], + ( + "ocean.nc", + "ocean", + None, + "1yr", + "1900-01-01, 00:00:00", + "1910-01-01, 00:00:00", + ["temp", "time_bounds"], + ["Conservative temperature", "time axis boundaries"], + ["sea_water_conservative_temperature", ""], + ["time: mean", ""], + ["K", "days"], ), ), ( builders.AccessOm2Builder, "access-om2/output000/ocean/ocean_month.nc", - _AccessNCFileInfo( - path=None, # type: ignore - filename="ocean_month.nc", - file_id="ocean_month", - filename_timestamp=None, - frequency="1mon", - start_date="1900-01-01, 00:00:00", - end_date="1910-01-01, 00:00:00", - variable=["mld", "nv", "time", "time_bounds", 
"xt_ocean", "yt_ocean"], - variable_long_name=[ + ( + "ocean_month.nc", + "ocean_month", + None, + "1mon", + "1900-01-01, 00:00:00", + "1910-01-01, 00:00:00", + ["mld", "time_bounds"], + [ "mixed layer depth determined by density criteria", - "vertex number", - "time", "time axis boundaries", - "tcell longitude", - "tcell latitude", - ], - variable_standard_name=[ - "ocean_mixed_layer_thickness_defined_by_sigma_t", - "", - "", - "", - "", - "", - ], - variable_cell_methods=["time: mean", "", "", "", "", ""], - variable_units=[ - "m", - "none", - "days since 1900-01-01 00:00:00", - "days", - "degrees_E", - "degrees_N", ], + ["ocean_mixed_layer_thickness_defined_by_sigma_t", ""], + ["time: mean", ""], + ["m", "days"], ), ), ( builders.AccessOm2Builder, "access-om2/output000/ocean/ocean_month_inst_nobounds.nc", - _AccessNCFileInfo( - path=None, # type: ignore - filename="ocean_month_inst_nobounds.nc", - file_id="ocean_month_inst_nobounds", - filename_timestamp=None, - frequency="1mon", - start_date="1900-01-01, 00:00:00", - end_date="1900-02-01, 00:00:00", - variable=["mld", "time", "xt_ocean", "yt_ocean"], - variable_long_name=[ - "mixed layer depth determined by density criteria", - "time", - "tcell longitude", - "tcell latitude", - ], - variable_standard_name=[ - "ocean_mixed_layer_thickness_defined_by_sigma_t", - "", - "", - "", - ], - variable_cell_methods=["time: mean", "", "", ""], - variable_units=[ - "m", - "days since 1900-01-01 00:00:00", - "degrees_E", - "degrees_N", - ], + ( + "ocean_month_inst_nobounds.nc", + "ocean_month_inst_nobounds", + None, + "1mon", + "1900-01-01, 00:00:00", + "1900-02-01, 00:00:00", + ["mld"], + ["mixed layer depth determined by density criteria"], + ["ocean_mixed_layer_thickness_defined_by_sigma_t"], + ["time: mean"], + ["m"], ), ), ( builders.AccessOm2Builder, "access-om2/output000/ice/OUTPUT/iceh.1900-01.nc", - _AccessNCFileInfo( - path=None, # type: ignore - filename="iceh.1900-01.nc", - file_id="iceh_XXXX_XX", - 
filename_timestamp="1900-01", - frequency="1mon", - start_date="1900-01-01, 00:00:00", - end_date="1900-02-01, 00:00:00", - variable=["TLAT", "TLON", "aice_m", "tarea", "time", "time_bounds"], - variable_long_name=[ + ( + "iceh.1900-01.nc", + "iceh_XXXX_XX", + "1900-01", + "1mon", + "1900-01-01, 00:00:00", + "1900-02-01, 00:00:00", + ["TLAT", "TLON", "aice_m", "tarea", "time_bounds"], + [ "T grid center latitude", "T grid center longitude", "ice area (aggregate)", "area of T grid cells", - "model time", "boundaries for time-averaging interval", ], - variable_standard_name=["", "", "", "", "", ""], - variable_cell_methods=["", "", "time: mean", "", "", ""], - variable_units=[ + ["", "", "", "", ""], + ["", "", "time: mean", "", ""], + [ "degrees_north", "degrees_east", "1", "m^2", "days since 1900-01-01 00:00:00", - "days since 1900-01-01 00:00:00", ], ), ), ( builders.AccessCm2Builder, "access-cm2/by578/history/atm/netCDF/by578a.pd201501_dai.nc", - _AccessNCFileInfo( - path=None, # type: ignore - filename="by578a.pd201501_dai.nc", - file_id="by578a_pdXXXXXX_dai", - filename_timestamp="201501", - frequency="1day", - start_date="2015-01-01, 00:00:00", - end_date="2015-02-01, 00:00:00", - variable=["fld_s03i236"], - variable_long_name=["TEMPERATURE AT 1.5M"], - variable_standard_name=["air_temperature"], - variable_cell_methods=["time: mean"], - variable_units=["K"], + ( + "by578a.pd201501_dai.nc", + "by578a_pdXXXXXX_dai", + "201501", + "1day", + "2015-01-01, 00:00:00", + "2015-02-01, 00:00:00", + ["fld_s03i236"], + ["TEMPERATURE AT 1.5M"], + ["air_temperature"], + ["time: mean"], + ["K"], ), ), ( builders.AccessCm2Builder, "access-cm2/by578/history/ice/iceh_d.2015-01.nc", - _AccessNCFileInfo( - path=None, # type: ignore - filename="iceh_d.2015-01.nc", - file_id="iceh_d_XXXX_XX", - filename_timestamp="2015-01", - frequency="1day", - start_date="2015-01-01, 00:00:00", - end_date="2015-02-01, 00:00:00", - variable=["TLAT", "TLON", "aice", "tarea", "time", 
"time_bounds"], - variable_long_name=[ + ( + "iceh_d.2015-01.nc", + "iceh_d_XXXX_XX", + "2015-01", + "1day", + "2015-01-01, 00:00:00", + "2015-02-01, 00:00:00", + ["TLAT", "TLON", "aice", "tarea", "time_bounds"], + [ "T grid center latitude", "T grid center longitude", "ice area (aggregate)", "area of T grid cells", - "model time", "boundaries for time-averaging interval", ], - variable_standard_name=["", "", "", "", "", ""], - variable_cell_methods=["", "", "time: mean", "", "", ""], - variable_units=[ + ["", "", "", "", ""], + ["", "", "time: mean", "", ""], + [ "degrees_north", "degrees_east", "1", "m^2", "days since 1850-01-01 00:00:00", - "days since 1850-01-01 00:00:00", ], ), ), ( builders.AccessCm2Builder, "access-cm2/by578/history/ocn/ocean_daily.nc-20150630", - _AccessNCFileInfo( - path=None, # type: ignore - filename="ocean_daily.nc-20150630", - file_id="ocean_daily", - filename_timestamp=None, - frequency="1day", - start_date="2015-01-01, 00:00:00", - end_date="2015-07-01, 00:00:00", - variable=["nv", "sst", "time", "time_bounds", "xt_ocean", "yt_ocean"], - variable_long_name=[ - "vertex number", - "Potential temperature", - "time", - "time axis boundaries", - "tcell longitude", - "tcell latitude", - ], - variable_standard_name=["", "sea_surface_temperature", "", "", "", ""], - variable_cell_methods=["", "time: mean", "", "", "", ""], - variable_units=[ - "none", - "K", - "days since 1850-01-01 00:00:00", - "days", - "degrees_E", - "degrees_N", - ], + ( + "ocean_daily.nc-20150630", + "ocean_daily", + None, + "1day", + "2015-01-01, 00:00:00", + "2015-07-01, 00:00:00", + ["sst", "time_bounds"], + ["Potential temperature", "time axis boundaries"], + ["sea_surface_temperature", ""], + ["time: mean", ""], + ["K", "days"], ), ), ( builders.AccessCm2Builder, "access-cm2/by578/history/ocn/ocean_scalar.nc-20150630", - _AccessNCFileInfo( - path=None, # type: ignore - filename="ocean_scalar.nc-20150630", - file_id="ocean_scalar", - filename_timestamp=None, - 
frequency="1mon", - start_date="2015-01-01, 00:00:00", - end_date="2015-07-01, 00:00:00", - variable=[ - "nv", - "scalar_axis", - "temp_global_ave", - "time", - "time_bounds", - ], - variable_long_name=[ - "vertex number", - "none", - "Global mean temp in liquid seawater", - "time", - "time axis boundaries", - ], - variable_standard_name=[ - "", - "", - "sea_water_potential_temperature", - "", - "", - ], - variable_cell_methods=["", "", "time: mean", "", ""], - variable_units=[ - "none", - "none", - "deg_C", - "days since 1850-01-01 00:00:00", - "days", - ], + ( + "ocean_scalar.nc-20150630", + "ocean_scalar", + None, + "1mon", + "2015-01-01, 00:00:00", + "2015-07-01, 00:00:00", + ["temp_global_ave", "time_bounds"], + ["Global mean temp in liquid seawater", "time axis boundaries"], + ["sea_water_potential_temperature", ""], + ["time: mean", ""], + ["deg_C", "days"], ), ), ( builders.AccessEsm15Builder, "access-esm1-5/history/atm/netCDF/HI-C-05-r1.pa-185001_mon.nc", - _AccessNCFileInfo( - path=None, # type: ignore - filename="HI-C-05-r1.pa-185001_mon.nc", - file_id="HI_C_05_r1_pa_XXXXXX_mon", - filename_timestamp="185001", - frequency="1mon", - start_date="1850-01-01, 00:00:00", - end_date="1850-02-01, 00:00:00", - variable=["fld_s03i236"], - variable_long_name=["TEMPERATURE AT 1.5M"], - variable_standard_name=["air_temperature"], - variable_cell_methods=["time: mean"], - variable_units=["K"], + ( + "HI-C-05-r1.pa-185001_mon.nc", + "HI_C_05_r1_pa_XXXXXX_mon", + "185001", + "1mon", + "1850-01-01, 00:00:00", + "1850-02-01, 00:00:00", + ["fld_s03i236"], + ["TEMPERATURE AT 1.5M"], + ["air_temperature"], + ["time: mean"], + ["K"], ), ), ( builders.AccessEsm15Builder, "access-esm1-5/history/ice/iceh.1850-01.nc", - _AccessNCFileInfo( - path=None, # type: ignore - filename="iceh.1850-01.nc", - file_id="iceh_XXXX_XX", - filename_timestamp="1850-01", - frequency="1mon", - start_date="1850-01-01, 00:00:00", - end_date="1850-02-01, 00:00:00", - variable=["TLAT", "TLON", "aice", 
"tarea", "time", "time_bounds"], - variable_long_name=[ + ( + "iceh.1850-01.nc", + "iceh_XXXX_XX", + "1850-01", + "1mon", + "1850-01-01, 00:00:00", + "1850-02-01, 00:00:00", + ["TLAT", "TLON", "aice", "tarea", "time_bounds"], + [ "T grid center latitude", "T grid center longitude", "ice area (aggregate)", "area of T grid cells", - "model time", "boundaries for time-averaging interval", ], - variable_standard_name=["", "", "", "", "", ""], - variable_cell_methods=["", "", "time: mean", "", "", ""], - variable_units=[ + ["", "", "", "", ""], + ["", "", "time: mean", "", ""], + [ "degrees_north", "degrees_east", "1", "m^2", "days since 0001-01-01 00:00:00", - "days since 0001-01-01 00:00:00", ], ), ), ( builders.AccessEsm15Builder, "access-esm1-5/history/ocn/ocean_bgc_ann.nc-18501231", - _AccessNCFileInfo( - path=None, # type: ignore - filename="ocean_bgc_ann.nc-18501231", - file_id="ocean_bgc_ann", - filename_timestamp=None, - frequency="1yr", - start_date="1849-12-30, 00:00:00", - end_date="1850-12-30, 00:00:00", - variable=[ - "fgco2_raw", - "nv", - "time", - "time_bounds", - "xt_ocean", - "yt_ocean", - ], - variable_long_name=[ - "Flux into ocean - DIC, inc. anth.", - "vertex number", - "time", - "time axis boundaries", - "tcell longitude", - "tcell latitude", - ], - variable_standard_name=["", "", "", "", "", ""], - variable_cell_methods=["time: mean", "", "", "", "", ""], - variable_units=[ - "mmol/m^2/s", - "none", - "days since 0001-01-01 00:00:00", - "days", - "degrees_E", - "degrees_N", - ], + ( + "ocean_bgc_ann.nc-18501231", + "ocean_bgc_ann", + None, + "1yr", + "1849-12-30, 00:00:00", + "1850-12-30, 00:00:00", + ["fgco2_raw", "time_bounds"], + ["Flux into ocean - DIC, inc. 
anth.", "time axis boundaries"], + ["", ""], + ["time: mean", ""], + ["mmol/m^2/s", "days"], ), ), ( builders.AccessEsm15Builder, "access-esm1-5/history/ocn/ocean_bgc.nc-18501231", - _AccessNCFileInfo( - path=None, # type: ignore - filename="ocean_bgc.nc-18501231", - file_id="ocean_bgc", - filename_timestamp=None, - frequency="1mon", - start_date="1849-12-30, 00:00:00", - end_date="1850-12-30, 00:00:00", - variable=[ - "nv", - "o2", - "st_ocean", - "time", - "time_bounds", - "xt_ocean", - "yt_ocean", - ], - variable_long_name=[ - "vertex number", - "o2", - "tcell zstar depth", - "time", - "time axis boundaries", - "tcell longitude", - "tcell latitude", - ], - variable_standard_name=["", "", "", "", "", "", ""], - variable_cell_methods=["", "time: mean", "", "", "", "", ""], - variable_units=[ - "none", - "mmol/m^3", - "meters", - "days since 0001-01-01 00:00:00", - "days", - "degrees_E", - "degrees_N", - ], + ( + "ocean_bgc.nc-18501231", + "ocean_bgc", + None, + "1mon", + "1849-12-30, 00:00:00", + "1850-12-30, 00:00:00", + ["o2", "time_bounds"], + ["o2", "time axis boundaries"], + ["", ""], + ["time: mean", ""], + ["mmol/m^3", "days"], ), ), ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.native_1900_01.nc", - _AccessNCFileInfo( - path=None, # type: ignore - filename="GMOM_JRA_WD.mom6.h.native_1900_01.nc", - file_id="GMOM_JRA_WD_mom6_h_native_XXXX_XX", - filename_timestamp="1900_01", - frequency="1mon", - start_date="1900-01-01, 00:00:00", - end_date="1900-02-01, 00:00:00", - variable=[ - "average_DT", - "average_T1", - "average_T2", - "nv", - "thetao", - "time", - "time_bnds", - "xh", - "yh", - "zl", - ], - variable_long_name=[ + ( + "GMOM_JRA_WD.mom6.h.native_1900_01.nc", + "GMOM_JRA_WD_mom6_h_native_XXXX_XX", + "1900_01", + "1mon", + "1900-01-01, 00:00:00", + "1900-02-01, 00:00:00", + ["average_DT", "average_T1", "average_T2", "thetao", "time_bnds"], + [ "Length of average period", "Start time for average period", "End time for average 
period", - "vertex number", "Sea Water Potential Temperature", - "time", "time axis boundaries", - "h point nominal longitude", - "h point nominal latitude", - "Layer pseudo-depth, -z*", ], - variable_standard_name=[ - "", - "", - "", - "", - "sea_water_potential_temperature", - "", - "", - "", - "", - "", - ], - variable_cell_methods=[ - "", - "", - "", - "", - "area:mean zl:mean yh:mean xh:mean time: mean", - "", - "", - "", - "", - "", - ], - variable_units=[ + ["", "", "", "sea_water_potential_temperature", ""], + ["", "", "", "area:mean zl:mean yh:mean xh:mean time: mean", ""], + [ "days", "days since 0001-01-01 00:00:00", "days since 0001-01-01 00:00:00", - "", "degC", "days since 0001-01-01 00:00:00", - "days since 0001-01-01 00:00:00", - "degrees_east", - "degrees_north", - "meter", ], ), ), ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.sfc_1900_01_02.nc", - _AccessNCFileInfo( - path=None, # type: ignore - filename="GMOM_JRA_WD.mom6.h.sfc_1900_01_02.nc", - file_id="GMOM_JRA_WD_mom6_h_sfc_XXXX_XX_XX", - filename_timestamp="1900_01_02", - frequency="1day", - start_date="1900-01-01, 00:00:00", - end_date="1900-01-02, 00:00:00", - variable=[ - "average_DT", - "average_T1", - "average_T2", - "nv", - "time", - "time_bnds", - "tos", - "xh", - "yh", - ], - variable_long_name=[ + ( + "GMOM_JRA_WD.mom6.h.sfc_1900_01_02.nc", + "GMOM_JRA_WD_mom6_h_sfc_XXXX_XX_XX", + "1900_01_02", + "1day", + "1900-01-01, 00:00:00", + "1900-01-02, 00:00:00", + ["average_DT", "average_T1", "average_T2", "time_bnds", "tos"], + [ "Length of average period", "Start time for average period", "End time for average period", - "vertex number", - "time", "time axis boundaries", "Sea Surface Temperature", - "h point nominal longitude", - "h point nominal latitude", - ], - variable_standard_name=[ - "", - "", - "", - "", - "", - "", - "sea_surface_temperature", - "", - "", - ], - variable_cell_methods=[ - "", - "", - "", - "", - "", - "", - "area:mean yh:mean xh:mean time: 
mean", - "", - "", ], - variable_units=[ + ["", "", "", "", "sea_surface_temperature"], + ["", "", "", "", "area:mean yh:mean xh:mean time: mean"], + [ "days", "days since 0001-01-01 00:00:00", "days since 0001-01-01 00:00:00", - "", - "days since 0001-01-01 00:00:00", "days since 0001-01-01 00:00:00", "degC", - "degrees_east", - "degrees_north", ], ), ), ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.static.nc", - _AccessNCFileInfo( - path=None, # type: ignore - filename="GMOM_JRA_WD.mom6.h.static.nc", - file_id="GMOM_JRA_WD_mom6_h_static", - filename_timestamp=None, - frequency="fx", - start_date="none", - end_date="none", - variable=["geolat", "geolon", "xh", "yh"], - variable_long_name=[ - "Latitude of tracer (T) points", - "Longitude of tracer (T) points", - "h point nominal longitude", - "h point nominal latitude", - ], - variable_standard_name=["", "", "", ""], - variable_cell_methods=["time: point", "time: point", "", ""], - variable_units=[ - "degrees_north", - "degrees_east", - "degrees_east", - "degrees_north", - ], + ( + "GMOM_JRA_WD.mom6.h.static.nc", + "GMOM_JRA_WD_mom6_h_static", + None, + "fx", + "none", + "none", + ["geolat", "geolon"], + ["Latitude of tracer (T) points", "Longitude of tracer (T) points"], + ["", ""], + ["time: point", "time: point"], + ["degrees_north", "degrees_east"], ), ), ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.mom6.h.z_1900_01.nc", - _AccessNCFileInfo( - path=None, # type: ignore - filename="GMOM_JRA_WD.mom6.h.z_1900_01.nc", - file_id="GMOM_JRA_WD_mom6_h_z_XXXX_XX", - filename_timestamp="1900_01", - frequency="1mon", - start_date="1900-01-01, 00:00:00", - end_date="1900-02-01, 00:00:00", - variable=[ - "average_DT", - "average_T1", - "average_T2", - "nv", - "thetao", - "time", - "time_bnds", - "xh", - "yh", - "z_l", - ], - variable_long_name=[ + ( + "GMOM_JRA_WD.mom6.h.z_1900_01.nc", + "GMOM_JRA_WD_mom6_h_z_XXXX_XX", + "1900_01", + "1mon", + "1900-01-01, 00:00:00", + "1900-02-01, 
00:00:00", + ["average_DT", "average_T1", "average_T2", "thetao", "time_bnds"], + [ "Length of average period", "Start time for average period", "End time for average period", - "vertex number", "Sea Water Potential Temperature", - "time", "time axis boundaries", - "h point nominal longitude", - "h point nominal latitude", - "Depth at cell center", - ], - variable_standard_name=[ - "", - "", - "", - "", - "sea_water_potential_temperature", - "", - "", - "", - "", - "", - ], - variable_cell_methods=[ - "", - "", - "", - "", - "area:mean z_l:mean yh:mean xh:mean time: mean", - "", - "", - "", - "", - "", ], - variable_units=[ + ["", "", "", "sea_water_potential_temperature", ""], + ["", "", "", "area:mean z_l:mean yh:mean xh:mean time: mean", ""], + [ "days", "days since 0001-01-01 00:00:00", "days since 0001-01-01 00:00:00", - "", "degC", "days since 0001-01-01 00:00:00", - "days since 0001-01-01 00:00:00", - "degrees_east", - "degrees_north", - "meters", ], ), ), ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.cice.h.1900-01-01.nc", - _AccessNCFileInfo( - path=None, # type: ignore - filename="GMOM_JRA_WD.cice.h.1900-01-01.nc", - file_id="GMOM_JRA_WD_cice_h_XXXX_XX_XX", - filename_timestamp="1900-01-01", - frequency="1day", - start_date="1900-01-01, 00:00:00", - end_date="1900-01-02, 00:00:00", - variable=["TLAT", "TLON", "aice", "tarea", "time", "time_bounds"], - variable_long_name=[ + ( + "GMOM_JRA_WD.cice.h.1900-01-01.nc", + "GMOM_JRA_WD_cice_h_XXXX_XX_XX", + "1900-01-01", + "1day", + "1900-01-01, 00:00:00", + "1900-01-02, 00:00:00", + ["TLAT", "TLON", "aice", "tarea", "time_bounds"], + [ "T grid center latitude", "T grid center longitude", "ice area (aggregate)", "area of T grid cells", - "time", "time interval endpoints", ], - variable_standard_name=["", "", "", "", "", ""], - variable_cell_methods=["", "", "time: mean", "", "", ""], - variable_units=[ + ["", "", "", "", ""], + ["", "", "time: mean", "", ""], + [ "degrees_north", "degrees_east", 
"1", "m^2", "days since 0000-01-01 00:00:00", - "days since 0000-01-01 00:00:00", ], ), ), ( builders.AccessOm3Builder, "access-om3/output000/GMOM_JRA_WD.ww3.hi.1900-01-02-00000.nc", - _AccessNCFileInfo( - path=None, # type: ignore - filename="GMOM_JRA_WD.ww3.hi.1900-01-02-00000.nc", - file_id="GMOM_JRA_WD_ww3_hi_XXXX_XX_XX_XXXXX", - filename_timestamp="1900-01-02-00000", - frequency="fx", # WW3 provides no time bounds - start_date="1900-01-02, 00:00:00", - end_date="1900-01-02, 00:00:00", - variable=["EF", "mapsta"], - variable_long_name=["1D spectral density", "map status"], - variable_standard_name=["", ""], - variable_cell_methods=["", ""], - variable_units=["m2 s", "unitless"], + ( + "GMOM_JRA_WD.ww3.hi.1900-01-02-00000.nc", + "GMOM_JRA_WD_ww3_hi_XXXX_XX_XX_XXXXX", + "1900-01-02-00000", + "fx", # WW3 provides no time bounds + "1900-01-02, 00:00:00", + "1900-01-02, 00:00:00", + ["EF", "mapsta"], + ["1D spectral density", "map status"], + ["", ""], + ["", ""], + ["m2 s", "unitless"], ), ), ], @@ -1091,7 +776,4 @@ def test_parse_access_filename(builder, filename, expected): def test_parse_access_ncfile(test_data, builder, filename, expected): file = str(test_data / Path(filename)) - # Set the path to the test data directory - expected.path = file - assert builder.parse_access_ncfile(file) == expected From b303840b75b1b366476555330072d7eab5b601f4 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Mon, 14 Oct 2024 14:40:00 +0800 Subject: [PATCH 21/23] - Updated tests (covered missing lines introduced by refactor found by sentry) - Updated cmip6.yaml as a different translator is required for Cordex experiments as main CMIP6 experiments. 
--- config/cmip6.yaml | 6 +- config/experiments/cmip6_ig45/metadata.yaml | 26 ---- .../cordex-ig45/metadata.yaml | 1 - src/access_nri_intake/catalog/translators.py | 133 +++++++++++------- tests/test_translators.py | 52 ++++++- 5 files changed, 132 insertions(+), 86 deletions(-) delete mode 100644 config/experiments/cmip6_ig45/metadata.yaml diff --git a/config/cmip6.yaml b/config/cmip6.yaml index 5af7bd5e..0ea165de 100644 --- a/config/cmip6.yaml +++ b/config/cmip6.yaml @@ -10,8 +10,4 @@ sources: - metadata_yaml: /g/data/xp65/admin/access-nri-intake-catalog/config/metadata_sources/cmip6-oi10/metadata.yaml path: - - /g/data/oi10/catalog/v2/esm/catalog.json - - - metadata_yaml: /g/data/xp65/admin/intake/metadata/cmip6_ig45/metadata.yaml - path: - - /g/data/ig45/catalog/v2/esm/catalog.json \ No newline at end of file + - /g/data/oi10/catalog/v2/esm/catalog.json \ No newline at end of file diff --git a/config/experiments/cmip6_ig45/metadata.yaml b/config/experiments/cmip6_ig45/metadata.yaml deleted file mode 100644 index 8046f731..00000000 --- a/config/experiments/cmip6_ig45/metadata.yaml +++ /dev/null @@ -1,26 +0,0 @@ -name: cmip6_ig45 -experiment_uuid: c7021d1e-7ba2-11ef-beb5-000007d3fe80 -description: 20km regional projections for CORDEX-CMIP6 from the Queensland Future Climate Science Program -long_description: >- - This dataset includes projections at 20km, formatted to meet the CORDEX-CMIP6 data standards. - The 20km projections were derived from the 10km projections. 
-model: -- CMIP6 -frequency: -- -variable: -- -nominal_resolution: -- -version: -contact: NCI -email: help@nci.org.au -reference: -license: -url: https://geonetwork.nci.org.au/geonetwork/srv/eng/catalog.search#/metadata/f7465_8388_5100_7022 -parent_experiment: -related_experiments: -- -notes: -keywords: -- cmip \ No newline at end of file diff --git a/config/metadata_sources/cordex-ig45/metadata.yaml b/config/metadata_sources/cordex-ig45/metadata.yaml index c118c402..1d013e63 100644 --- a/config/metadata_sources/cordex-ig45/metadata.yaml +++ b/config/metadata_sources/cordex-ig45/metadata.yaml @@ -121,7 +121,6 @@ license: url: https://geonetwork.nci.org.au/geonetwork/srv/eng/catalog.search#/metadata/f7465_8388_5100_7022 parent_experiment: related_experiments: -- notes: keywords: - cmip \ No newline at end of file diff --git a/src/access_nri_intake/catalog/translators.py b/src/access_nri_intake/catalog/translators.py index c048d0a7..fb8fb5ae 100644 --- a/src/access_nri_intake/catalog/translators.py +++ b/src/access_nri_intake/catalog/translators.py @@ -192,6 +192,31 @@ def _unique_values(series): return df[self.columns] # Preserve ordering + def set_dispatch( + self, core_colname: str, func: Callable, input_name: Optional[str] = None + ): + """ + Set a dispatch function for a column. Typically only required when either: + 1. `core_colname != input_name` + 2. A custom translation function (`func`) is required. + + Parameters + ---------- + core_colname: str + The core column name to translate to + input_name: str, optional + The name of the column in the source. 
If not provided, this defaults + to none, and no translation will occur + func: callable + The function to translate the column + """ + if core_colname not in ["model", "realm", "frequency", "variable"]: + raise TranslatorError( + f"'core_colname' must be one of 'model', 'realm', 'frequency', 'variable', not {core_colname}" + ) + self._dispatch[core_colname] = func + setattr(self._dispatch_keys, core_colname, input_name) + def _realm_translator(self) -> pd.Series: """ Return realm, fixing a few issues @@ -240,16 +265,21 @@ def __init__(self, source, columns): """ super().__init__(source, columns) - self._dispatch["model"] = self._model_translator - self._dispatch["realm"] = self._realm_translator - self._dispatch["frequency"] = self._frequency_translator - self._dispatch["variable"] = self._variable_translator - - self._dispatch_keys = _DispatchKeys( - model="source_id", - realm="realm", - frequency="frequency", - variable="variable_id", + self.set_dispatch( + input_name="source_id", core_colname="model", func=super()._model_translator + ) + self.set_dispatch( + input_name="realm", core_colname="realm", func=super()._realm_translator + ) + self.set_dispatch( + input_name="frequency", + core_colname="frequency", + func=super()._frequency_translator, + ) + self.set_dispatch( + input_name="variable_id", + core_colname="variable", + func=super()._variable_translator, ) @@ -271,16 +301,21 @@ def __init__(self, source, columns): """ super().__init__(source, columns) - self._dispatch["model"] = self._model_translator - self._dispatch["realm"] = self._realm_translator - self._dispatch["frequency"] = self._frequency_translator - self._dispatch["variable"] = self._variable_translator - - self._dispatch_keys = _DispatchKeys( - model="model", - realm="realm", - frequency="frequency", - variable="variable", + self.set_dispatch( + input_name="model", core_colname="model", func=super()._model_translator + ) + self.set_dispatch( + input_name="realm", core_colname="realm", 
func=super()._realm_translator + ) + self.set_dispatch( + input_name="frequency", + core_colname="frequency", + func=super()._frequency_translator, + ) + self.set_dispatch( + input_name="variable", + core_colname="variable", + func=super()._variable_translator, ) @@ -302,17 +337,11 @@ def __init__(self, source, columns): """ super().__init__(source, columns) - self._dispatch["variable"] = self._variable_translator - self._dispatch_keys = _DispatchKeys(variable="variable") - - def _realm_translator(self) -> pd.Series: - raise AttributeError( - f"{self.__class__.__name__}: 'realm' does not require translation" - ) - def _frequency_translator(self) -> pd.Series: - raise AttributeError( - f"{self.__class__.__name__}: 'data' does not require translation" + self.set_dispatch( + input_name="variable", + core_colname="variable", + func=super()._variable_translator, ) @@ -334,15 +363,21 @@ def __init__(self, source, columns): """ super().__init__(source, columns) - self._dispatch["model"] = self._model_translator - self._dispatch["realm"] = self._realm_translator - self._dispatch["frequency"] = self._frequency_translator - self._dispatch["variable"] = self._variable_translator - self._dispatch_keys = _DispatchKeys( - model="source_id", - realm="realm", - variable="variable_id", - frequency="freq", + self.set_dispatch( + input_name="source_id", core_colname="model", func=super()._model_translator + ) + self.set_dispatch( + input_name="realm", core_colname="realm", func=self._realm_translator + ) + self.set_dispatch( + input_name="freq", + core_colname="frequency", + func=super()._frequency_translator, + ) + self.set_dispatch( + input_name="variable_id", + core_colname="variable", + func=super()._variable_translator, ) def _realm_translator(self): @@ -370,16 +405,16 @@ def __init__(self, source, columns): """ super().__init__(source, columns) - self._dispatch["model"] = self._model_translator - self._dispatch["frequency"] = self._frequency_translator - 
self._dispatch["variable"] = self._variable_translator - self._dispatch["realm"] = self._variable_translator - - self._dispatch_keys = _DispatchKeys( - model="source_id", - frequency="frequency", - variable="variable_id", - realm="realm", + self.set_dispatch( + input_name="source_id", core_colname="model", func=super()._model_translator + ) + self.set_dispatch( + input_name="variable_id", + core_colname="variable", + func=super()._variable_translator, + ) + self.set_dispatch( + input_name="realm", core_colname="realm", func=self._realm_translator ) def _realm_translator(self): diff --git a/tests/test_translators.py b/tests/test_translators.py index aba1c880..29ecb756 100644 --- a/tests/test_translators.py +++ b/tests/test_translators.py @@ -17,6 +17,7 @@ TranslatorError, _cmip_realm_translator, _to_tuple, + tuplify_series, ) @@ -189,6 +190,29 @@ def test_DefaultTranslator_error(test_data): assert "Could not translate" in str(excinfo.value) +@pytest.mark.parametrize( + "colname, should_raise", + [ + ("model", False), + ("realm", False), + ("frequency", False), + ("variable", False), + ("random_string", True), + ], +) +def test_DefaultTranslator_set_dispatch(test_data, colname, should_raise): + """Test that only valid translation setups are allowed""" + esmds = intake.open_esm_datastore(test_data / "esm_datastore/cmip5-al33.json") + dtrans = DefaultTranslator(esmds, CORE_COLUMNS) + if should_raise: + with pytest.raises(TranslatorError) as excinfo: + dtrans.set_dispatch(colname, dtrans._model_translator, "model") + assert "'core_colname' must be one of" in str(excinfo.value) + else: + dtrans.set_dispatch(colname, dtrans._model_translator, colname) + assert dtrans._dispatch[colname] == dtrans._model_translator + + @pytest.mark.parametrize( "groupby, n_entries", [ @@ -271,11 +295,7 @@ def test_BarpaTranslator(test_data, groupby, n_entries): @pytest.mark.parametrize( "groupby, n_entries", - [ - (None, 5), - (["variable"], 4), - (["frequency"], 2), - ], + [(None, 5), 
(["variable"], 4), (["frequency"], 2), (["realm"], 1)], ) def test_CordexTranslator(test_data, groupby, n_entries): """Test CORDEX datastore translator""" @@ -284,3 +304,25 @@ def test_CordexTranslator(test_data, groupby, n_entries): esmds.description = "description" df = CordexTranslator(esmds, CORE_COLUMNS).translate(groupby) assert len(df) == n_entries + + +@pytest.mark.parametrize( + "input_series, expected_output", + [ + (pd.Series([1, 2, 3]), pd.Series([(1,), (2,), (3,)])), + ], +) +def test_tuplify_series(input_series, expected_output): + """Test the _tuplify_series function""" + + @tuplify_series + def tuplify_func(series): + return series + + class TestSeries: + @tuplify_series + def method(self, series): + return series + + assert all(tuplify_func(input_series) == expected_output) + assert all(TestSeries().method(input_series) == expected_output) From cd0e2412dd61248254b97e062042fe7c6a74e13f Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Wed, 16 Oct 2024 14:50:13 +0800 Subject: [PATCH 22/23] - Renamed experiment in metadata.yaml - Updated path for cordex.yaml file --- config/cordex.yaml | 2 +- config/metadata_sources/cordex-ig45/metadata.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/config/cordex.yaml b/config/cordex.yaml index da5338f4..ca947025 100644 --- a/config/cordex.yaml +++ b/config/cordex.yaml @@ -4,6 +4,6 @@ translator: CordexTranslator sources: - - metadata_yaml: /g/data/xp65/admin/access-nri-intake-catalog/config/metadata_sources/cordex-ig45/metadata.yaml + - metadata_yaml: /g/data/xp65/admin/intake/metadata/cordex_ig45/metadata.yaml path: - /g/data/ig45/catalog/v2/esm/catalog.json diff --git a/config/metadata_sources/cordex-ig45/metadata.yaml b/config/metadata_sources/cordex-ig45/metadata.yaml index 1d013e63..f1fc6b9b 100644 --- a/config/metadata_sources/cordex-ig45/metadata.yaml +++ b/config/metadata_sources/cordex-ig45/metadata.yaml @@ -1,4 +1,4 @@ -name: cmip6_ig45 +name: cordex_ig45 experiment_uuid: 
c7021d1e-7ba2-11ef-beb5-000007d3fe80 description: 20km regional projections for CORDEX-CMIP6 from the Queensland Future Climate Science Program long_description: >- From 35c20fb93e4784e02946b86cfda252e63d1845b4 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Thu, 17 Oct 2024 12:13:01 +0800 Subject: [PATCH 23/23] Add missing newlines to end of yaml & json files --- config/cmip6.yaml | 2 +- config/metadata_sources/cordex-ig45/metadata.yaml | 2 +- tests/data/esm_datastore/cordex-ig45.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/config/cmip6.yaml b/config/cmip6.yaml index 0ea165de..acd39e1e 100644 --- a/config/cmip6.yaml +++ b/config/cmip6.yaml @@ -10,4 +10,4 @@ sources: - metadata_yaml: /g/data/xp65/admin/access-nri-intake-catalog/config/metadata_sources/cmip6-oi10/metadata.yaml path: - - /g/data/oi10/catalog/v2/esm/catalog.json \ No newline at end of file + - /g/data/oi10/catalog/v2/esm/catalog.json diff --git a/config/metadata_sources/cordex-ig45/metadata.yaml b/config/metadata_sources/cordex-ig45/metadata.yaml index f1fc6b9b..88b9a197 100644 --- a/config/metadata_sources/cordex-ig45/metadata.yaml +++ b/config/metadata_sources/cordex-ig45/metadata.yaml @@ -123,4 +123,4 @@ parent_experiment: related_experiments: notes: keywords: -- cmip \ No newline at end of file +- cmip diff --git a/tests/data/esm_datastore/cordex-ig45.json b/tests/data/esm_datastore/cordex-ig45.json index 5fc783b9..fab7b871 100644 --- a/tests/data/esm_datastore/cordex-ig45.json +++ b/tests/data/esm_datastore/cordex-ig45.json @@ -67,4 +67,4 @@ "column_name": "time_range" } ] -} \ No newline at end of file +}