From cfd42189f3eaff28de44737207bb7fcfc3a6e0b2 Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Fri, 19 Jan 2024 15:36:27 -0500 Subject: [PATCH 01/37] add calendar --- xscen/ensembles.py | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/xscen/ensembles.py b/xscen/ensembles.py index 063e966a..aee4ced4 100644 --- a/xscen/ensembles.py +++ b/xscen/ensembles.py @@ -10,6 +10,7 @@ import numpy as np import xarray as xr +import xclim as xc from xclim import ensembles from .config import parse_config @@ -679,6 +680,7 @@ def build_partition_data( subset_kw: dict = None, regrid_kw: dict = None, indicators_kw: dict = None, + calendar_kw: dict = None, rename_dict: dict = None, ): """Get the input for the xclim partition functions. @@ -688,7 +690,7 @@ def build_partition_data( (https://xclim.readthedocs.io/en/stable/api.html#uncertainty-partitioning). If the inputs have different grids, they have to be subsetted and regridded to a common grid/point. - Indicators can also be computed before combining the datasets. + Indicators can also be computed and calendar converted before combining the datasets. Parameters @@ -708,6 +710,14 @@ def build_partition_data( indicators_kw: Arguments to pass to `xs.indicators.compute_indicators()`. All indicators have to be for the same frequency, in order to be put on a single time axis. + calendar_kw : dict, optional + Arguments to pass to `xclim.core.calendar.convert_calendar`. + If None, the smallest common calendar is chosen. + For example, a mixed input of “noleap” and “360_day” will default to “noleap”. + ‘default’ is the standard calendar using np.datetime64 objects (xarray’s “standard” with use_cftime=False). + This is the same behavior as `calendar` in xclim.create_ensemble. + For conversions involving '360_day', the align_on='date' option is used by default. + If False, no conversion is done. rename_dict: Dictionary to rename the dimensions from xscen names to xclim names. The default is {'source': 'model', 'bias_adjust_project': 'downscaling', 'experiment': 'scenario'}. @@ -727,11 +737,17 @@ def build_partition_data( # initialize dict subset_kw = subset_kw or {} regrid_kw = regrid_kw or {} + calendar_kw = calendar_kw or {} list_ds = [] + calendars = [] for ds in datasets: if subset_kw: ds = subset(ds, **subset_kw) + # clean coords that might not match exactly + for c in ["rlat", "rlon", "lat", "lon", "rotated_pole"]: + if c in ds.coords: + ds = ds.drop_vars(c) if regrid_kw: ds = regrid_dataset(ds, **regrid_kw) @@ -745,6 +761,13 @@ def build_partition_data( else: ds = list(dict_ind.values())[0] + # get calendar of each dataset + if calendar_kw is None: + if "time" in ds.coords: + time = xr.decode_cf(ds).time + ds["time"] = time + calendars.append(xc.core.calendar.get_calendar(time)) + for dim in partition_dim: if f"cat:{dim}" in ds.attrs: ds = ds.expand_dims(**{dim: [ds.attrs[f"cat:{dim}"]]}) @@ -752,7 +775,17 @@ def build_partition_data( if "source" in partition_dim: new_source = f"{ds.attrs['cat:institution']}_{ds.attrs['cat:source']}_{ds.attrs['cat:member']}" ds = ds.assign_coords(source=[new_source]) + list_ds.append(ds) + + # convert calendars + if calendar_kw: + common_cal = xc.core.calendar.common_calendar(calendars, join="outer") + calendar_kw.setdefault("target", common_cal) + calendar_kw.setdefault("align_on", "date") + list_ds = [ + xc.core.calendar.convert_calendar(ds, **calendar_kw) for ds in list_ds + ] ens = xr.merge(list_ds) rename_dict = rename_dict or {} From 2dede98d6278b3cd690c3ced9f9bc2766761024f Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Thu, 25 Jan 2024 14:55:31 -0500 Subject: [PATCH 02/37] improve cal --- xscen/ensembles.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/xscen/ensembles.py b/xscen/ensembles.py index aee4ced4..78917ef1 100644 --- a/xscen/ensembles.py +++ b/xscen/ensembles.py @@ -775,11 +775,12 @@ def build_partition_data( if "source" in partition_dim: new_source = f"{ds.attrs['cat:institution']}_{ds.attrs['cat:source']}_{ds.attrs['cat:member']}" ds = ds.assign_coords(source=[new_source]) - + ds = ds.chunk({"time": 5, "lat": 50, "lon": 50}) + print(ds.chunks) list_ds.append(ds) # convert calendars - if calendar_kw: + if isinstance(calendar_kw, dict): common_cal = xc.core.calendar.common_calendar(calendars, join="outer") calendar_kw.setdefault("target", common_cal) calendar_kw.setdefault("align_on", "date") From 48ed051ab47d6840b61d82a9e3a8ffa12e2b8d8d Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Fri, 26 Jan 2024 14:21:22 -0500 Subject: [PATCH 03/37] remove chunks --- xscen/ensembles.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/xscen/ensembles.py b/xscen/ensembles.py index 78917ef1..8569c832 100644 --- a/xscen/ensembles.py +++ b/xscen/ensembles.py @@ -775,8 +775,6 @@ def build_partition_data( if "source" in partition_dim: new_source = f"{ds.attrs['cat:institution']}_{ds.attrs['cat:source']}_{ds.attrs['cat:member']}" ds = ds.assign_coords(source=[new_source]) - ds = ds.chunk({"time": 5, "lat": 50, "lon": 50}) - print(ds.chunks) list_ds.append(ds) # convert calendars From 34739f686c29dd5d558ba9563b1faee43ba1f2c7 Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Tue, 30 Jan 2024 14:41:39 -0500 Subject: [PATCH 04/37] drop_vars --- xscen/ensembles.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/xscen/ensembles.py b/xscen/ensembles.py index 5a861bf9..79c06230 100644 --- a/xscen/ensembles.py +++ b/xscen/ensembles.py @@ -745,10 +745,9 @@ def build_partition_data( for ds in datasets: if subset_kw: ds = subset(ds, **subset_kw) - # clean coords that might not match exactly - for c in ["rlat", "rlon", "lat", "lon", "rotated_pole"]: - if c in ds.coords: - ds = ds.drop_vars(c) + ds = ds.drop_vars( + ["lat", "lon", "rlat", "rlon", "rotated_pole"], errors="ignore" + ) if regrid_kw: ds = regrid_dataset(ds, **regrid_kw) From c2c648e7a5ed305c6b71d29dbe58cba247eb1a22 Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Wed, 21 Feb 2024 16:04:31 -0500 Subject: [PATCH 05/37] add subcat possibility to avoid merge --- xscen/ensembles.py | 130 ++++++++++++++++++++++++++++++--------------- 1 file changed, 87 insertions(+), 43 deletions(-) diff --git a/xscen/ensembles.py b/xscen/ensembles.py index 79c06230..4fbb61e1 100644 --- a/xscen/ensembles.py +++ b/xscen/ensembles.py @@ -683,6 +683,7 @@ def build_partition_data( indicators_kw: dict = None, calendar_kw: dict = None, rename_dict: dict = None, + to_dataset_kw: dict = None, ): """Get the input for the xclim partition functions. @@ -740,52 +741,95 @@ def build_partition_data( regrid_kw = regrid_kw or {} calendar_kw = calendar_kw or {} - list_ds = [] - calendars = [] - for ds in datasets: - if subset_kw: - ds = subset(ds, **subset_kw) - ds = ds.drop_vars( - ["lat", "lon", "rlat", "rlon", "rotated_pole"], errors="ignore" - ) + if isinstance(datasets, list): + list_ds = [] + calendars = [] + merged = False + for ds in datasets: + if subset_kw: + ds = subset(ds, **subset_kw) + ds = ds.drop_vars( + ["lat", "lon", "rlat", "rlon", "rotated_pole"], errors="ignore" + ) - if regrid_kw: - ds = regrid_dataset(ds, **regrid_kw) + if regrid_kw: + ds = regrid_dataset(ds, **regrid_kw) - if indicators_kw: - dict_ind = compute_indicators(ds, **indicators_kw) - if len(dict_ind) > 1: - raise ValueError( - f"The indicators computation should return only indicators of the same frequency.Returned frequencies: {dict_ind.keys()}" - ) + if indicators_kw: + dict_ind = compute_indicators(ds, **indicators_kw) + if len(dict_ind) > 1: + raise ValueError( + f"The indicators computation should return only indicators of the same frequency.Returned frequencies: {dict_ind.keys()}" + ) + else: + ds = list(dict_ind.values())[0] + + # get calendar of each dataset + if calendar_kw is None: + if "time" in ds.coords: + time = xr.decode_cf(ds).time + ds["time"] = time + calendars.append(xc.core.calendar.get_calendar(time)) + + for dim in partition_dim: + if f"cat:{dim}" in ds.attrs: + ds = ds.expand_dims(**{dim: [ds.attrs[f"cat:{dim}"]]}) + + if "source" in partition_dim: + new_source = f"{ds.attrs['cat:institution']}_{ds.attrs['cat:source']}_{ds.attrs['cat:member']}" + ds = ds.assign_coords(source=[new_source]) + list_ds.append(ds) + if not merged: + merged = ds else: - ds = list(dict_ind.values())[0] - - # get calendar of each dataset - if calendar_kw is None: - if "time" in ds.coords: - time = xr.decode_cf(ds).time - ds["time"] = time - calendars.append(xc.core.calendar.get_calendar(time)) - - for dim in partition_dim: - if f"cat:{dim}" in ds.attrs: - ds = ds.expand_dims(**{dim: [ds.attrs[f"cat:{dim}"]]}) - - if "source" in partition_dim: - new_source = f"{ds.attrs['cat:institution']}_{ds.attrs['cat:source']}_{ds.attrs['cat:member']}" - ds = ds.assign_coords(source=[new_source]) - list_ds.append(ds) - - # convert calendars - if isinstance(calendar_kw, dict): - common_cal = xc.core.calendar.common_calendar(calendars, join="outer") - calendar_kw.setdefault("target", common_cal) - calendar_kw.setdefault("align_on", "date") - list_ds = [ - xc.core.calendar.convert_calendar(ds, **calendar_kw) for ds in list_ds - ] - ens = xr.merge(list_ds) + merged = xr.merge([merged, ds]) + ens = merged + + # elif isinstance(datasets, xscen.DataCatalog): + # # special case to handle source (create one dimension with institution_source_member) + # ensemble_on_list = None + # if "source" in partition_dim: + # partition_dim.remove("source") + # ensemble_on_list = ["institution", "source", "member"] + # + # subcat = datasets + # + # # create a dataset for each bias_adjust_project, modify grid and concat them + # dim_with_different_grid = ( + # "bias_adjust_project" + # if "bias_adjust_project" in partition_dim + # else "source" + # ) + # list_ds = [] + # for d in subcat.df[dim_with_different_grid].unique(): + # ds = subcat.search(**{dim_with_different_grid: d}).to_dataset( + # concat_on=partition_dim, + # create_ensemble_on=ensemble_on_list, + # **to_dataset_kw, + # ) + # if "realization" in ds: + # ds = ds.rename({"realization": "source"}) + # if subset_kw: + # ds = subset(ds, **subset_kw) + # if regrid_kw: + # ds = regrid_dataset(ds, **regrid_kw) + # list_ds.append(ds) + # ens = xr.concat(list_ds, dim=dim_with_different_grid) + + else: + raise ValueError( + "datasets should be a list or a dictionary of xarray datasets or a xscen.DataCatalog" + ) + + # # convert calendars + # if isinstance(calendar_kw, dict): + # common_cal = xc.core.calendar.common_calendar(calendars, join="outer") + # calendar_kw.setdefault("target", common_cal) + # calendar_kw.setdefault("align_on", "date") + # list_ds = [ + # xc.core.calendar.convert_calendar(ds, **calendar_kw) for ds in list_ds + # ] + # ens = xr.merge(list_ds) rename_dict = rename_dict or {} rename_dict.setdefault("source", "model") From 4519048f831a33e4fde5ce336aa7a24d186f3cd4 Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Wed, 21 Feb 2024 17:09:10 -0500 Subject: [PATCH 06/37] real --- xscen/ensembles.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xscen/ensembles.py b/xscen/ensembles.py index 4fbb61e1..d5735369 100644 --- a/xscen/ensembles.py +++ b/xscen/ensembles.py @@ -777,7 +777,7 @@ def build_partition_data( if "source" in partition_dim: new_source = f"{ds.attrs['cat:institution']}_{ds.attrs['cat:source']}_{ds.attrs['cat:member']}" - ds = ds.assign_coords(source=[new_source]) + ds = ds.assign_coords(realization=[new_source]) list_ds.append(ds) if not merged: merged = ds @@ -832,7 +832,7 @@ def build_partition_data( # ens = xr.merge(list_ds) rename_dict = rename_dict or {} - rename_dict.setdefault("source", "model") + rename_dict.setdefault("realization", "model") rename_dict.setdefault("experiment", "scenario") rename_dict.setdefault("bias_adjust_project", "downscaling") rename_dict = {k: v for k, v in rename_dict.items() if k in ens.dims} From 13fa1978cafe5dbb69810253d80601c11b12478a Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Tue, 27 Feb 2024 15:35:07 -0500 Subject: [PATCH 07/37] add datacatalog option --- xscen/ensembles.py | 114 ++++++++++++++++++++++----------------------- 1 file changed, 57 insertions(+), 57 deletions(-) diff --git a/xscen/ensembles.py b/xscen/ensembles.py index d5735369..ae4df7cd 100644 --- a/xscen/ensembles.py +++ b/xscen/ensembles.py @@ -11,11 +11,10 @@ import numpy as np import xarray as xr -import xclim as xc from xclim import ensembles +from .catalog import DataCatalog from .config import parse_config -from .indicators import compute_indicators from .regrid import regrid_dataset from .spatial import subset from .utils import clean_up, get_cat_attrs @@ -698,9 +697,14 @@ def build_partition_data( Parameters ---------- datasets : dict - List or dictionnary of Dataset objects that will be included in the ensemble. + List, dictionnary or DataCatalog of Datasets that will be included in the ensemble. The datasets should include the necessary ("cat:") attributes to understand their metadata. - Tip: With a project catalog, you can do: `datasets = pcat.search(**search_dict).to_dataset_dict()`. + Tip: A dictionnary can be created with `datasets = pcat.search(**search_dict).to_dataset_dict()`. + + The use of a DataCatalog is recommended for large ensembles. + In that case, the ensembles will be loaded separately for each `bias_adjust_project`, + the subsetting or regridding can be applied before combining the datasets through concatenation. + If `bias_adjust_project` is not in `partition_dim`, `source` will be used instead. partition_dim: list[str] Components of the partition. They will become the dimension of the output. The default is ['source', 'experiment', 'bias_adjust_project']. @@ -743,8 +747,7 @@ def build_partition_data( if isinstance(datasets, list): list_ds = [] - calendars = [] - merged = False + # calendars = [] for ds in datasets: if subset_kw: ds = subset(ds, **subset_kw) @@ -755,21 +758,21 @@ def build_partition_data( if regrid_kw: ds = regrid_dataset(ds, **regrid_kw) - if indicators_kw: - dict_ind = compute_indicators(ds, **indicators_kw) - if len(dict_ind) > 1: - raise ValueError( - f"The indicators computation should return only indicators of the same frequency.Returned frequencies: {dict_ind.keys()}" - ) - else: - ds = list(dict_ind.values())[0] - - # get calendar of each dataset - if calendar_kw is None: - if "time" in ds.coords: - time = xr.decode_cf(ds).time - ds["time"] = time - calendars.append(xc.core.calendar.get_calendar(time)) + # if indicators_kw: + # dict_ind = compute_indicators(ds, **indicators_kw) + # if len(dict_ind) > 1: + # raise ValueError( + # f"The indicators computation should return only indicators of the same frequency.Returned frequencies: {dict_ind.keys()}" + # ) + # else: + # ds = list(dict_ind.values())[0] + + # # get calendar of each dataset + # if calendar_kw is None: + # if "time" in ds.coords: + # time = xr.decode_cf(ds).time + # ds["time"] = time + # calendars.append(xc.core.calendar.get_calendar(time)) for dim in partition_dim: if f"cat:{dim}" in ds.attrs: @@ -779,42 +782,39 @@ def build_partition_data( new_source = f"{ds.attrs['cat:institution']}_{ds.attrs['cat:source']}_{ds.attrs['cat:member']}" ds = ds.assign_coords(realization=[new_source]) list_ds.append(ds) - if not merged: - merged = ds - else: - merged = xr.merge([merged, ds]) - ens = merged - - # elif isinstance(datasets, xscen.DataCatalog): - # # special case to handle source (create one dimension with institution_source_member) - # ensemble_on_list = None - # if "source" in partition_dim: - # partition_dim.remove("source") - # ensemble_on_list = ["institution", "source", "member"] - # - # subcat = datasets - # - # # create a dataset for each bias_adjust_project, modify grid and concat them - # dim_with_different_grid = ( - # "bias_adjust_project" - # if "bias_adjust_project" in partition_dim - # else "source" - # ) - # list_ds = [] - # for d in subcat.df[dim_with_different_grid].unique(): - # ds = subcat.search(**{dim_with_different_grid: d}).to_dataset( - # concat_on=partition_dim, - # create_ensemble_on=ensemble_on_list, - # **to_dataset_kw, - # ) - # if "realization" in ds: - # ds = ds.rename({"realization": "source"}) - # if subset_kw: - # ds = subset(ds, **subset_kw) - # if regrid_kw: - # ds = regrid_dataset(ds, **regrid_kw) - # list_ds.append(ds) - # ens = xr.concat(list_ds, dim=dim_with_different_grid) + ens = xr.merge(list_ds) + + elif isinstance(datasets, DataCatalog): + # special case to handle source (create one dimension with institution_source_member) + ensemble_on_list = None + if "source" in partition_dim: + partition_dim.remove("source") + ensemble_on_list = ["institution", "source", "member"] + + subcat = datasets + + # create a dataset for each bias_adjust_project, modify grid and concat them + # if no bias_adjust_project, use source + dim_with_different_grid = ( + "bias_adjust_project" + if "bias_adjust_project" in partition_dim + else "source" + ) + list_ds = [] + for d in subcat.df[dim_with_different_grid].unique(): + ds = subcat.search(**{dim_with_different_grid: d}).to_dataset( + concat_on=partition_dim, + create_ensemble_on=ensemble_on_list, + **to_dataset_kw, + ) + if "realization" in ds: + ds = ds.rename({"realization": "source"}) + if subset_kw: + ds = subset(ds, **subset_kw) + if regrid_kw: + ds = regrid_dataset(ds, **regrid_kw) + list_ds.append(ds) + ens = xr.concat(list_ds, dim=dim_with_different_grid) else: raise ValueError( From d4ffe92469a08b3e24b3e976e63adaeee087bbd1 Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Wed, 6 Mar 2024 10:20:30 -0500 Subject: [PATCH 08/37] add real to part_dim --- xscen/ensembles.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/xscen/ensembles.py b/xscen/ensembles.py index ae4df7cd..9cbf4c14 100644 --- a/xscen/ensembles.py +++ b/xscen/ensembles.py @@ -676,7 +676,7 @@ def generate_weights( # noqa: C901 def build_partition_data( datasets: Union[dict, list[xr.Dataset]], - partition_dim: list[str] = ["source", "experiment", "bias_adjust_project"], + partition_dim: list[str] = ["realization", "experiment", "bias_adjust_project"], subset_kw: dict = None, regrid_kw: dict = None, indicators_kw: dict = None, @@ -738,11 +738,13 @@ def build_partition_data( xclim.ensembles """ + # TODO: add warning if both realization and source in partition_dim if isinstance(datasets, dict): datasets = list(datasets.values()) # initialize dict subset_kw = subset_kw or {} regrid_kw = regrid_kw or {} + to_dataset_kw = to_dataset_kw or {} calendar_kw = calendar_kw or {} if isinstance(datasets, list): @@ -778,9 +780,9 @@ def build_partition_data( if f"cat:{dim}" in ds.attrs: ds = ds.expand_dims(**{dim: [ds.attrs[f"cat:{dim}"]]}) - if "source" in partition_dim: + if "realization" in partition_dim: new_source = f"{ds.attrs['cat:institution']}_{ds.attrs['cat:source']}_{ds.attrs['cat:member']}" - ds = ds.assign_coords(realization=[new_source]) + ds = ds.expand_dims(realization=[new_source]) list_ds.append(ds) ens = xr.merge(list_ds) @@ -807,12 +809,11 @@ def build_partition_data( create_ensemble_on=ensemble_on_list, **to_dataset_kw, ) - if "realization" in ds: - ds = ds.rename({"realization": "source"}) if subset_kw: ds = subset(ds, **subset_kw) if regrid_kw: ds = regrid_dataset(ds, **regrid_kw) + list_ds.append(ds) ens = xr.concat(list_ds, dim=dim_with_different_grid) @@ -833,6 +834,7 @@ def build_partition_data( rename_dict = rename_dict or {} rename_dict.setdefault("realization", "model") + rename_dict.setdefault("source", "model") rename_dict.setdefault("experiment", "scenario") rename_dict.setdefault("bias_adjust_project", "downscaling") rename_dict = {k: v for k, v in rename_dict.items() if k in ens.dims} From d0a097d760f6a6ef5b3da7edbaf21f5b4a3bf025 Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Thu, 9 May 2024 16:51:57 -0400 Subject: [PATCH 09/37] remove moving_rearly_window --- xscen/catalog.py | 5 +++-- xscen/ensembles.py | 2 ++ xscen/spatial.py | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/xscen/catalog.py b/xscen/catalog.py index d10d3fb0..e1c3b488 100644 --- a/xscen/catalog.py +++ b/xscen/catalog.py @@ -383,13 +383,14 @@ def check_variables(row): if len_df > 0: self.esmcat._df["variable"] = self.df.apply(check_variables, axis=1) - def exists_in_cat(self, **columns) -> bool: + def exists_in_cat(self, verbose=True, **columns) -> bool: """ Check if there is an entry in the catalogue corresponding to the arguments given. Parameters ---------- columns: Arguments that will be given to `catalog.search` + verbose: Log the result of the search. Returns ------- @@ -397,7 +398,7 @@ def exists_in_cat(self, **columns) -> bool: True if there is an entry in the catalogue corresponding to the arguments given. """ exists = bool(len(self.search(**columns))) - if exists: + if exists and verbose: logger.info(f"An entry exists for: {columns}") return exists diff --git a/xscen/ensembles.py b/xscen/ensembles.py index 9cbf4c14..15063bcc 100644 --- a/xscen/ensembles.py +++ b/xscen/ensembles.py @@ -787,6 +787,8 @@ def build_partition_data( ens = xr.merge(list_ds) elif isinstance(datasets, DataCatalog): + # TODO: add possibility of method and ref + # special case to handle source (create one dimension with institution_source_member) ensemble_on_list = None if "source" in partition_dim: diff --git a/xscen/spatial.py b/xscen/spatial.py index 48ffe9b3..ceaf8f76 100644 --- a/xscen/spatial.py +++ b/xscen/spatial.py @@ -184,6 +184,7 @@ def subset( # noqa: C901 else: tile_buffer = tile_buffer or region.get("tile_buffer", 0) kwargs = deepcopy(region[region["method"]]) + name = region.get("name", None) if uses_dask(ds.lon) or uses_dask(ds.lat): warnings.warn("Loading longitude and latitude for more efficient subsetting.") From 337875761cacd5bb40e7297839835c8c53800af6 Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Mon, 13 May 2024 09:07:22 -0400 Subject: [PATCH 10/37] fix A-DEC --- xscen/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xscen/utils.py b/xscen/utils.py index 4e0adad3..35d97e2f 100644 --- a/xscen/utils.py +++ b/xscen/utils.py @@ -243,7 +243,7 @@ def _parse_date(date, fmts): quasiday = (pd.Timedelta(1, "d") - pd.Timedelta(1, "s")).as_unit(date.unit) if end_of_period == "Y" or "m" not in fmt: date = ( - pd.tseries.frequencies.to_offset("A-DEC").rollforward(date) + quasiday + pd.tseries.frequencies.to_offset("YE-DEC").rollforward(date) + quasiday ) elif end_of_period == "M" or "d" not in fmt: date = pd.tseries.frequencies.to_offset("M").rollforward(date) + quasiday From 95f50f9c74100599e6ee8abbbf84df286073c456 Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Thu, 23 May 2024 11:21:39 -0400 Subject: [PATCH 11/37] subdivise --- xscen/ensembles.py | 170 +++++++++++++++++++++++++++------------------ 1 file changed, 103 insertions(+), 67 deletions(-) diff --git a/xscen/ensembles.py b/xscen/ensembles.py index a9490d95..1c03f75a 100644 --- a/xscen/ensembles.py +++ b/xscen/ensembles.py @@ -675,6 +675,106 @@ def generate_weights( # noqa: C901 return weights +def _partition_from_list(datasets, partition_dim, subset_kw, regrid_kw): + list_ds = [] + # calendars = [] + for ds in datasets: + if subset_kw: + ds = subset(ds, **subset_kw) + ds = ds.drop_vars( + ["lat", "lon", "rlat", "rlon", "rotated_pole"], errors="ignore" + ) + + if regrid_kw: + ds = regrid_dataset(ds, **regrid_kw) + + # if indicators_kw: + # dict_ind = compute_indicators(ds, **indicators_kw) + # if len(dict_ind) > 1: + # raise ValueError( + # f"The indicators computation should return only indicators of the same frequency.Returned frequencies: {dict_ind.keys()}" + # ) + # else: + # ds = list(dict_ind.values())[0] + + # # get calendar of each dataset + # if calendar_kw is None: + # if "time" in ds.coords: + # time = xr.decode_cf(ds).time + # ds["time"] = time + # calendars.append(xc.core.calendar.get_calendar(time)) + + for dim in partition_dim: + if f"cat:{dim}" in ds.attrs: + ds = ds.expand_dims(**{dim: [ds.attrs[f"cat:{dim}"]]}) + + if "bias_adjust_project" in ds.dims: + ds = ds.assign_coords( + method=("bias_adjust_project", ds.attrs.get("cat:method", np.nan)) + ) + ds = ds.assign_coords( + reference=( + "bias_adjust_project", + ds.attrs.get("cat:reference", np.nan), + ) + ) + + if "realization" in partition_dim: + new_source = f"{ds.attrs['cat:institution']}_{ds.attrs['cat:source']}_{ds.attrs['cat:member']}" + ds = ds.expand_dims(realization=[new_source]) + list_ds.append(ds) + ens = xr.merge(list_ds, combine_attrs="drop_conflicts") + return ens + + +def _partition_from_catalog( + datasets, partition_dim, subset_kw, regrid_kw, to_dataset_kw +): + # TODO: add possibility of method and ref + + # special case to handle source (create one dimension with institution_source_member) + ensemble_on_list = None + if "source" in partition_dim: + partition_dim.remove("source") + ensemble_on_list = ["institution", "source", "member"] + + subcat = datasets + + # create a dataset for each bias_adjust_project, modify grid and concat them + # if no bias_adjust_project, use source + dim_with_different_grid = ( + "bias_adjust_project" if "bias_adjust_project" in partition_dim else "source" + ) + list_ds = [] + for d in subcat.df[dim_with_different_grid].unique(): + ds = subcat.search(**{dim_with_different_grid: d}).to_dataset( + concat_on=partition_dim, + create_ensemble_on=ensemble_on_list, + **to_dataset_kw, + ) + if subset_kw: + ds = subset(ds, **subset_kw) + if regrid_kw: + ds = regrid_dataset(ds, **regrid_kw) + + if "bias_adjust_project" in ds.dims: + ds = ds.assign_coords( + method=("bias_adjust_project", ds.attrs.get("cat:method", np.nan)) + ) + ds = ds.assign_coords( + reference=( + "bias_adjust_project", + ds.attrs.get("cat:reference", np.nan), + ) + ) + + list_ds.append(ds) + ens = xr.concat( + list_ds, dim=dim_with_different_grid, combine_attrs="drop_conflicts" + ) + return ens + + def build_partition_data( datasets: Union[dict, list[xr.Dataset]], partition_dim: list[str] = ["realization", "experiment", "bias_adjust_project"], @@ -749,76 +849,12 @@ def build_partition_data( calendar_kw = calendar_kw or {} if isinstance(datasets, list): - list_ds = [] - # calendars = [] - for ds in datasets: - if subset_kw: - ds = subset(ds, **subset_kw) - ds = ds.drop_vars( - ["lat", "lon", "rlat", "rlon", "rotated_pole"], errors="ignore" - ) - - if regrid_kw: - ds = regrid_dataset(ds, **regrid_kw) - - # if indicators_kw: - # dict_ind = compute_indicators(ds, **indicators_kw) - # if len(dict_ind) > 1: - # raise ValueError( - # f"The indicators computation should return only indicators of the same frequency.Returned frequencies: {dict_ind.keys()}" - # ) - # else: - # ds = list(dict_ind.values())[0] - - # # get calendar of each dataset - # if calendar_kw is None: - # if "time" in ds.coords: - # time = xr.decode_cf(ds).time - # ds["time"] = time - # calendars.append(xc.core.calendar.get_calendar(time)) - - for dim in partition_dim: - if f"cat:{dim}" in ds.attrs: - ds = ds.expand_dims(**{dim: [ds.attrs[f"cat:{dim}"]]}) - - if "realization" in partition_dim: - new_source = f"{ds.attrs['cat:institution']}_{ds.attrs['cat:source']}_{ds.attrs['cat:member']}" - ds = ds.expand_dims(realization=[new_source]) - list_ds.append(ds) - ens = xr.merge(list_ds) + ens = _partition_from_list(datasets, partition_dim, subset_kw, regrid_kw) elif isinstance(datasets, DataCatalog): - # TODO: add possibility of method and ref - - # special case to handle source (create one dimension with institution_source_member) - ensemble_on_list = None - if "source" in partition_dim: - partition_dim.remove("source") - ensemble_on_list = ["institution", "source", "member"] - - subcat = datasets - - # create a dataset for each bias_adjust_project, modify grid and concat them - # if no bias_adjust_project, use source - dim_with_different_grid = ( - "bias_adjust_project" - if "bias_adjust_project" in partition_dim - else "source" + ens = _partition_from_catalog( + datasets, partition_dim, subset_kw, regrid_kw, to_dataset_kw ) - list_ds = [] - for d in subcat.df[dim_with_different_grid].unique(): - ds = subcat.search(**{dim_with_different_grid: d}).to_dataset( - concat_on=partition_dim, - create_ensemble_on=ensemble_on_list, - **to_dataset_kw, - ) - if subset_kw: - ds = subset(ds, **subset_kw) - if regrid_kw: - ds = regrid_dataset(ds, **regrid_kw) - - list_ds.append(ds) - ens = xr.concat(list_ds, dim=dim_with_different_grid) else: raise ValueError( From 650803e4c4c878c0d45ddf1850e3a62db9ad6a14 Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Thu, 23 May 2024 12:03:34 -0400 Subject: [PATCH 12/37] common_attrs --- xscen/ensembles.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/xscen/ensembles.py b/xscen/ensembles.py index 1c03f75a..93cda8c1 100644 --- a/xscen/ensembles.py +++ b/xscen/ensembles.py @@ -678,6 +678,8 @@ def generate_weights( # noqa: C901 def _partition_from_list(datasets, partition_dim, subset_kw, regrid_kw): list_ds = [] # calendars = [] + # only keep attrs common to all datasets + common_attrs = False for ds in datasets: if subset_kw: ds = subset(ds, **subset_kw) @@ -710,20 +712,25 @@ def _partition_from_list(datasets, partition_dim, subset_kw, regrid_kw): if "bias_adjust_project" in ds.dims: ds = ds.assign_coords( - method=("bias_adjust_project", ds.attrs.get("cat:method", np.nan)) + method=("bias_adjust_project", [ds.attrs.get("cat:method", np.nan)]) ) ds = ds.assign_coords( reference=( "bias_adjust_project", - ds.attrs.get("cat:reference", np.nan), + [ds.attrs.get("cat:reference", np.nan)], ) ) if "realization" in partition_dim: new_source = f"{ds.attrs['cat:institution']}_{ds.attrs['cat:source']}_{ds.attrs['cat:member']}" ds = ds.expand_dims(realization=[new_source]) + + a = ds.attrs + a.pop("intake_esm_vars", None) # remove list for intersection to work + common_attrs = dict(common_attrs.items() & a.items()) if common_attrs else a list_ds.append(ds) - ens = xr.merge(list_ds, combine_attrs="drop_conflicts") + ens = xr.merge(list_ds) + ens.attrs = common_attrs return ens @@ -746,6 +753,7 @@ def _partition_from_catalog( "bias_adjust_project" if "bias_adjust_project" in partition_dim else "source" ) list_ds = [] + common_attrs = False for d in subcat.df[dim_with_different_grid].unique(): ds = subcat.search(**{dim_with_different_grid: d}).to_dataset( concat_on=partition_dim, @@ -759,19 +767,20 @@ def _partition_from_catalog( if "bias_adjust_project" in ds.dims: ds = ds.assign_coords( - method=("bias_adjust_project", ds.attrs.get("cat:method", np.nan)) + method=("bias_adjust_project", [ds.attrs.get("cat:method", np.nan)]) ) ds = ds.assign_coords( reference=( "bias_adjust_project", - ds.attrs.get("cat:reference", np.nan), + [ds.attrs.get("cat:reference", np.nan)], ) ) - + a = ds.attrs + a.pop("intake_esm_vars", None) # remove list for intersection to work + common_attrs = dict(common_attrs.items() & a.items()) if common_attrs else a list_ds.append(ds) - ens = xr.concat( - list_ds, dim=dim_with_different_grid, combine_attrs="drop_conflicts" - ) + ens = xr.concat(list_ds, dim=dim_with_different_grid) + ens.attrs = common_attrs return ens From b8d66d0045ce46da946b3744e4ade1184af7ccad Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Thu, 23 May 2024 14:42:13 -0400 Subject: [PATCH 13/37] add ref and method to cat --- xscen/ensembles.py | 43 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/xscen/ensembles.py b/xscen/ensembles.py index 93cda8c1..e89e1094 100644 --- a/xscen/ensembles.py +++ b/xscen/ensembles.py @@ -14,6 +14,7 @@ from xclim import ensembles from .catalog import DataCatalog +from .catutils import generate_id from .config import parse_config from .regrid import regrid_dataset from .spatial import subset @@ -737,7 +738,6 @@ def _partition_from_list(datasets, partition_dim, subset_kw, regrid_kw): def _partition_from_catalog( datasets, partition_dim, subset_kw, regrid_kw, to_dataset_kw ): - # TODO: add possibility of method and ref # special case to handle source (create one dimension with institution_source_member) ensemble_on_list = None @@ -748,18 +748,46 @@ def _partition_from_catalog( subcat = datasets # create a dataset for each bias_adjust_project, modify grid and concat them - # if no bias_adjust_project, use source - dim_with_different_grid = ( - "bias_adjust_project" if "bias_adjust_project" in partition_dim else "source" - ) + # choose with dim that exists in partition_dim and is the first in the order of preference + order_of_preference = ["reference", "bias_adjust_project", "source"] + dim_with_different_grid = list(set(partition_dim) & set(order_of_preference))[0] + # dim_with_different_grid = ( + # "bias_adjust_project" if "bias_adjust_project" in partition_dim else "source" + # ) + + # trick for method + if "method" in partition_dim: + # replace id with bias_adjust_project with method and ref. + datasets.df["id"] = generate_id( + datasets.df, + [ + "method", + "reference", + "mip_era", + "activity", + "driving_model", + "institution", + "source", + "experiment", + "member", + "domain", + ], + ) + + # get attrs that are common to all datasets + common_attrs = {} + for col, series in subcat.df.items(): + if (series[0] == series).all(): + common_attrs[f"cat:{col}"] = series[0] + list_ds = [] - common_attrs = False for d in subcat.df[dim_with_different_grid].unique(): ds = subcat.search(**{dim_with_different_grid: d}).to_dataset( concat_on=partition_dim, create_ensemble_on=ensemble_on_list, **to_dataset_kw, ) + if subset_kw: ds = subset(ds, **subset_kw) if regrid_kw: @@ -775,9 +803,6 @@ def _partition_from_catalog( [ds.attrs.get("cat:reference", np.nan)], ) ) - a = ds.attrs - a.pop("intake_esm_vars", None) # remove list for intersection to work - common_attrs = dict(common_attrs.items() & a.items()) if common_attrs else a list_ds.append(ds) ens = xr.concat(list_ds, dim=dim_with_different_grid) ens.attrs = common_attrs From 2a4198491da964024b8521080bef4abb8b1fd0ad Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Thu, 23 May 2024 16:18:22 -0400 Subject: [PATCH 14/37] index --- xscen/ensembles.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/xscen/ensembles.py b/xscen/ensembles.py index e89e1094..1d0d9d6e 100644 --- a/xscen/ensembles.py +++ b/xscen/ensembles.py @@ -747,22 +747,20 @@ def _partition_from_catalog( subcat = datasets - # create a dataset for each bias_adjust_project, modify grid and concat them - # choose with dim that exists in partition_dim and is the first in the order of preference - order_of_preference = ["reference", "bias_adjust_project", "source"] - dim_with_different_grid = list(set(partition_dim) & set(order_of_preference))[0] - # dim_with_different_grid = ( - # "bias_adjust_project" if "bias_adjust_project" in partition_dim else "source" - # ) + # get attrs that are common to all datasets + common_attrs = {} + for col, series in subcat.df.items(): + if (series[0] == series).all(): + common_attrs[f"cat:{col}"] = series[0] - # trick for method + # trick when using method/ref, instead of bias_adjust_project if "method" in partition_dim: # replace id with bias_adjust_project with method and ref. datasets.df["id"] = generate_id( datasets.df, [ - "method", - "reference", + "method", # instead of bias_adjust_project + "reference", # instead of bias_adjust_project "mip_era", "activity", "driving_model", @@ -774,11 +772,10 @@ def _partition_from_catalog( ], ) - # get attrs that are common to all datasets - common_attrs = {} - for col, series in subcat.df.items(): - if (series[0] == series).all(): - common_attrs[f"cat:{col}"] = series[0] + # create a dataset for each bias_adjust_project, modify grid and concat them + # choose with dim that exists in partition_dim and is the first in the order of preference + order_of_preference = ["reference", "bias_adjust_project", "source"] + dim_with_different_grid = list(set(partition_dim) & set(order_of_preference))[0] list_ds = [] for d in subcat.df[dim_with_different_grid].unique(): @@ -793,6 +790,7 @@ def _partition_from_catalog( if regrid_kw: ds = regrid_dataset(ds, **regrid_kw) + # add coords method and reference if "bias_adjust_project" in ds.dims: ds = ds.assign_coords( method=("bias_adjust_project", [ds.attrs.get("cat:method", np.nan)]) @@ -913,6 +911,9 @@ def build_partition_data( rename_dict = {k: v for k, v in rename_dict.items() if k in ens.dims} ens = ens.rename(rename_dict) + ens.attrs["cat:processing_level"] = "partition_ensemble" + ens.attrs["cat:id"] = generate_id(ens)[0] + return ens From f667026bd3f241d447b3b348415d7017ad7004d2 Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Fri, 24 May 2024 10:25:47 -0400 Subject: [PATCH 15/37] real in from cat --- xscen/ensembles.py | 54 ++++++++++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/xscen/ensembles.py b/xscen/ensembles.py index 1d0d9d6e..a41e53a5 100644 --- a/xscen/ensembles.py +++ b/xscen/ensembles.py @@ -738,11 +738,22 @@ def _partition_from_list(datasets, partition_dim, subset_kw, regrid_kw): def _partition_from_catalog( datasets, partition_dim, subset_kw, regrid_kw, to_dataset_kw ): + if ("method" in partition_dim or "reference" in partition_dim) and ( + "bias_adjust_project" in partition_dim + ): + raise ValueError( + "The partition_dim can have either method and reference or bias_adjust_project, not both." + ) + + if ("realization" in partition_dim) and ("source" in partition_dim): + raise ValueError( + "The partition_dim can have either realization or source, not both." + ) # special case to handle source (create one dimension with institution_source_member) ensemble_on_list = None - if "source" in partition_dim: - partition_dim.remove("source") + if "realization" in partition_dim: + partition_dim.remove("realization") ensemble_on_list = ["institution", "source", "member"] subcat = datasets @@ -753,24 +764,25 @@ def _partition_from_catalog( if (series[0] == series).all(): common_attrs[f"cat:{col}"] = series[0] - # trick when using method/ref, instead of bias_adjust_project - if "method" in partition_dim: - # replace id with bias_adjust_project with method and ref. - datasets.df["id"] = generate_id( - datasets.df, - [ - "method", # instead of bias_adjust_project - "reference", # instead of bias_adjust_project - "mip_era", - "activity", - "driving_model", - "institution", - "source", - "experiment", - "member", - "domain", - ], - ) + col_id = [ + ( + "method" if "method" in partition_dim else None + ), # instead of bias_adjust_project + ( + "reference" if "reference" in partition_dim else None + ), # instead of bias_adjust_project + "bias_adjust_project" if "bias_adjust_project" in partition_dim else None, + "mip_era", + "activity", + "driving_model", + "institution" if "realization" in partition_dim else None, + "source", + "experiment", + "member" if "realization" in partition_dim else None, + "domain", + ] + + datasets.df["id"] = generate_id(datasets.df, col_id) # create a dataset for each bias_adjust_project, modify grid and concat them # choose with dim that exists in partition_dim and is the first in the order of preference @@ -911,7 +923,7 @@ def build_partition_data( rename_dict = {k: v for k, v in rename_dict.items() if k in ens.dims} ens = ens.rename(rename_dict) - ens.attrs["cat:processing_level"] = "partition_ensemble" + ens.attrs["cat:processing_level"] = "partition-ensemble" ens.attrs["cat:id"] = generate_id(ens)[0] return ens From 8414c10a90692f628281d810c5a3bf1efd34309f Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Fri, 7 Jun 2024 15:08:25 -0400 Subject: [PATCH 16/37] to level --- xscen/ensembles.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/xscen/ensembles.py b/xscen/ensembles.py index a41e53a5..7e601c13 100644 --- a/xscen/ensembles.py +++ b/xscen/ensembles.py @@ -828,6 +828,7 @@ def build_partition_data( calendar_kw: dict = None, rename_dict: dict = None, to_dataset_kw: dict = None, + to_level: dict = "partition-ensemble", ): """Get the input for the xclim partition functions. @@ -872,6 +873,8 @@ def build_partition_data( rename_dict: Dictionary to rename the dimensions from xscen names to xclim names. The default is {'source': 'model', 'bias_adjust_project': 'downscaling', 'experiment': 'scenario'}. + to_level: str + The processing level of the output dataset. Default is 'partition-ensemble'. Returns ------- @@ -923,7 +926,7 @@ def build_partition_data( rename_dict = {k: v for k, v in rename_dict.items() if k in ens.dims} ens = ens.rename(rename_dict) - ens.attrs["cat:processing_level"] = "partition-ensemble" + ens.attrs["cat:processing_level"] = to_level ens.attrs["cat:id"] = generate_id(ens)[0] return ens From e27ebb73138c456b6cd10f95ab0715e9ac33ae1c Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Fri, 7 Jun 2024 16:07:52 -0400 Subject: [PATCH 17/37] to level type --- xscen/ensembles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xscen/ensembles.py b/xscen/ensembles.py index 7e601c13..2dfdbdc1 100644 --- a/xscen/ensembles.py +++ b/xscen/ensembles.py @@ -828,7 +828,7 @@ def build_partition_data( calendar_kw: dict = None, rename_dict: dict = None, to_dataset_kw: dict = None, - to_level: dict = "partition-ensemble", + to_level: str = "partition-ensemble", ): """Get the input for the xclim partition functions. From d9d05353f61d0d58651f33fd7384ff8e1a33aefa Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Mon, 10 Jun 2024 10:34:07 -0400 Subject: [PATCH 18/37] adjustment instead --- xscen/ensembles.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/xscen/ensembles.py b/xscen/ensembles.py index 2dfdbdc1..f5045607 100644 --- a/xscen/ensembles.py +++ b/xscen/ensembles.py @@ -713,7 +713,10 @@ def _partition_from_list(datasets, partition_dim, subset_kw, regrid_kw): if "bias_adjust_project" in ds.dims: ds = ds.assign_coords( - method=("bias_adjust_project", [ds.attrs.get("cat:method", np.nan)]) + adjustment=( + "bias_adjust_project", + [ds.attrs.get("cat:adjustment", np.nan)], + ) ) ds = ds.assign_coords( reference=( @@ -738,11 +741,12 @@ def _partition_from_list(datasets, partition_dim, subset_kw, regrid_kw): def _partition_from_catalog( datasets, partition_dim, subset_kw, regrid_kw, to_dataset_kw ): - if ("method" in partition_dim or "reference" in partition_dim) and ( + + if ("adjustment" in partition_dim or "reference" in partition_dim) and ( "bias_adjust_project" in partition_dim ): raise ValueError( - "The partition_dim can have either method and reference or bias_adjust_project, not both." + "The partition_dim can have either adjustment and reference or bias_adjust_project, not both." ) if ("realization" in partition_dim) and ("source" in partition_dim): @@ -766,8 +770,8 @@ def _partition_from_catalog( col_id = [ ( - "method" if "method" in partition_dim else None - ), # instead of bias_adjust_project + "adjustment" if "adjustment" in partition_dim else None + ), # instead of bias_adjust_project, need to use adjustment, not method bc .sel ( "reference" if "reference" in partition_dim else None ), # instead of bias_adjust_project @@ -802,11 +806,14 @@ def _partition_from_catalog( if regrid_kw: ds = regrid_dataset(ds, **regrid_kw) - # add coords method and reference + # add coords adjustment and reference if "bias_adjust_project" in ds.dims: ds = ds.assign_coords( - method=("bias_adjust_project", [ds.attrs.get("cat:method", np.nan)]) - ) + adjustment=( + "bias_adjust_project", + [ds.attrs.get("cat:adjustment", np.nan)], + ) + ) # need to use adjustment, not method bc .sel ds = ds.assign_coords( reference=( "bias_adjust_project", From 79dd1b0f340f0d239a24ff9379c19719a2aa8c26 Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Mon, 10 Jun 2024 17:03:39 -0400 Subject: [PATCH 19/37] subcat --- xscen/ensembles.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xscen/ensembles.py b/xscen/ensembles.py index f5045607..fa9fe392 100644 --- a/xscen/ensembles.py +++ b/xscen/ensembles.py @@ -786,10 +786,10 @@ def _partition_from_catalog( "domain", ] - datasets.df["id"] = generate_id(datasets.df, col_id) + subcat.df["id"] = generate_id(subcat.df, col_id) # create a dataset for each bias_adjust_project, modify grid and concat them - # choose with dim that exists in partition_dim and is the first in the order of preference + # choose dim that exists in partition_dim and first in the order of preference order_of_preference = ["reference", "bias_adjust_project", "source"] dim_with_different_grid = list(set(partition_dim) & set(order_of_preference))[0] From 0f53e79227a5a4eee1094fd8543c122a68e19d34 Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Mon, 6 Jan 2025 16:36:26 -0500 Subject: [PATCH 20/37] cleanup --- src/xscen/ensembles.py | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/src/xscen/ensembles.py b/src/xscen/ensembles.py index b1897c9b..2bf7e7ff 100644 --- a/src/xscen/ensembles.py +++ b/src/xscen/ensembles.py @@ -671,22 +671,6 @@ def _partition_from_list(datasets, partition_dim, subset_kw, regrid_kw): if regrid_kw: ds = regrid_dataset(ds, **regrid_kw) - # if indicators_kw: - # dict_ind = compute_indicators(ds, **indicators_kw) - # if len(dict_ind) > 1: - # raise ValueError( - # f"The indicators computation should return only indicators of the same frequency.Returned frequencies: {dict_ind.keys()}" - # ) - # else: - # ds = list(dict_ind.values())[0] - - # # get calendar of each dataset - # if calendar_kw is None: - # if "time" in ds.coords: - # time = xr.decode_cf(ds).time - # ds["time"] = time - # calendars.append(xc.core.calendar.get_calendar(time)) - for dim in partition_dim: if f"cat:{dim}" in ds.attrs: ds = ds.expand_dims(**{dim: [ds.attrs[f"cat:{dim}"]]}) @@ -811,8 +795,6 @@ def build_partition_data( partition_dim: list[str] = ["realization", "experiment", "bias_adjust_project"], subset_kw: dict | None = None, regrid_kw: dict | None = None, - indicators_kw: dict | None = None, - calendar_kw: dict | None = None, rename_dict: dict | None = None, to_dataset_kw: dict | None = None, to_level: str = "partition-ensemble", @@ -847,17 +829,6 @@ def build_partition_data( Arguments to pass to `xs.spatial.subset()`. regrid_kw : dict, optional Arguments to pass to `xs.regrid_dataset()`. - indicators_kw : dict, optional - Arguments to pass to `xs.indicators.compute_indicators()`. - All indicators have to be for the same frequency, in order to be put on a single time axis. - calendar_kw : dict, optional - Arguments to pass to `xclim.core.calendar.convert_calendar`. - If None, the smallest common calendar is chosen. - For example, a mixed input of “noleap” and “360_day” will default to “noleap”. - ‘default’ is the standard calendar using np.datetime64 objects (xarray’s “standard” with use_cftime=False). - This is the same behavior as `calendar` in xclim.create_ensemble. - For conversions involving '360_day', the align_on='date' option is used by default. - If False, no conversion is done. rename_dict : dict, optional Dictionary to rename the dimensions from xscen names to xclim names. The default is {'source': 'model', 'bias_adjust_project': 'downscaling', 'experiment': 'scenario'}. @@ -880,7 +851,6 @@ def build_partition_data( subset_kw = subset_kw or {} regrid_kw = regrid_kw or {} to_dataset_kw = to_dataset_kw or {} - calendar_kw = calendar_kw or {} if isinstance(datasets, list): ens = _partition_from_list(datasets, partition_dim, subset_kw, regrid_kw) From 7894515b6b9f3fd7b921d76114d7740cd91f5160 Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Tue, 7 Jan 2025 11:44:10 -0500 Subject: [PATCH 21/37] add tests --- CHANGELOG.rst | 3 ++- src/xscen/ensembles.py | 16 +++++++----- tests/test_ensembles.py | 55 ++++++++++++++++++++++++++++++----------- 3 files changed, 52 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 26043b0c..7e6946c8 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,11 +4,12 @@ Changelog v0.11.0 (unreleased) -------------------- -Contributors to this version: Gabriel Rondeau-Genesse (:user:`RondeauG`). +Contributors to this version: Gabriel Rondeau-Genesse (:user:`RondeauG`), Juliette Lavoie (:user:`juliettelavoie`). New features and enhancements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ * ``xs.io.make_toc`` now includes the global attributes of the dataset after the information about the variables. (:pull:`473`). +* Improve ``xs.ensemles.build_partition_data``. (:pull:``). Bug fixes ^^^^^^^^^ diff --git a/src/xscen/ensembles.py b/src/xscen/ensembles.py index 2bf7e7ff..36f7e92f 100644 --- a/src/xscen/ensembles.py +++ b/src/xscen/ensembles.py @@ -767,6 +767,9 @@ def _partition_from_catalog( if subset_kw: ds = subset(ds, **subset_kw) + ds = ds.drop_vars( + ["lat", "lon", "rlat", "rlon", "rotated_pole"], errors="ignore" + ) if regrid_kw: ds = regrid_dataset(ds, **regrid_kw) @@ -806,14 +809,12 @@ def build_partition_data( `partition_dim` dimensions (and time) to pass to one of the xclim partition functions (https://xclim.readthedocs.io/en/stable/api.html#uncertainty-partitioning). If the inputs have different grids, - they have to be subsetted and regridded to a common grid/point. - Indicators can also be computed and calendar converted before combining the datasets. - + they have to be subsetted and/or regridded to a common grid/point. Parameters ---------- - datasets : dict - List, dictionnary or DataCatalog of Datasets that will be included in the ensemble. + datasets : list, dict, DataCatalog + List or dictionnary of Datasets or DataCatalog that will be included in the ensemble. The datasets should include the necessary ("cat:") attributes to understand their metadata. Tip: A dictionnary can be created with `datasets = pcat.search(**search_dict).to_dataset_dict()`. @@ -829,9 +830,13 @@ def build_partition_data( Arguments to pass to `xs.spatial.subset()`. regrid_kw : dict, optional Arguments to pass to `xs.regrid_dataset()`. + Note thet regriding is computationnaly expensive. For large datasets, + it might be worth it to do do regridding first, outside of this function. rename_dict : dict, optional Dictionary to rename the dimensions from xscen names to xclim names. The default is {'source': 'model', 'bias_adjust_project': 'downscaling', 'experiment': 'scenario'}. + to_dataset_kw : dict, optional + Arguments to pass to `xscen.DataCatalog.to_dataset()` if datasets is a DataCatalog. to_level: str The processing level of the output dataset. Default is 'partition-ensemble'. @@ -844,7 +849,6 @@ def build_partition_data( -------- xclim.ensembles """ - # TODO: add warning if both realization and source in partition_dim if isinstance(datasets, dict): datasets = list(datasets.values()) # initialize dict diff --git a/tests/test_ensembles.py b/tests/test_ensembles.py index 6aa596e2..b7c33541 100644 --- a/tests/test_ensembles.py +++ b/tests/test_ensembles.py @@ -1071,21 +1071,18 @@ class TestEnsemblePartition: @pytest.mark.skipif(xe is None, reason="xesmf needed for testing regrdding") def test_build_partition_data(self, samplecat, tmp_path): # test subset - datasets = samplecat.search(variable="tas").to_dataset_dict( + datasets = samplecat.search(variable="tas", member="r1i1p1f1").to_dataset_dict( xarray_open_kwargs={"engine": "h5netcdf"} ) ds = xs.ensembles.build_partition_data( datasets=datasets, partition_dim=["source", "experiment"], subset_kw=dict(name="mtl", method="gridpoint", lat=[45.0], lon=[-74]), - indicators_kw=dict(indicators=[xc.atmos.tg_mean]), rename_dict={"source": "new-name"}, ) - assert ds.dims == {"time": 2, "scenario": 4, "new-name": 2} - assert ds.lat.values == 45.0 - assert ds.lon.values == -74 - assert [i for i in ds.data_vars] == ["tg_mean"] + assert ds.dims == {"time": 730, "scenario": 4, "new-name": 1} + assert ds.attrs["cat:processing_level"] == "partition-ensemble" # test regrid ds_grid = xe.util.cf_grid_2d(-75, -74, 0.25, 45, 48, 0.55) @@ -1095,6 +1092,7 @@ def test_build_partition_data(self, samplecat, tmp_path): ds = xs.ensembles.build_partition_data( datasets=datasets, regrid_kw=dict(ds_grid=ds_grid, weights_location=tmp_path), + to_level="test", ) assert ds.dims == { @@ -1105,16 +1103,43 @@ def test_build_partition_data(self, samplecat, tmp_path): "lon": 4, } assert [i for i in ds.data_vars] == ["tas"] + assert ds.attrs["cat:processing_level"] == "test" - # test error - with pytest.raises( - ValueError, - ): - ds = xs.ensembles.build_partition_data( - datasets=datasets, - subset_kw=dict(name="mtl", method="gridpoint", lat=[45.0], lon=[-74]), - indicators_kw=dict(indicators=[xc.atmos.tg_mean, xc.indicators.cf.tg]), - ) + def test_partition_from_catalog(self, samplecat): + datasets = samplecat.search(variable="tas", member="r1i1p1f1") + ds_from_dict = xs.ensembles.build_partition_data( + datasets=datasets.to_dataset_dict( + xarray_open_kwargs={"engine": "h5netcdf"} + ), + partition_dim=["source", "experiment"], + subset_kw=dict(name="mtl", method="gridpoint", lat=[45.0], lon=[-74]), + ) + + ds_from_cat = xs.ensembles.build_partition_data( + datasets=datasets, + partition_dim=["source", "experiment"], + subset_kw=dict(name="mtl", method="gridpoint", lat=[45.0], lon=[-74]), + to_dataset_kw=dict(xarray_open_kwargs={"engine": "h5netcdf"}), + ) + # fix order + ds_from_cat = ds_from_cat[["time", "model", "scenario", "tas"]] + ds_from_cat["tas"] = ds_from_cat["tas"].transpose("scenario", "model", "time") + + assert ds_from_dict.equals(ds_from_cat) + + def test_realization_partition(self, samplecat): + + datasets = samplecat.search(variable="tas").to_dataset_dict( + xarray_open_kwargs={"engine": "h5netcdf"} + ) + ds = xs.ensembles.build_partition_data( + datasets=datasets, + partition_dim=["realization", "experiment"], + subset_kw=dict(name="mtl", method="gridpoint", lat=[45.0], lon=[-74]), + ) + + assert "NCC_NorESM2-MM_r1i1p1f1" in ds.model.values + assert ds.dims == {"time": 730, "scenario": 4, "model": 2} class TestReduceEnsemble: From 61eb239595f745d2d67542746535578d22268abb Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Tue, 7 Jan 2025 14:09:18 -0500 Subject: [PATCH 22/37] pr num --- CHANGELOG.rst | 2 +- src/xscen/ensembles.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 7e6946c8..ad56c54f 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -9,7 +9,7 @@ Contributors to this version: Gabriel Rondeau-Genesse (:user:`RondeauG`), Juliet New features and enhancements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ * ``xs.io.make_toc`` now includes the global attributes of the dataset after the information about the variables. (:pull:`473`). -* Improve ``xs.ensemles.build_partition_data``. (:pull:``). +* Improve ``xs.ensemles.build_partition_data``. (:pull:`504`). Bug fixes ^^^^^^^^^ diff --git a/src/xscen/ensembles.py b/src/xscen/ensembles.py index 36f7e92f..a5d1f057 100644 --- a/src/xscen/ensembles.py +++ b/src/xscen/ensembles.py @@ -658,7 +658,6 @@ def generate_weights( # noqa: C901 def _partition_from_list(datasets, partition_dim, subset_kw, regrid_kw): list_ds = [] - # calendars = [] # only keep attrs common to all datasets common_attrs = False for ds in datasets: From 9ecc8c90dad1f942b7757f707917983c50d91167 Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Tue, 7 Jan 2025 14:44:49 -0500 Subject: [PATCH 23/37] pin xarray --- environment-dev.yml | 2 +- environment.yml | 2 +- pyproject.toml | 2 +- src/xscen/io.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/environment-dev.yml b/environment-dev.yml index 56345653..06324bd2 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -30,7 +30,7 @@ dependencies: - shapely >=2.0 - sparse - toolz - - xarray >=2023.11.0, !=2024.6.0 + - xarray >=2023.11.0, !=2024.6.0, <2025.1.0 #FIXME: 2025.1.0 breaks rechunker with zarr - xclim >=0.53.2, <0.54 - xesmf >=0.7, <0.8.8 # FIXME: 0.8.8 currently creates segfaults on ReadTheDocs. - zarr >=2.13 diff --git a/environment.yml b/environment.yml index 126cb271..9b51f166 100644 --- a/environment.yml +++ b/environment.yml @@ -30,7 +30,7 @@ dependencies: - shapely >=2.0 - sparse - toolz - - xarray >=2023.11.0, !=2024.6.0 + - xarray >=2023.11.0, !=2024.6.0, <2025.1.0 #FIXME: 2025.1.0 breaks rechunker with zarr - xclim >=0.53.2, <0.54 - xesmf >=0.7, <0.8.8 # FIXME: 0.8.8 currently creates segfaults on ReadTheDocs. - zarr >=2.13 diff --git a/pyproject.toml b/pyproject.toml index a46c4883..d134504a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,7 +65,7 @@ dependencies = [ "shapely >=2.0", "sparse", "toolz", - "xarray >=2023.11.0, !=2024.6.0", + "xarray >=2023.11.0, !=2024.6.0, <2025.1.0", # FIXME: 2025.1.0 breaks rechunker with zarr "xclim >=0.53.2, <0.54", "zarr >=2.13" ] diff --git a/src/xscen/io.py b/src/xscen/io.py index ed7df22c..484e00a4 100644 --- a/src/xscen/io.py +++ b/src/xscen/io.py @@ -1053,7 +1053,7 @@ def rechunk( raise ValueError( "No chunks given. Need to give at `chunks_over_var` or `chunks_over_dim`." ) - + print(ds, chunks, worker_mem, str(path_out), str(temp_store)) plan = _rechunk(ds, chunks, worker_mem, str(path_out), temp_store=str(temp_store)) plan.execute() From 1f5baa584e7be515e43243df819476f966ac981a Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Tue, 7 Jan 2025 14:54:32 -0500 Subject: [PATCH 24/37] pin xarray --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d134504a..24df481a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,7 +65,7 @@ dependencies = [ "shapely >=2.0", "sparse", "toolz", - "xarray >=2023.11.0, !=2024.6.0, <2025.1.0", # FIXME: 2025.1.0 breaks rechunker with zarr + "xarray >=2023.11.0, !=2024.6.0, <2024.11.0", # FIXME: 2025.1.0 breaks rechunker with zarr "xclim >=0.53.2, <0.54", "zarr >=2.13" ] From 10cd31b2a0f804792579ed131b846d785e65fd8b Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Tue, 7 Jan 2025 15:03:45 -0500 Subject: [PATCH 25/37] pin xarray --- environment-dev.yml | 2 +- environment.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/environment-dev.yml b/environment-dev.yml index 06324bd2..afef0910 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -30,7 +30,7 @@ dependencies: - shapely >=2.0 - sparse - toolz - - xarray >=2023.11.0, !=2024.6.0, <2025.1.0 #FIXME: 2025.1.0 breaks rechunker with zarr + - xarray >=2023.11.0, !=2024.6.0, <2024.11.0 #FIXME: 2025.1.0 breaks rechunker with zarr - xclim >=0.53.2, <0.54 - xesmf >=0.7, <0.8.8 # FIXME: 0.8.8 currently creates segfaults on ReadTheDocs. - zarr >=2.13 diff --git a/environment.yml b/environment.yml index 9b51f166..5167bbe1 100644 --- a/environment.yml +++ b/environment.yml @@ -30,7 +30,7 @@ dependencies: - shapely >=2.0 - sparse - toolz - - xarray >=2023.11.0, !=2024.6.0, <2025.1.0 #FIXME: 2025.1.0 breaks rechunker with zarr + - xarray >=2023.11.0, !=2024.6.0, <2024.11.0 #FIXME: 2025.1.0 breaks rechunker with zarr - xclim >=0.53.2, <0.54 - xesmf >=0.7, <0.8.8 # FIXME: 0.8.8 currently creates segfaults on ReadTheDocs. - zarr >=2.13 From 7fe8abd385788acfdb05fc4b0d7b6a6b058169ab Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Tue, 7 Jan 2025 15:46:33 -0500 Subject: [PATCH 26/37] fix doc --- docs/notebooks/4_ensembles.ipynb | 16 ++++++++++++---- environment-dev.yml | 2 +- environment.yml | 2 +- pyproject.toml | 2 +- 4 files changed, 15 insertions(+), 7 deletions(-) diff --git a/docs/notebooks/4_ensembles.ipynb b/docs/notebooks/4_ensembles.ipynb index fa76d1ac..0fbf8855 100644 --- a/docs/notebooks/4_ensembles.ipynb +++ b/docs/notebooks/4_ensembles.ipynb @@ -169,19 +169,28 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "is_executing": true + }, "outputs": [], "source": [ "# Get catalog\n", "from pathlib import Path\n", "\n", + "import xclim as xc\n", + "\n", "output_folder = Path().absolute() / \"_data\"\n", "cat = xs.DataCatalog(str(output_folder / \"tutorial-catalog.json\"))\n", "\n", "# create a dictionnary of datasets wanted for the partition\n", "input_dict = cat.search(variable=\"tas\", member=\"r1i1p1f1\").to_dataset_dict(\n", " xarray_open_kwargs={\"engine\": \"h5netcdf\"}\n", - ")" + ")\n", + "datasets = {}\n", + "for k, v in input_dict.items():\n", + " ds = xc.atmos.tg_mean(v.tas).to_dataset()\n", + " ds.attrs = v.attrs\n", + " datasets[k] = ds" ] }, { @@ -204,9 +213,8 @@ "import xclim as xc\n", "\n", "ds = xs.ensembles.build_partition_data(\n", - " input_dict,\n", + " datasets,\n", " subset_kw=dict(name=\"mtl\", method=\"gridpoint\", lat=[45.5], lon=[-73.6]),\n", - " indicators_kw={\"indicators\": [xc.atmos.tg_mean]},\n", ")\n", "ds" ] diff --git a/environment-dev.yml b/environment-dev.yml index afef0910..4af3a0de 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -30,7 +30,7 @@ dependencies: - shapely >=2.0 - sparse - toolz - - xarray >=2023.11.0, !=2024.6.0, <2024.11.0 #FIXME: 2025.1.0 breaks rechunker with zarr + - xarray >=2023.11.0, !=2024.6.0, <2024.10.0 #FIXME: 2024.10.0 breaks rechunker with zarr - xclim >=0.53.2, <0.54 - xesmf >=0.7, <0.8.8 # FIXME: 0.8.8 currently creates segfaults on ReadTheDocs. - zarr >=2.13 diff --git a/environment.yml b/environment.yml index 5167bbe1..f162d8a2 100644 --- a/environment.yml +++ b/environment.yml @@ -30,7 +30,7 @@ dependencies: - shapely >=2.0 - sparse - toolz - - xarray >=2023.11.0, !=2024.6.0, <2024.11.0 #FIXME: 2025.1.0 breaks rechunker with zarr + - xarray >=2023.11.0, !=2024.6.0, <2024.10.0 #FIXME: 2024.10.0 breaks rechunker with zarr - xclim >=0.53.2, <0.54 - xesmf >=0.7, <0.8.8 # FIXME: 0.8.8 currently creates segfaults on ReadTheDocs. - zarr >=2.13 diff --git a/pyproject.toml b/pyproject.toml index 24df481a..57cc309d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,7 +65,7 @@ dependencies = [ "shapely >=2.0", "sparse", "toolz", - "xarray >=2023.11.0, !=2024.6.0, <2024.11.0", # FIXME: 2025.1.0 breaks rechunker with zarr + "xarray >=2023.11.0, !=2024.6.0, <2024.10.0", # FIXME: 2024.10.0 breaks rechunker with zarr "xclim >=0.53.2, <0.54", "zarr >=2.13" ] From c4aaf0895c0387aa7b0af7db1abe938ae95d4ae8 Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Tue, 7 Jan 2025 17:24:53 -0500 Subject: [PATCH 27/37] changelog --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index a3626eb0..483d37d6 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -8,7 +8,7 @@ Contributors to this version: Gabriel Rondeau-Genesse (:user:`RondeauG`), Juliet New features and enhancements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -* Improve ``xs.ensemles.build_partition_data``. (:pull:``). +* Improve ``xs.ensemles.build_partition_data``. (:pull:`504`). Breaking changes ^^^^^^^^^^^^^^^^ From 98aa4ba5d65b5812b63ea4479d01b47f7286101e Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Tue, 7 Jan 2025 17:48:43 -0500 Subject: [PATCH 28/37] update xclim v --- environment.yml | 2 +- src/xscen/data/fr/LC_MESSAGES/xscen.mo | Bin 1015 -> 1015 bytes 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index f162d8a2..c2605af5 100644 --- a/environment.yml +++ b/environment.yml @@ -31,7 +31,7 @@ dependencies: - sparse - toolz - xarray >=2023.11.0, !=2024.6.0, <2024.10.0 #FIXME: 2024.10.0 breaks rechunker with zarr - - xclim >=0.53.2, <0.54 + - xclim >=0.54, <0.55 - xesmf >=0.7, <0.8.8 # FIXME: 0.8.8 currently creates segfaults on ReadTheDocs. - zarr >=2.13 # To install from source diff --git a/src/xscen/data/fr/LC_MESSAGES/xscen.mo b/src/xscen/data/fr/LC_MESSAGES/xscen.mo index 51b5812af3b98854aae679b7238a00236247820d..a8cfcf2504034a8fc8df704fe70f6671a7322999 100644 GIT binary patch delta 32 ocmey){+)fpM@C*#T?0d119JsKb1Orm&HotX85zwc%Q4>s0Iy;Rz5oCK delta 32 ocmey){+)fpM@C)~T|+}%Ln8$PODjW*&HotX85vC`%Q4>s0Iy{UzW@LL From 0edc0aabd6073788c24c8db06a9afc32a950a09a Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Tue, 7 Jan 2025 17:52:40 -0500 Subject: [PATCH 29/37] update xclim v --- environment-dev.yml | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/environment-dev.yml b/environment-dev.yml index 4af3a0de..a3f2c89e 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -31,7 +31,7 @@ dependencies: - sparse - toolz - xarray >=2023.11.0, !=2024.6.0, <2024.10.0 #FIXME: 2024.10.0 breaks rechunker with zarr - - xclim >=0.53.2, <0.54 + - xclim >=0.54, <0.55 - xesmf >=0.7, <0.8.8 # FIXME: 0.8.8 currently creates segfaults on ReadTheDocs. - zarr >=2.13 # Opt diff --git a/pyproject.toml b/pyproject.toml index 57cc309d..d921dbf4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,7 +66,7 @@ dependencies = [ "sparse", "toolz", "xarray >=2023.11.0, !=2024.6.0, <2024.10.0", # FIXME: 2024.10.0 breaks rechunker with zarr - "xclim >=0.53.2, <0.54", + "xclim >=0.54, <0.55", "zarr >=2.13" ] From fa884c004724417119cd36727dab507b11e0546c Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Wed, 8 Jan 2025 10:28:06 -0500 Subject: [PATCH 30/37] fix docs --- environment-dev.yml | 3 ++- pyproject.toml | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/environment-dev.yml b/environment-dev.yml index a3f2c89e..8e234de8 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -30,7 +30,7 @@ dependencies: - shapely >=2.0 - sparse - toolz - - xarray >=2023.11.0, !=2024.6.0, <2024.10.0 #FIXME: 2024.10.0 breaks rechunker with zarr + - xarray >=2023.11.0, !=2024.6.0, <2024.10.0 #FIXME: 2024.10.0 breaks rechunker with zarr, https://github.com/pangeo-data/rechunker/issues/154 - xclim >=0.54, <0.55 - xesmf >=0.7, <0.8.8 # FIXME: 0.8.8 currently creates segfaults on ReadTheDocs. - zarr >=2.13 @@ -56,6 +56,7 @@ dependencies: - pandoc - pooch - pre-commit >=3.5.0 + - pygments <2.19 #FIXME: temporary fix, https://github.com/felix-hilden/sphinx-codeautolink/issues/153 - pytest >=8.3.2 - pytest-cov >=5.0.0 - pytest-xdist >=3.2.0 diff --git a/pyproject.toml b/pyproject.toml index d921dbf4..1c9cb828 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -109,7 +109,8 @@ docs = [ "sphinx-intl", "sphinx-mdinclude", "sphinx-rtd-theme >=1.0", - "sphinxcontrib-napoleon" + "sphinxcontrib-napoleon", + "pygments <2.19" # FIXME: temporary fix, https://github.com/felix-hilden/sphinx-codeautolink/issues/153 ] extra = [ "xesmf>=0.7, <0.8.8" # FIXME: 0.8.8 currently creates segfaults on ReadTheDocs. From 32350ff94986d122a30d2fbfc7e5cf5df0747aad Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Wed, 8 Jan 2025 10:49:38 -0500 Subject: [PATCH 31/37] fix test --- tests/test_biasadjust.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/test_biasadjust.py b/tests/test_biasadjust.py index b80ccdb2..d133c715 100644 --- a/tests/test_biasadjust.py +++ b/tests/test_biasadjust.py @@ -47,11 +47,10 @@ def test_basic_train(self, var, period): def test_preprocess(self): - dref360 = self.dref.convert_calendar("360_day", align_on="year") - + dhist360 = self.dhist.convert_calendar("360_day", align_on="year") out = xs.train( - dref360, - self.dhist, + self.dref, + dhist360, var="tas", period=["2001", "2002"], adapt_freq={"thresh": "2 K"}, From 4ebc4c3a2e4fff19295634c6bb3a721d0e39a957 Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Wed, 8 Jan 2025 15:27:50 -0500 Subject: [PATCH 32/37] remove test --- tests/test_biasadjust.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_biasadjust.py b/tests/test_biasadjust.py index d133c715..5af316ef 100644 --- a/tests/test_biasadjust.py +++ b/tests/test_biasadjust.py @@ -53,14 +53,14 @@ def test_preprocess(self): dhist360, var="tas", period=["2001", "2002"], - adapt_freq={"thresh": "2 K"}, + # adapt_freq={"thresh": "2 K"}, #FIXME: put back the test when xclim 0.55 is released, https://github.com/Ouranosinc/xclim/pull/2038/files jitter_over={"upper_bnd": "3 K", "thresh": "2 K"}, jitter_under={"thresh": "2 K"}, ) assert out.attrs["train_params"] == { "maximal_calendar": "noleap", - "adapt_freq": {"thresh": "2 K"}, + # "adapt_freq": {"thresh": "2 K"}, #FIXME: put back the test when xclim 0.55 is released, https://github.com/Ouranosinc/xclim/pull/2038/files "jitter_over": {"upper_bnd": "3 K", "thresh": "2 K"}, "jitter_under": {"thresh": "2 K"}, "var": ["tas"], From afd6437c8aed13dc62e8b2fc5897421c929c0315 Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Wed, 8 Jan 2025 15:45:01 -0500 Subject: [PATCH 33/37] try again --- tests/test_biasadjust.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/test_biasadjust.py b/tests/test_biasadjust.py index 5af316ef..8b86f003 100644 --- a/tests/test_biasadjust.py +++ b/tests/test_biasadjust.py @@ -46,21 +46,23 @@ def test_basic_train(self, var, period): np.testing.assert_array_equal(out["scaling"], result) def test_preprocess(self): + # FIXME: put back the test when xclim 0.55 is released, https://github.com/Ouranosinc/xclim/pull/2038/files + # dhist360 = self.dhist.convert_calendar("360_day", align_on="year") + dhist360 = self.dhist.convert_calendar("noleap", align_on="year") - dhist360 = self.dhist.convert_calendar("360_day", align_on="year") out = xs.train( self.dref, dhist360, var="tas", period=["2001", "2002"], - # adapt_freq={"thresh": "2 K"}, #FIXME: put back the test when xclim 0.55 is released, https://github.com/Ouranosinc/xclim/pull/2038/files + adapt_freq={"thresh": "2 K"}, jitter_over={"upper_bnd": "3 K", "thresh": "2 K"}, jitter_under={"thresh": "2 K"}, ) assert out.attrs["train_params"] == { "maximal_calendar": "noleap", - # "adapt_freq": {"thresh": "2 K"}, #FIXME: put back the test when xclim 0.55 is released, https://github.com/Ouranosinc/xclim/pull/2038/files + "adapt_freq": {"thresh": "2 K"}, "jitter_over": {"upper_bnd": "3 K", "thresh": "2 K"}, "jitter_under": {"thresh": "2 K"}, "var": ["tas"], From e3c8bf55d4b0fa7497f632ccc11ceb5d5f42bb45 Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Wed, 15 Jan 2025 13:41:56 -0500 Subject: [PATCH 34/37] Apply suggestions from code review Co-authored-by: RondeauG <38501935+RondeauG@users.noreply.github.com> --- CHANGELOG.rst | 2 +- src/xscen/ensembles.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 483d37d6..86c1c549 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -8,7 +8,7 @@ Contributors to this version: Gabriel Rondeau-Genesse (:user:`RondeauG`), Juliet New features and enhancements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -* Improve ``xs.ensemles.build_partition_data``. (:pull:`504`). +* Improve ``xs.ensembles.build_partition_data``. (:pull:`504`). Breaking changes ^^^^^^^^^^^^^^^^ diff --git a/src/xscen/ensembles.py b/src/xscen/ensembles.py index 53e801ef..b21fa0f0 100644 --- a/src/xscen/ensembles.py +++ b/src/xscen/ensembles.py @@ -787,10 +787,10 @@ def build_partition_data( Parameters ---------- - datasets : list, dict, DataCatalog - List or dictionnary of Datasets or DataCatalog that will be included in the ensemble. + datasets : list[xr.Dataset], dict[str, xr.Dataset], DataCatalog + Either a list/dictionary of Datasets or a DataCatalog that will be included in the ensemble. The datasets should include the necessary ("cat:") attributes to understand their metadata. - Tip: A dictionnary can be created with `datasets = pcat.search(**search_dict).to_dataset_dict()`. + Tip: A dictionary can be created with `datasets = pcat.search(**search_dict).to_dataset_dict()`. The use of a DataCatalog is recommended for large ensembles. In that case, the ensembles will be loaded separately for each `bias_adjust_project`, @@ -804,8 +804,8 @@ def build_partition_data( Arguments to pass to `xs.spatial.subset()`. regrid_kw : dict, optional Arguments to pass to `xs.regrid_dataset()`. - Note thet regriding is computationnaly expensive. For large datasets, - it might be worth it to do do regridding first, outside of this function. + Note that regriding is computationally expensive. For large datasets, + it might be worth it to do the regridding first, outside of this function. rename_dict : dict, optional Dictionary to rename the dimensions from xscen names to xclim names. The default is {'source': 'model', 'bias_adjust_project': 'downscaling', 'experiment': 'scenario'}. @@ -840,7 +840,7 @@ def build_partition_data( else: raise ValueError( - "datasets should be a list or a dictionary of xarray datasets or a xscen.DataCatalog" + "'datasets' should be a list/dictionary of xarray datasets or a xscen.DataCatalog" ) rename_dict = rename_dict or {} From ea1a99a2f8dfb32ffc0d957dbf483ee1fbf05029 Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Wed, 15 Jan 2025 13:48:09 -0500 Subject: [PATCH 35/37] remove print --- src/xscen/io.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/xscen/io.py b/src/xscen/io.py index 484e00a4..e27f0c30 100644 --- a/src/xscen/io.py +++ b/src/xscen/io.py @@ -1053,7 +1053,6 @@ def rechunk( raise ValueError( "No chunks given. Need to give at `chunks_over_var` or `chunks_over_dim`." ) - print(ds, chunks, worker_mem, str(path_out), str(temp_store)) plan = _rechunk(ds, chunks, worker_mem, str(path_out), temp_store=str(temp_store)) plan.execute() From 0ca11fdddd1e0c851fb86f2d00aa8acceba84694 Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Wed, 15 Jan 2025 14:06:35 -0500 Subject: [PATCH 36/37] pin zarr --- environment-dev.yml | 2 +- environment.yml | 2 +- pyproject.toml | 2 +- src/xscen/ensembles.py | 23 ++++++++++++++++++++--- 4 files changed, 23 insertions(+), 6 deletions(-) diff --git a/environment-dev.yml b/environment-dev.yml index 8e234de8..eefbb629 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -33,7 +33,7 @@ dependencies: - xarray >=2023.11.0, !=2024.6.0, <2024.10.0 #FIXME: 2024.10.0 breaks rechunker with zarr, https://github.com/pangeo-data/rechunker/issues/154 - xclim >=0.54, <0.55 - xesmf >=0.7, <0.8.8 # FIXME: 0.8.8 currently creates segfaults on ReadTheDocs. - - zarr >=2.13 + - zarr >=2.13, <3.0 #FIXME: xarray is compatible with zarr 3.0 from 2025.01.1, but we pin xarray below that version # Opt - nc-time-axis >=1.3.1 - pyarrow >=10.0.1 diff --git a/environment.yml b/environment.yml index c2605af5..a74a96b7 100644 --- a/environment.yml +++ b/environment.yml @@ -33,7 +33,7 @@ dependencies: - xarray >=2023.11.0, !=2024.6.0, <2024.10.0 #FIXME: 2024.10.0 breaks rechunker with zarr - xclim >=0.54, <0.55 - xesmf >=0.7, <0.8.8 # FIXME: 0.8.8 currently creates segfaults on ReadTheDocs. - - zarr >=2.13 + - zarr >=2.13, <3.0 #FIXME: xarray is compatible with zarr 3.0 from 2025.01.1, but we pin xarray below that version # To install from source - setuptools >=65.0.0 - setuptools-scm >=8.0.0 diff --git a/pyproject.toml b/pyproject.toml index 1c9cb828..0fa70967 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -67,7 +67,7 @@ dependencies = [ "toolz", "xarray >=2023.11.0, !=2024.6.0, <2024.10.0", # FIXME: 2024.10.0 breaks rechunker with zarr "xclim >=0.54, <0.55", - "zarr >=2.13" + "zarr >=2.13, <3.0" # FIXME: xarray is compatible with zarr 3.0 from 2025.01.1, but we pin xarray below that version" ] [project.optional-dependencies] diff --git a/src/xscen/ensembles.py b/src/xscen/ensembles.py index b21fa0f0..dd50faa0 100644 --- a/src/xscen/ensembles.py +++ b/src/xscen/ensembles.py @@ -17,7 +17,7 @@ from .config import parse_config from .indicators import compute_indicators from .regrid import regrid_dataset -from .spatial import subset +from .spatial import get_grid_mapping, subset from .utils import clean_up, get_cat_attrs logger = logging.getLogger(__name__) @@ -638,8 +638,16 @@ def _partition_from_list(datasets, partition_dim, subset_kw, regrid_kw): for ds in datasets: if subset_kw: ds = subset(ds, **subset_kw) + gridmap = get_grid_mapping(ds) ds = ds.drop_vars( - ["lat", "lon", "rlat", "rlon", "rotated_pole"], errors="ignore" + [ + ds.cf["longitude"], + ds.cf["latitude"], + ds.cf.axes["X"][0], + ds.cf.axes["Y"][0], + gridmap, + ], + errors="ignore", ) if regrid_kw: @@ -741,9 +749,18 @@ def _partition_from_catalog( if subset_kw: ds = subset(ds, **subset_kw) + gridmap = get_grid_mapping(ds) ds = ds.drop_vars( - ["lat", "lon", "rlat", "rlon", "rotated_pole"], errors="ignore" + [ + ds.cf["longitude"], + ds.cf["latitude"], + ds.cf.axes["X"][0], + ds.cf.axes["Y"][0], + gridmap, + ], + errors="ignore", ) + if regrid_kw: ds = regrid_dataset(ds, **regrid_kw) From d9ab99158ff3bef7a386b3a61f310a5f5f00d6ed Mon Sep 17 00:00:00 2001 From: juliettelavoie Date: Wed, 15 Jan 2025 14:30:15 -0500 Subject: [PATCH 37/37] fix name --- src/xscen/ensembles.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/xscen/ensembles.py b/src/xscen/ensembles.py index dd50faa0..15df069c 100644 --- a/src/xscen/ensembles.py +++ b/src/xscen/ensembles.py @@ -641,8 +641,8 @@ def _partition_from_list(datasets, partition_dim, subset_kw, regrid_kw): gridmap = get_grid_mapping(ds) ds = ds.drop_vars( [ - ds.cf["longitude"], - ds.cf["latitude"], + ds.cf["longitude"].name, + ds.cf["latitude"].name, ds.cf.axes["X"][0], ds.cf.axes["Y"][0], gridmap, @@ -752,8 +752,8 @@ def _partition_from_catalog( gridmap = get_grid_mapping(ds) ds = ds.drop_vars( [ - ds.cf["longitude"], - ds.cf["latitude"], + ds.cf["longitude"].name, + ds.cf["latitude"].name, ds.cf.axes["X"][0], ds.cf.axes["Y"][0], gridmap,