Improve partition #504

Merged
merged 43 commits into from
Jan 15, 2025
Changes from 37 commits
Commits
cfd4218
add calendar
juliettelavoie Jan 19, 2024
2dede98
improve cal
juliettelavoie Jan 25, 2024
48ed051
remove chunks
juliettelavoie Jan 26, 2024
16ed840
Merge remote-tracking branch 'origin/main' into improve-partition
juliettelavoie Jan 29, 2024
34739f6
drop_vars
juliettelavoie Jan 30, 2024
c2c648e
add subcat possibility to avoid merge
juliettelavoie Feb 21, 2024
4519048
real
juliettelavoie Feb 21, 2024
123d42d
Merge remote-tracking branch 'origin/main' into improve-partition
juliettelavoie Feb 27, 2024
13fa197
add datacatalog option
juliettelavoie Feb 27, 2024
d4ffe92
add real to part_dim
juliettelavoie Mar 6, 2024
d0a097d
remove moving_rearly_window
juliettelavoie May 9, 2024
3378757
fix A-DEC
juliettelavoie May 13, 2024
5ed7105
Merge branch 'main' into improve-partition
juliettelavoie May 14, 2024
95f50f9
subdivise
juliettelavoie May 23, 2024
650803e
common_attrs
juliettelavoie May 23, 2024
b8d66d0
add ref and method to cat
juliettelavoie May 23, 2024
2a41984
index
juliettelavoie May 23, 2024
f667026
real in from cat
juliettelavoie May 24, 2024
8414c10
to level
juliettelavoie Jun 7, 2024
e27ebb7
to level type
juliettelavoie Jun 7, 2024
d9d0535
adjustment instead
juliettelavoie Jun 10, 2024
79dd1b0
subcat
juliettelavoie Jun 10, 2024
721dd0f
merge
juliettelavoie Jan 6, 2025
0f53e79
cleanup
juliettelavoie Jan 6, 2025
7894515
add tests
juliettelavoie Jan 7, 2025
cc699fc
Merge branch 'main' into improve-partition
juliettelavoie Jan 7, 2025
61eb239
pr num
juliettelavoie Jan 7, 2025
e283333
Merge remote-tracking branch 'origin/improve-partition' into improve-…
juliettelavoie Jan 7, 2025
9ecc8c9
pin xarray
juliettelavoie Jan 7, 2025
1f5baa5
pin xarray
juliettelavoie Jan 7, 2025
10cd31b
pin xarray
juliettelavoie Jan 7, 2025
7fe8abd
fix doc
juliettelavoie Jan 7, 2025
c4aaf08
changelog
juliettelavoie Jan 7, 2025
98aa4ba
update xclim v
juliettelavoie Jan 7, 2025
0edc0aa
update xclim v
juliettelavoie Jan 7, 2025
fa884c0
fix docs
juliettelavoie Jan 8, 2025
32350ff
fix test
juliettelavoie Jan 8, 2025
4ebc4c3
remove test
juliettelavoie Jan 8, 2025
afd6437
try again
juliettelavoie Jan 8, 2025
e3c8bf5
Apply suggestions from code review
juliettelavoie Jan 15, 2025
ea1a99a
remove print
juliettelavoie Jan 15, 2025
0ca11fd
pin zarr
juliettelavoie Jan 15, 2025
d9ab991
fix name
juliettelavoie Jan 15, 2025
4 changes: 2 additions & 2 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@ Changelog

v0.11.0 (unreleased)
--------------------
Contributors to this version: Gabriel Rondeau-Genesse (:user:`RondeauG`).
Contributors to this version: Gabriel Rondeau-Genesse (:user:`RondeauG`), Juliette Lavoie (:user:`juliettelavoie`).

New features and enhancements
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
* N/A
* Improve ``xs.ensembles.build_partition_data``. (:pull:`504`).

Breaking changes
^^^^^^^^^^^^^^^^
Expand Down
16 changes: 12 additions & 4 deletions docs/notebooks/4_ensembles.ipynb
Expand Up @@ -169,19 +169,28 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"is_executing": true
},
"outputs": [],
"source": [
"# Get catalog\n",
"from pathlib import Path\n",
"\n",
"import xclim as xc\n",
"\n",
"output_folder = Path().absolute() / \"_data\"\n",
"cat = xs.DataCatalog(str(output_folder / \"tutorial-catalog.json\"))\n",
"\n",
"# create a dictionary of datasets wanted for the partition\n",
"input_dict = cat.search(variable=\"tas\", member=\"r1i1p1f1\").to_dataset_dict(\n",
" xarray_open_kwargs={\"engine\": \"h5netcdf\"}\n",
")"
")\n",
"datasets = {}\n",
"for k, v in input_dict.items():\n",
" ds = xc.atmos.tg_mean(v.tas).to_dataset()\n",
" ds.attrs = v.attrs\n",
" datasets[k] = ds"
]
},
{
Expand All @@ -204,9 +213,8 @@
"import xclim as xc\n",
"\n",
"ds = xs.ensembles.build_partition_data(\n",
" input_dict,\n",
" datasets,\n",
" subset_kw=dict(name=\"mtl\", method=\"gridpoint\", lat=[45.5], lon=[-73.6]),\n",
" indicators_kw={\"indicators\": [xc.atmos.tg_mean]},\n",
")\n",
"ds"
]
Expand Down
5 changes: 3 additions & 2 deletions environment-dev.yml
Expand Up @@ -30,8 +30,8 @@ dependencies:
- shapely >=2.0
- sparse
- toolz
- xarray >=2023.11.0, !=2024.6.0
- xclim >=0.53.2, <0.54
- xarray >=2023.11.0, !=2024.6.0, <2024.10.0 #FIXME: 2024.10.0 breaks rechunker with zarr, https://github.com/pangeo-data/rechunker/issues/154
- xclim >=0.54, <0.55
- xesmf >=0.7, <0.8.8 # FIXME: 0.8.8 currently creates segfaults on ReadTheDocs.
- zarr >=2.13
# Opt
Expand All @@ -56,6 +56,7 @@ dependencies:
- pandoc
- pooch
- pre-commit >=3.5.0
- pygments <2.19 #FIXME: temporary fix, https://github.com/felix-hilden/sphinx-codeautolink/issues/153
- pytest >=8.3.2
- pytest-cov >=5.0.0
- pytest-xdist >=3.2.0
Expand Down
4 changes: 2 additions & 2 deletions environment.yml
Expand Up @@ -30,8 +30,8 @@ dependencies:
- shapely >=2.0
- sparse
- toolz
- xarray >=2023.11.0, !=2024.6.0
- xclim >=0.53.2, <0.54
- xarray >=2023.11.0, !=2024.6.0, <2024.10.0 #FIXME: 2024.10.0 breaks rechunker with zarr
- xclim >=0.54, <0.55
- xesmf >=0.7, <0.8.8 # FIXME: 0.8.8 currently creates segfaults on ReadTheDocs.
- zarr >=2.13
# To install from source
Expand Down
7 changes: 4 additions & 3 deletions pyproject.toml
Expand Up @@ -65,8 +65,8 @@ dependencies = [
"shapely >=2.0",
"sparse",
"toolz",
"xarray >=2023.11.0, !=2024.6.0",
"xclim >=0.53.2, <0.54",
"xarray >=2023.11.0, !=2024.6.0, <2024.10.0", # FIXME: 2024.10.0 breaks rechunker with zarr
"xclim >=0.54, <0.55",
"zarr >=2.13"
]

Expand Down Expand Up @@ -109,7 +109,8 @@ docs = [
"sphinx-intl",
"sphinx-mdinclude",
"sphinx-rtd-theme >=1.0",
"sphinxcontrib-napoleon"
"sphinxcontrib-napoleon",
"pygments <2.19" # FIXME: temporary fix, https://github.com/felix-hilden/sphinx-codeautolink/issues/153
]
extra = [
"xesmf>=0.7, <0.8.8" # FIXME: 0.8.8 currently creates segfaults on ReadTheDocs.
Expand Down
Binary file modified src/xscen/data/fr/LC_MESSAGES/xscen.mo
Binary file not shown.
209 changes: 173 additions & 36 deletions src/xscen/ensembles.py
Expand Up @@ -12,6 +12,8 @@
import xarray as xr
from xclim import ensembles

from .catalog import DataCatalog
from .catutils import generate_id
from .config import parse_config
from .indicators import compute_indicators
from .regrid import regrid_dataset
Expand Down Expand Up @@ -629,13 +631,150 @@ def generate_weights( # noqa: C901
return weights


def _partition_from_list(datasets, partition_dim, subset_kw, regrid_kw):
list_ds = []
# only keep attrs common to all datasets
common_attrs = False
for ds in datasets:
if subset_kw:
ds = subset(ds, **subset_kw)
ds = ds.drop_vars(
["lat", "lon", "rlat", "rlon", "rotated_pole"], errors="ignore"
)

if regrid_kw:
ds = regrid_dataset(ds, **regrid_kw)

for dim in partition_dim:
if f"cat:{dim}" in ds.attrs:
ds = ds.expand_dims(**{dim: [ds.attrs[f"cat:{dim}"]]})

if "bias_adjust_project" in ds.dims:
ds = ds.assign_coords(
adjustment=(
"bias_adjust_project",
[ds.attrs.get("cat:adjustment", np.nan)],
)
)
ds = ds.assign_coords(
reference=(
"bias_adjust_project",
[ds.attrs.get("cat:reference", np.nan)],
)
)

if "realization" in partition_dim:
new_source = f"{ds.attrs['cat:institution']}_{ds.attrs['cat:source']}_{ds.attrs['cat:member']}"
ds = ds.expand_dims(realization=[new_source])

a = ds.attrs
a.pop("intake_esm_vars", None) # remove list for intersection to work
common_attrs = dict(common_attrs.items() & a.items()) if common_attrs else a
list_ds.append(ds)
ens = xr.merge(list_ds)
ens.attrs = common_attrs
return ens
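A minimal, self-contained sketch of the promotion step in `_partition_from_list`: each `cat:` attribute named in `partition_dim` becomes a singleton dimension, and merging the expanded datasets builds the partition cube. The toy data, sources, and attribute values below are illustrative, not from the PR.

```python
import numpy as np
import xarray as xr

# Two toy datasets with catalog-style attributes, mimicking what
# _partition_from_list receives (names here are illustrative).
datasets = []
for source, experiment in [("CanESM5", "ssp245"), ("MIROC6", "ssp245")]:
    ds = xr.Dataset(
        {"tg_mean": ("time", np.random.rand(3))},
        coords={"time": np.arange(3)},
        attrs={"cat:source": source, "cat:experiment": experiment},
    )
    datasets.append(ds)

# Promote each "cat:" attribute to a singleton dimension, then merge:
# the outer join aligns the datasets into one cube with a dimension
# per partition component.
partition_dim = ["source", "experiment"]
expanded = []
for ds in datasets:
    for dim in partition_dim:
        if f"cat:{dim}" in ds.attrs:
            ds = ds.expand_dims({dim: [ds.attrs[f"cat:{dim}"]]})
    expanded.append(ds)

ens = xr.merge(expanded)
print(sorted(ens.tg_mean.dims))  # → ['experiment', 'source', 'time']
```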


def _partition_from_catalog(
datasets, partition_dim, subset_kw, regrid_kw, to_dataset_kw
):

if ("adjustment" in partition_dim or "reference" in partition_dim) and (
"bias_adjust_project" in partition_dim
):
raise ValueError(
"The partition_dim can have either adjustment and reference or bias_adjust_project, not both."
)

if ("realization" in partition_dim) and ("source" in partition_dim):
raise ValueError(
"The partition_dim can have either realization or source, not both."
)

# special case to handle source (create one dimension with institution_source_member)
ensemble_on_list = None
if "realization" in partition_dim:
partition_dim.remove("realization")
ensemble_on_list = ["institution", "source", "member"]

subcat = datasets

# get attrs that are common to all datasets
common_attrs = {}
for col, series in subcat.df.items():
if (series[0] == series).all():
common_attrs[f"cat:{col}"] = series[0]

col_id = [
(
"adjustment" if "adjustment" in partition_dim else None
), # instead of bias_adjust_project; use "adjustment", not "method", because of .sel
(
"reference" if "reference" in partition_dim else None
), # instead of bias_adjust_project
"bias_adjust_project" if "bias_adjust_project" in partition_dim else None,
"mip_era",
"activity",
"driving_model",
"institution" if "realization" in partition_dim else None,
"source",
"experiment",
"member" if "realization" in partition_dim else None,
"domain",
]

subcat.df["id"] = generate_id(subcat.df, col_id)

# create a dataset for each bias_adjust_project, modify grid and concat them
# choose dim that exists in partition_dim and first in the order of preference
order_of_preference = ["reference", "bias_adjust_project", "source"]
dim_with_different_grid = list(set(partition_dim) & set(order_of_preference))[0]

list_ds = []
for d in subcat.df[dim_with_different_grid].unique():
ds = subcat.search(**{dim_with_different_grid: d}).to_dataset(
concat_on=partition_dim,
create_ensemble_on=ensemble_on_list,
**to_dataset_kw,
)

if subset_kw:
ds = subset(ds, **subset_kw)
ds = ds.drop_vars(
["lat", "lon", "rlat", "rlon", "rotated_pole"], errors="ignore"
)
if regrid_kw:
ds = regrid_dataset(ds, **regrid_kw)

# add coords adjustment and reference
if "bias_adjust_project" in ds.dims:
ds = ds.assign_coords(
adjustment=(
"bias_adjust_project",
[ds.attrs.get("cat:adjustment", np.nan)],
)
) # use "adjustment", not "method", because of .sel
ds = ds.assign_coords(
reference=(
"bias_adjust_project",
[ds.attrs.get("cat:reference", np.nan)],
)
)
list_ds.append(ds)
ens = xr.concat(list_ds, dim=dim_with_different_grid)
ens.attrs = common_attrs
return ens
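Both helpers keep only the attributes shared by every dataset. The intersection trick can be sketched on plain dictionaries (the attribute names below are illustrative); values must be hashable for the set intersection, which is why list-valued attrs such as `intake_esm_vars` are dropped first.

```python
# Keep only the (key, value) pairs common to all attribute dictionaries,
# mirroring the common_attrs logic in the partition helpers above.
attrs_per_ds = [
    {"cat:mip_era": "CMIP6", "cat:source": "CanESM5", "cat:domain": "global"},
    {"cat:mip_era": "CMIP6", "cat:source": "MIROC6", "cat:domain": "global"},
]

common = None
for a in attrs_per_ds:
    # dict.items() views support set-style intersection.
    common = dict(common.items() & a.items()) if common is not None else dict(a)

print(sorted(common))  # → ['cat:domain', 'cat:mip_era']
```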


def build_partition_data(
datasets: dict | list[xr.Dataset],
partition_dim: list[str] = ["source", "experiment", "bias_adjust_project"],
partition_dim: list[str] = ["realization", "experiment", "bias_adjust_project"],
subset_kw: dict | None = None,
regrid_kw: dict | None = None,
indicators_kw: dict | None = None,
rename_dict: dict | None = None,
to_dataset_kw: dict | None = None,
to_level: str = "partition-ensemble",
):
"""
Get the input for the xclim partition functions.
Expand All @@ -644,29 +783,36 @@ def build_partition_data(
`partition_dim` dimensions (and time) to pass to one of the xclim partition functions
(https://xclim.readthedocs.io/en/stable/api.html#uncertainty-partitioning).
If the inputs have different grids,
they have to be subsetted and regridded to a common grid/point.
Indicators can also be computed before combining the datasets.
they have to be subsetted and/or regridded to a common grid/point.

Parameters
----------
datasets : dict
List or dictionnary of Dataset objects that will be included in the ensemble.
datasets : list, dict, DataCatalog
List or dictionary of Datasets, or a DataCatalog, that will be included in the ensemble.
The datasets should include the necessary ("cat:") attributes to understand their metadata.
Tip: With a project catalog, you can do: `datasets = pcat.search(**search_dict).to_dataset_dict()`.
partition_dim : list[str]
Tip: A dictionary can be created with `datasets = pcat.search(**search_dict).to_dataset_dict()`.

The use of a DataCatalog is recommended for large ensembles.
In that case, the datasets are loaded separately for each `bias_adjust_project`,
so the subsetting or regridding can be applied before the datasets are combined through concatenation.
If `bias_adjust_project` is not in `partition_dim`, `source` will be used instead.
partition_dim: list[str]
Components of the partition. They will become the dimension of the output.
The default is ['realization', 'experiment', 'bias_adjust_project'].
For realization, the dimension will actually be institution_source_member.
subset_kw : dict, optional
Arguments to pass to `xs.spatial.subset()`.
regrid_kw : dict, optional
Arguments to pass to `xs.regrid_dataset()`.
indicators_kw : dict, optional
Arguments to pass to `xs.indicators.compute_indicators()`.
All indicators have to be for the same frequency, in order to be put on a single time axis.
Note that regridding is computationally expensive. For large datasets,
it might be worth doing the regridding first, outside of this function.
rename_dict : dict, optional
Dictionary to rename the dimensions from xscen names to xclim names.
If None, the default is {'source': 'model', 'bias_adjust_project': 'downscaling', 'experiment': 'scenario'}.
The default is {'source': 'model', 'bias_adjust_project': 'downscaling', 'experiment': 'scenario'}.
to_dataset_kw : dict, optional
Arguments to pass to `xscen.DataCatalog.to_dataset()` if datasets is a DataCatalog.
to_level: str
The processing level of the output dataset. Default is 'partition-ensemble'.

Returns
-------
Expand All @@ -682,41 +828,32 @@ def build_partition_data(
# initialize dict
subset_kw = subset_kw or {}
regrid_kw = regrid_kw or {}
to_dataset_kw = to_dataset_kw or {}

list_ds = []
for ds in datasets:
if subset_kw:
ds = subset(ds, **subset_kw)

if regrid_kw:
ds = regrid_dataset(ds, **regrid_kw)

if indicators_kw:
dict_ind = compute_indicators(ds, **indicators_kw)
if len(dict_ind) > 1:
raise ValueError(
f"The indicators computation should return only indicators of the same frequency.Returned frequencies: {dict_ind.keys()}"
)
else:
ds = list(dict_ind.values())[0]
if isinstance(datasets, list):
ens = _partition_from_list(datasets, partition_dim, subset_kw, regrid_kw)

for dim in partition_dim:
if f"cat:{dim}" in ds.attrs:
ds = ds.expand_dims(**{dim: [ds.attrs[f"cat:{dim}"]]})
elif isinstance(datasets, DataCatalog):
ens = _partition_from_catalog(
datasets, partition_dim, subset_kw, regrid_kw, to_dataset_kw
)

if "source" in partition_dim:
new_source = f"{ds.attrs['cat:institution']}_{ds.attrs['cat:source']}_{ds.attrs['cat:member']}"
ds = ds.assign_coords(source=[new_source])
list_ds.append(ds)
ens = xr.merge(list_ds)
else:
raise ValueError(
"datasets should be a list or a dictionary of xarray Datasets, or an xscen.DataCatalog"
)

rename_dict = rename_dict or {}
rename_dict.setdefault("realization", "model")
rename_dict.setdefault("source", "model")
rename_dict.setdefault("experiment", "scenario")
rename_dict.setdefault("bias_adjust_project", "downscaling")
rename_dict = {k: v for k, v in rename_dict.items() if k in ens.dims}
ens = ens.rename(rename_dict)

ens.attrs["cat:processing_level"] = to_level
ens.attrs["cat:id"] = generate_id(ens)[0]

return ens
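The final renaming step maps xscen dimension names onto the names expected by xclim's partition functions, filtered to the dimensions actually present. A sketch in isolation (toy dataset; the mapping follows the defaults documented above):

```python
import numpy as np
import xarray as xr

# Toy ensemble with xscen-style dimensions (values are placeholders).
ens = xr.Dataset(
    {"tg_mean": (("realization", "experiment", "time"), np.zeros((2, 2, 3)))}
)

# Default xscen -> xclim name mapping; only keys present as dimensions
# on the ensemble are applied, so unused entries are harmless.
rename_dict = {
    "realization": "model",
    "source": "model",
    "experiment": "scenario",
    "bias_adjust_project": "downscaling",
}
rename_dict = {k: v for k, v in rename_dict.items() if k in ens.dims}
ens = ens.rename(rename_dict)

print(sorted(ens.tg_mean.dims))  # → ['model', 'scenario', 'time']
```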


Expand Down
2 changes: 1 addition & 1 deletion src/xscen/io.py
Expand Up @@ -1053,7 +1053,7 @@ def rechunk(
raise ValueError(
"No chunks given. Need to give at `chunks_over_var` or `chunks_over_dim`."
)

print(ds, chunks, worker_mem, str(path_out), str(temp_store))
plan = _rechunk(ds, chunks, worker_mem, str(path_out), temp_store=str(temp_store))

plan.execute()
Expand Down
7 changes: 3 additions & 4 deletions tests/test_biasadjust.py
Expand Up @@ -47,11 +47,10 @@ def test_basic_train(self, var, period):

def test_preprocess(self):

dref360 = self.dref.convert_calendar("360_day", align_on="year")

dhist360 = self.dhist.convert_calendar("360_day", align_on="year")
out = xs.train(
dref360,
self.dhist,
self.dref,
dhist360,
var="tas",
period=["2001", "2002"],
adapt_freq={"thresh": "2 K"},
Expand Down