From 32ce355c434f4242186812401f1ca50bf989cf44 Mon Sep 17 00:00:00 2001 From: Guillaume Maze Date: Thu, 19 Dec 2024 13:46:08 +0100 Subject: [PATCH 1/7] Update whats-new.rst --- docs/whats-new.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/whats-new.rst b/docs/whats-new.rst index 9fb5dacf..0d5d890a 100644 --- a/docs/whats-new.rst +++ b/docs/whats-new.rst @@ -25,6 +25,8 @@ Features and front-end API Internals ^^^^^^^^^ +- **Support netcdf to/from zarr conversion**. Provide low-level support to export Argo datasets to zarr files and to open zarr archive (local or remote). + - **Open netcdf files lazily**. We now provide low-level support for opening a netcdf Argo dataset lazily with `kerchunk `_. Simply use the new option ``lazy=True`` with a :class:`stores.httpstore.open_dataset` or :class:`stores.s3store.open_dataset`. (:pr:`385`) by |gmaze|. .. code-block:: python From 7166b487f536e9794b1f06576d2341328a9ddc47 Mon Sep 17 00:00:00 2001 From: Guillaume Maze Date: Thu, 19 Dec 2024 13:46:16 +0100 Subject: [PATCH 2/7] Update xarray.py --- argopy/xarray.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/argopy/xarray.py b/argopy/xarray.py index 26cca9a3..9e926130 100644 --- a/argopy/xarray.py +++ b/argopy/xarray.py @@ -1929,6 +1929,31 @@ def list_WMO(self): """Return all possible WMO as a list""" return to_list(np.unique(self._obj["PLATFORM_NUMBER"].values)) + def to_zarr(self, *args, **kwargs): + """Write Argo dataset content to a zarr group + + All arguments are passed to :meth:`xarray.to_zarr`. + + If encoding is not specified, we automatically add a ``Blosc(cname="zstd", clevel=3, shuffle=2)`` compression on + all variables of the dataset. 
+ + """ + + # Re-ensure all variables are cast properly: + self._obj = self.cast_types() + + # Define zarr compression: + if "encoding" not in kwargs: + from numcodecs import Blosc + compressor = Blosc(cname="zstd", clevel=3, shuffle=2) + encoding = {} + for v in self._obj: + encoding.update({v: {"compressor": compressor}}) + kwargs.update({'encoding': encoding}) + + # Convert to a zarr file using compression: + return self._obj.to_zarr(*args, **kwargs) + def open_Argo_dataset(filename_or_obj): ds = xr.open_dataset(filename_or_obj, decode_cf=1, use_cftime=0, mask_and_scale=1) From 1ff6fff9eb13d2dcf818b6fb66fc358ea64291d5 Mon Sep 17 00:00:00 2001 From: Guillaume Maze Date: Thu, 19 Dec 2024 13:46:30 +0100 Subject: [PATCH 3/7] add numcodecs to versions shower --- argopy/utils/locals.py | 1 + cli/show_versions | 1 + 2 files changed, 2 insertions(+) diff --git a/argopy/utils/locals.py b/argopy/utils/locals.py index f14d1b76..8508d6fc 100644 --- a/argopy/utils/locals.py +++ b/argopy/utils/locals.py @@ -183,6 +183,7 @@ def show_versions(file=sys.stdout, conda=False): # noqa: C901 [ ("boto3", get_version), ("h5netcdf", get_version), + ("numcodecs", get_version), ("s3fs", get_version), ("kerchunk", get_version), ("zarr", get_version), diff --git a/cli/show_versions b/cli/show_versions index ad238dd1..d0a8f5ff 100755 --- a/cli/show_versions +++ b/cli/show_versions @@ -168,6 +168,7 @@ def show_versions(file=sys.stdout, conda=False, free=False, core=False): # noqa [ ("zarr", get_version), ("boto3", get_version), + ("numcodecs", get_version), ("s3fs", get_version), ("kerchunk", get_version), ] From f468ee8b603cdbd28ae686af85c4f631b31191dd Mon Sep 17 00:00:00 2001 From: Guillaume Maze Date: Fri, 31 Jan 2025 16:14:30 +0100 Subject: [PATCH 4/7] Update checkers.py fux bug whereby knowns GDAC were not checked alive by isAPIconnected --- argopy/utils/checkers.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/argopy/utils/checkers.py 
b/argopy/utils/checkers.py index e5ce2a20..b105d694 100644 --- a/argopy/utils/checkers.py +++ b/argopy/utils/checkers.py @@ -441,7 +441,7 @@ def check_index_cols(column_names: list, convention: str = "ar_index_global_prof return column_names -def check_gdac_path(path, errors="ignore"): # noqa: C901 +def check_gdac_path(path, errors:str="ignore", ignore_knowns:bool=False): # noqa: C901 """Check if a path has the expected GDAC structure Expected GDAC structure:: @@ -469,8 +469,11 @@ def check_gdac_path(path, errors="ignore"): # noqa: C901 ---------- path: str Path name to check, including access protocol - errors: str - "ignore" or "raise" (or "warn") + errors: str, default="ignore" + Determine how check procedure errors are handled: "ignore", "raise" or "warn" + ignore_knowns: bool, default=False + Should the checking procedure be by-passed for the internal list of known GDACs. + Set this to True to check if a known GDAC is connected or not. Returns ------- @@ -481,7 +484,7 @@ def check_gdac_path(path, errors="ignore"): # noqa: C901 :class:`argopy.stores.gdacfs`, :meth:`argopy.utils.list_gdac_servers` """ - if path in list_gdac_servers(): + if path in list_gdac_servers() and ignore_knowns: return True else: @@ -644,7 +647,7 @@ def isAPIconnected(src="erddap", data=True): if src in list_src and getattr(list_src[src], "api_server_check", None): if src == 'gdac': - return check_gdac_path(list_src[src].api_server_check) + return check_gdac_path(list_src[src].api_server_check, ignore_knowns=True) else: return isalive(list_src[src].api_server_check) else: From e76eadfccef3abf9a6e0658f6e8fc474b2e1edc7 Mon Sep 17 00:00:00 2001 From: Guillaume Maze Date: Fri, 31 Jan 2025 16:46:53 +0100 Subject: [PATCH 5/7] update docs --- docs/api-hidden.rst | 1 + docs/api.rst | 1 + docs/conf.py | 1 + docs/energy.rst | 1 + docs/whats-new.rst | 12 +++++++++++- 5 files changed, 15 insertions(+), 1 deletion(-) diff --git a/docs/api-hidden.rst b/docs/api-hidden.rst index 767379da..850e7a4c 
100644 --- a/docs/api-hidden.rst +++ b/docs/api-hidden.rst @@ -330,6 +330,7 @@ argopy.xarray.ArgoAccessor.list_WMO_CYC argopy.xarray.ArgoAccessor.N_POINTS argopy.xarray.ArgoAccessor.N_PROF + argopy.xarray.ArgoAccessor.to_zarr argopy.xarray.ArgoEngine diff --git a/docs/api.rst b/docs/api.rst index c602d908..84a22cdb 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -207,6 +207,7 @@ Misc Dataset.argo.uid Dataset.argo.cast_types Dataset.argo.N_POINTS + Dataset.argo.to_zarr Utilities diff --git a/docs/conf.py b/docs/conf.py index 4c0988f2..41391cf9 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -404,4 +404,5 @@ 'boto3': ('https://boto3.amazonaws.com/v1/documentation/api/latest/', None), 's3fs': ('https://s3fs.readthedocs.io/en/latest/', None), 'kerchunk': ('https://fsspec.github.io/kerchunk/', None), + 'numcodecs': ('https://numcodecs.readthedocs.io/en/stable/', None), } diff --git a/docs/energy.rst b/docs/energy.rst index 05af8e47..da940f4e 100644 --- a/docs/energy.rst +++ b/docs/energy.rst @@ -32,6 +32,7 @@ All branches are also monitored. Their metrics can be summed to compute each new - `Energy used by upstream CI tests running daily and on each commit in the master branch`_ + .. |energyused_CItests| image:: https://api.green-coding.io/v1/ci/badge/get?repo=euroargodev/argopy&branch=master&workflow=22344160&mode=totals :target: https://metrics.green-coding.io/ci.html?repo=euroargodev/argopy&branch=master&workflow=22344160 diff --git a/docs/whats-new.rst b/docs/whats-new.rst index d6e16d0c..0ea22eb0 100644 --- a/docs/whats-new.rst +++ b/docs/whats-new.rst @@ -56,7 +56,17 @@ With more details: Internals ^^^^^^^^^ -- **Support netcdf to/from zarr conversion**. Provide low-level support to export Argo datasets to zarr files and to open zarr archive (local or remote). +- **Support Argo dataset export to zarr**. Provide preliminary support to export Argo datasets to zarr files (local or remote). (:pr:`423`) by |gmaze|. + +.. 
code-block:: python + :caption: Export to zarr + + from argopy import DataFetcher + ds = DataFetcher(src='gdac').float(6903091).to_xarray() + # then: + ds.argo.to_zarr("6903091_prof.zarr") + # or: + ds.argo.to_zarr("s3://argopy/sample-data/6903091_prof.zarr") - **Open netcdf files lazily**. We now provide low-level support for opening a netcdf Argo dataset lazily with `kerchunk `_. Simply use the new option ``lazy=True`` with a :class:`stores.httpstore.open_dataset` or :class:`stores.s3store.open_dataset`. (:pr:`385`) by |gmaze|. From 9ce4e8dd0711ae2f260c98e387efc7311922dd7b Mon Sep 17 00:00:00 2001 From: Guillaume Maze Date: Fri, 31 Jan 2025 16:46:55 +0100 Subject: [PATCH 6/7] Update xarray.py --- argopy/xarray.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/argopy/xarray.py b/argopy/xarray.py index 9e926130..b7af7b3a 100644 --- a/argopy/xarray.py +++ b/argopy/xarray.py @@ -5,7 +5,10 @@ import pandas as pd import xarray as xr import logging +from typing import Union from xarray.backends import BackendEntrypoint # For xarray > 0.18 +from xarray.backends import ZarrStore +from dask.delayed import Delayed try: import gsw @@ -1929,20 +1932,33 @@ def list_WMO(self): """Return all possible WMO as a list""" return to_list(np.unique(self._obj["PLATFORM_NUMBER"].values)) - def to_zarr(self, *args, **kwargs): + def to_zarr(self, *args, **kwargs) -> Union[ZarrStore, Delayed]: """Write Argo dataset content to a zarr group - All arguments are passed to :meth:`xarray.to_zarr`. + Before write operation is delegated to :class:`xarray.Dataset.to_zarr`, we perform the following: - If encoding is not specified, we automatically add a ``Blosc(cname="zstd", clevel=3, shuffle=2)`` compression on - all variables of the dataset. + - Ensure all variables are appropriately cast. + - If the ``encoding`` argument is not specified, we automatically add a ``Blosc(cname="zstd", clevel=3, shuffle=2)`` compression to all variables. 
Set `encoding=None` for no compression. + Parameters + ---------- + *args, **kwargs: + Passed to :class:`xarray.Dataset.to_zarr`. + + Returns + ------- + The output from :class:`xarray.Dataset.to_zarr` call + + See Also + -------- + :class:`xarray.Dataset.to_zarr`, :class:`numcodecs.blosc.Blosc` """ - # Re-ensure all variables are cast properly: + # Ensure that all variables are cast appropriately + # (those already cast are not changed) self._obj = self.cast_types() - # Define zarr compression: + # Add zarr compression to encoding: if "encoding" not in kwargs: from numcodecs import Blosc compressor = Blosc(cname="zstd", clevel=3, shuffle=2) From bfd79c2cdadfdc1b3dd09633a1d0eb81ad0fd555 Mon Sep 17 00:00:00 2001 From: Guillaume Maze Date: Fri, 31 Jan 2025 17:51:42 +0100 Subject: [PATCH 7/7] Update xarray.py fix bug for core env --- argopy/xarray.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/argopy/xarray.py b/argopy/xarray.py index b7af7b3a..c4ea4448 100644 --- a/argopy/xarray.py +++ b/argopy/xarray.py @@ -8,7 +8,6 @@ from typing import Union from xarray.backends import BackendEntrypoint # For xarray > 0.18 from xarray.backends import ZarrStore -from dask.delayed import Delayed try: import gsw @@ -17,6 +16,15 @@ except ModuleNotFoundError: with_gsw = False +try: + from dask.delayed import Delayed + + with_dask = True +except ModuleNotFoundError: + with_dask = False + Delayed = lambda x: x + + from .utils import is_list_of_strings from .utils import ( cast_Argo_variable_type,