Merge branch 'develop' into activation_functions_config

ecmwf · Oct 28, 2024 · f6a5fe5 · f6a5fe5
2 parents ac232b2 + ed56f9d
commit f6a5fe5
Show file tree

Hide file tree

Showing 46 changed files with 899 additions and 259 deletions.
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
@@ -1,6 +1,6 @@
 # CODEOWNERS file
 
 # Protect workflow files
-/.github/ @theissenhelen @jesperdramsch @gmertes
-/.pre-commit-config.yaml @theissenhelen @jesperdramsch @gmertes
-/pyproject.toml @theissenhelen @jesperdramsch @gmertes
+/.github/ @theissenhelen @jesperdramsch @gmertes @b8raoult @floriankrb @anaprietonem @HCookie @JPXKQX @mchantry
+/.pre-commit-config.yaml @theissenhelen @jesperdramsch @gmertes @b8raoult @floriankrb @anaprietonem @HCookie @JPXKQX @mchantry
+/pyproject.toml @theissenhelen @jesperdramsch @gmertes @b8raoult @floriankrb @anaprietonem @HCookie @JPXKQX @mchantry
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
@@ -15,7 +15,12 @@ jobs:
       skip-hooks: "no-commit-to-branch"
 
   checks:
+    strategy:
+      matrix:
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
     uses: ecmwf-actions/reusable-workflows/.github/workflows/qa-pytest-pyproject.yml@v2
+    with:
+      python-version: ${{ matrix.python-version }}
 
   deploy:
     needs: [checks, quality]

diff --git a/.github/workflows/python-pull-request.yml b/.github/workflows/python-pull-request.yml
@@ -16,4 +16,9 @@ jobs:
       skip-hooks: "no-commit-to-branch"
 
   checks:
+    strategy:
+      matrix:
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
     uses: ecmwf-actions/reusable-workflows/.github/workflows/qa-pytest-pyproject.yml@v2
+    with:
+      python-version: ${{ matrix.python-version }}
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -5,12 +5,12 @@ repos:
   - id: clear-notebooks-output
     name: clear-notebooks-output
     files: tools/.*\.ipynb$
-    stages: [commit]
+    stages: [pre-commit]
     language: python
     entry: jupyter nbconvert --ClearOutputPreprocessor.enabled=True --inplace
     additional_dependencies: [jupyter]
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v4.6.0
+  rev: v5.0.0
   hooks:
   - id: check-yaml # Check YAML files for syntax errors only
     args: [--unsafe, --allow-multiple-documents]
@@ -40,7 +40,7 @@ repos:
     - --force-single-line-imports
     - --profile black
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.6.4
+  rev: v0.6.9
   hooks:
   - id: ruff
     # Next line is for documentation code snippets
@@ -66,11 +66,11 @@ repos:
   - id: docconvert
     args: ["numpy"]
 - repo: https://github.com/tox-dev/pyproject-fmt
-  rev: "2.2.3"
+  rev: "2.2.4"
   hooks:
   - id: pyproject-fmt
 -   repo: https://github.com/jshwi/docsig # Check docstrings against function sig
-    rev: v0.60.1
+    rev: v0.64.0
     hooks:
     -   id: docsig
         args:

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,11 +8,44 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 Please add your functional changes to the appropriate section in the PR.
 Keep it human-readable, your future self will thank you!
 
-## [Unreleased](https://github.com/ecmwf/anemoi-training/compare/0.1.0...HEAD)
+## [Unreleased](https://github.com/ecmwf/anemoi-training/compare/0.2.2...HEAD)
+
+## [0.2.2 - Maintenance: pin python <3.13](https://github.com/ecmwf/anemoi-training/compare/0.2.1...0.2.2) - 2024-10-28
+
+### Changed
+
+- Lock python version <3.13 [#107](https://github.com/ecmwf/anemoi-training/pull/107)
+
+## [0.2.1 - Bugfix: resuming mlflow runs](https://github.com/ecmwf/anemoi-training/compare/0.2.0...0.2.1) - 2024-10-24
+
+### Added
+
+- Mlflow-sync to include new tag for server to server syncing [#83](https://github.com/ecmwf/anemoi-training/pull/83)
+- Mlflow-sync to include functionality to resume and fork server2server runs [#83](https://github.com/ecmwf/anemoi-training/pull/83)
+- Rollout training for Limited Area Models. [#79](https://github.com/ecmwf/anemoi-training/pull/79)
+- Feature: New `Boolean1DMask` class. Enables rollout training for limited area models. [#79](https://github.com/ecmwf/anemoi-training/pull/79)
+
+### Fixed
+
+- Mlflow-sync to handle creation of new experiments in the remote server [#83](https://github.com/ecmwf/anemoi-training/pull/83)
+- Fix for multi-gpu when using mlflow due to refactoring of _get_mlflow_run_params function [#99](https://github.com/ecmwf/anemoi-training/pull/99)
+- ci: fix pyshtools install error [#100](https://github.com/ecmwf/anemoi-training/pull/100)
+- Fix `__version__` import in init
+
+### Changed
+
+- Update copyright notice
+
+## [0.2.0 - Feature release](https://github.com/ecmwf/anemoi-training/compare/0.1.0...0.2.0) - 2024-10-16
+
+- Make pin_memory of the Dataloader configurable (#64)
 
 ### Added
+
+- Add anemoi-transform link to documentation
 - Codeowners file (#56)
 - Changelog merge strategy (#56)
+- Contributors file (#106)
 
 #### Miscellaneous
 
@@ -26,7 +59,9 @@ Keep it human-readable, your future self will thank you!
 - Enforce same binning for histograms comparing true data to predicted data
 - Fix: Inference checkpoints are now saved according to the frequency settings defined in the config [#37](https://github.com/ecmwf/anemoi-training/pull/37)
 - Feature: Add configurable models [#50](https://github.com/ecmwf/anemoi-training/pull/50)
+- Feature: Authentication support for mlflow sync - [#51](https://github.com/ecmwf/anemoi-training/pull/51)
 - Feature: Support training for datasets with missing time steps [#48](https://github.com/ecmwf/anemoi-training/pull/48)
+- Feature: `AnemoiMlflowClient`, an mlflow client with authentication support [#86](https://github.com/ecmwf/anemoi-training/pull/86)
 - Long Rollout Plots
 
 ### Fixed
@@ -35,10 +70,14 @@ Keep it human-readable, your future self will thank you!
 - Bugfixes for CI (#56)
 - Fix `mlflow` subcommand on python 3.9 [#62](https://github.com/ecmwf/anemoi-training/pull/62)
 - Show correct subcommand in MLFlow - Addresses [#39](https://github.com/ecmwf/anemoi-training/issues/39) in [#61](https://github.com/ecmwf/anemoi-training/pull/61)
+- Fix interactive multi-GPU training [#82](https://github.com/ecmwf/anemoi-training/pull/82)
+- Allow 500 characters in mlflow logging [#88](https://github.com/ecmwf/anemoi-training/pull/88)
 
 ### Changed
 
 - Updated configuration examples in documentation and corrected links - [#46](https://github.com/ecmwf/anemoi-training/pull/46)
+- Remove credential prompt from mlflow login, replace with seed refresh token via web - [#78](https://github.com/ecmwf/anemoi-training/pull/78)
+- Update CODEOWNERS
 
 ## [0.1.0 - Anemoi training - First release](https://github.com/ecmwf/anemoi-training/releases/tag/0.1.0) - 2024-08-16
 
@@ -52,6 +91,7 @@ Keep it human-readable, your future self will thank you!
 - Subcommand for checkpoint handling
 
 #### Functionality
+
 - Searchpaths for Hydra configs, to enable configs in CWD, `ANEMOI_CONFIG_PATH` env, and `.config/anemoi/training` in addition to package defaults
 - MlFlow token authentication
 - Configurable pressure level scaling

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
@@ -0,0 +1,13 @@
+## How to Contribute
+
+Please see the [contributing guide on Read the Docs](https://anemoi-training.readthedocs.io/en/latest/dev/contributing.html).
+
+
+## Contributors
+
+Thank you to all the wonderful people who have contributed to Anemoi. Contributions can come in many forms, including code, documentation, bug reports, feature suggestions, design, and more. A list of code-based contributors can be found [here](https://github.com/ecmwf/anemoi-training/graphs/contributors).
+
+
+## Contributing Organisations
+
+Significant contributions have been made by the following organisations: [DWD](https://www.dwd.de/), [MET Norway](https://www.met.no/), [MeteoSwiss](https://www.meteoswiss.admin.ch/), [RMI](https://www.meteo.be/) & [ECMWF](https://www.ecmwf.int/).
diff --git a/README.md b/README.md
@@ -22,7 +22,7 @@ $ pip install anemoi-training
 ## License
 
 ```
-Copyright 2022, European Centre for Medium Range Weather Forecasts.
+Copyright 2024, Anemoi contributors.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.

diff --git a/docs/conf.py b/docs/conf.py
@@ -101,6 +101,10 @@
         "https://anemoi-registry.readthedocs.io/en/latest/",
         ("../../anemoi-registry/docs/_build/html/objects.inv", None),
     ),
+    "anemoi-transform": (
+        "https://anemoi-transform.readthedocs.io/en/latest/",
+        ("../../anemoi-transform/docs/_build/html/objects.inv", None),
+    ),
 }
 
 # -- Options for HTML output -------------------------------------------------

diff --git a/docs/index.rst b/docs/index.rst
@@ -67,6 +67,7 @@ This package provides the *Anemoi* training functionality.
 *****************
 
 -  :ref:`anemoi-utils <anemoi-utils:index-page>`
+-  :ref:`anemoi-transform <anemoi-transform:index-page>`
 -  :ref:`anemoi-datasets <anemoi-datasets:index-page>`
 -  :ref:`anemoi-models <anemoi-models:index-page>`
 -  :ref:`anemoi-graphs <anemoi-graphs:index-page>`

diff --git a/pyproject.toml b/pyproject.toml
@@ -23,7 +23,7 @@ authors = [
   { name = "European Centre for Medium-Range Weather Forecasts (ECMWF)", email = "[email protected]" },
 ]
 
-requires-python = ">=3.9"
+requires-python = ">=3.9,<3.13" # Unable to use 3.13 until pyshtools updates
 
 classifiers = [
   "Development Status :: 4 - Beta",
@@ -83,6 +83,10 @@ urls.Documentation = "https://anemoi-training.readthedocs.io/"
 urls.Homepage = "https://github.com/ecmwf/anemoi-training/"
 urls.Issues = "https://github.com/ecmwf/anemoi-training/issues"
 urls.Repository = "https://github.com/ecmwf/anemoi-training/"
+# command for interactive DDP (not supposed to be used directly)
+# the dot is intentional, so it doesn't trigger autocomplete
+scripts.".anemoi-training-train" = "anemoi.training.commands.train:main"
+
 # Add subcommand in the `commands` directory
 scripts.anemoi-training = "anemoi.training.__main__:main"
 

diff --git a/src/anemoi/training/__init__.py b/src/anemoi/training/__init__.py
@@ -6,4 +6,10 @@
 # nor does it submit to any jurisdiction.
 
 
-from ._version import __version__  # noqa: F401
+try:
+    # NOTE: the `_version.py` file must not be present in the git repository
+    #   as it is generated by setuptools at install time
+    from ._version import __version__  # type: ignore
+except ImportError:  # pragma: no cover
+    # Local copy or not installed with setuptools
+    __version__ = "999"
diff --git a/src/anemoi/training/commands/checkpoint.py b/src/anemoi/training/commands/checkpoint.py
@@ -1,10 +1,13 @@
-# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts.
+# (C) Copyright 2024 Anemoi contributors.
+#
 # This software is licensed under the terms of the Apache Licence Version 2.0
 # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
 # In applying this licence, ECMWF does not waive the privileges and immunities
 # granted to it by virtue of its status as an intergovernmental organisation
 # nor does it submit to any jurisdiction.
 
+
 import argparse
 import logging
 

diff --git a/src/anemoi/training/commands/config.py b/src/anemoi/training/commands/config.py
@@ -1,10 +1,13 @@
-# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts.
+# (C) Copyright 2024 Anemoi contributors.
+#
 # This software is licensed under the terms of the Apache Licence Version 2.0
 # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
 # In applying this licence, ECMWF does not waive the privileges and immunities
 # granted to it by virtue of its status as an intergovernmental organisation
 # nor does it submit to any jurisdiction.
 
+
 from __future__ import annotations
 
 import importlib.resources as pkg_resources

diff --git a/src/anemoi/training/commands/mlflow.py b/src/anemoi/training/commands/mlflow.py
@@ -1,10 +1,13 @@
-# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts.
+# (C) Copyright 2024 Anemoi contributors.
+#
 # This software is licensed under the terms of the Apache Licence Version 2.0
 # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
 # In applying this licence, ECMWF does not waive the privileges and immunities
 # granted to it by virtue of its status as an intergovernmental organisation
 # nor does it submit to any jurisdiction.
 
+
 import argparse
 
 from anemoi.training.commands import Command
@@ -45,23 +48,39 @@ def add_arguments(command_parser: argparse.ArgumentParser) -> None:
             "--source",
             "-s",
             help="The MLflow logs source directory.",
+            metavar="DIR",
             required=True,
             default=argparse.SUPPRESS,
         )
         sync.add_argument(
             "--destination",
             "-d",
             help="The destination MLflow tracking URI.",
+            metavar="URI",
+            required=True,
+            default=argparse.SUPPRESS,
+        )
+        sync.add_argument(
+            "--run-id",
+            "-r",
+            help="The run ID to sync.",
+            metavar="ID",
             required=True,
             default=argparse.SUPPRESS,
         )
-        sync.add_argument("--run-id", "-r", help="The run ID to sync.", required=True, default=argparse.SUPPRESS)
         sync.add_argument(
             "--experiment-name",
             "-e",
             help="The experiment name to sync to.",
+            metavar="NAME",
             default="anemoi-debug",
         )
+        sync.add_argument(
+            "--authentication",
+            "-a",
+            action="store_true",
+            help="The destination server requires authentication.",
+        )
         sync.add_argument(
             "--export-deleted-runs",
             "-x",
@@ -88,8 +107,18 @@ def run(args: argparse.Namespace) -> None:
             return
 
         if args.subcommand == "sync":
+            from anemoi.training.diagnostics.mlflow.utils import health_check
             from anemoi.training.utils.mlflow_sync import MlFlowSync
 
+            if args.authentication:
+                from anemoi.training.diagnostics.mlflow.auth import TokenAuth
+
+                auth = TokenAuth(url=args.destination)
+                auth.login()
+                auth.authenticate()
+
+            health_check(args.destination)
+
             log_level = "DEBUG" if args.verbose else "INFO"
 
             MlFlowSync(