trying to merge with main

Signed-off-by: João Lucas de Sousa Almeida <[email protected]>
IBM · Dec 20, 2024 · 9505b30 · 9505b30
2 parents d173bcd + 30dfdf1
commit 9505b30
Show file tree

Hide file tree

Showing 57 changed files with 2,892 additions and 1,163 deletions.
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
@@ -8,7 +8,7 @@ on:
   pull_request:
     branches:
       - main
-
+      - dev
 jobs:
   build:
     runs-on: ubuntu-latest
@@ -19,9 +19,9 @@ jobs:
 
     steps:
       - name: Clone repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
           cache: 'pip'

diff --git a/.gitignore b/.gitignore
@@ -12,9 +12,11 @@ venv/*
 examples/notebooks/config.yaml
 examples/notebooks/wxc_input_u_v_t_p_output_theta_uw_vw_era5_training_data_hourly_2015_constant_mu_sigma_scaling05.nc
 tests/all_ecos_random/*
+examples/**/*tif*
 **/climatology/*
 **/lightning_logs/*
 **/merra-2/*
 **/*.bin
 *.stdout
 *.log
+**/*.un~
diff --git a/README.md b/README.md
@@ -32,11 +32,11 @@ TerraTorch’s main purpose is to provide a flexible fine-tuning framework for G
 ### Pip
 In order to use the file `pyproject.toml`, `pip>=21.8` is required. If necessary, upgrade `pip` using `python -m pip install --upgrade pip`. 
 
-For a stable point-release, use `pip install terratorch`. 
-If you prefer to get the most recent version of the main branch, install the library with `pip install git+https://github.com/IBM/terratorch.git`.
+[comment]: <For a stable point-release, use `pip install terratorch`.>
+[comment]: <If you prefer to get the most recent version of the main branch, install the library with `pip install git+https://github.com/IBM/terratorch.git`.>
+To get the most recent version of the main branch, install the library with `pip install git+https://github.com/IBM/terratorch.git`.
 
-Another alternative is to install using [pipx](https://github.com/pypa/pipx) via `pipx install terratorch`, which creates an isolated environment and allows the user to run the application as 
-a common CLI tool, with no need of installing dependencies or activating environments. 
+[comment]: <Another alternative is to install using [pipx](https://github.com/pypa/pipx) via `pipx install terratorch`, which creates an isolated environment and allows the user to run the application as a common CLI tool, with no need of installing dependencies or activating environments.>
 
 TerraTorch requires gdal to be installed, which can be quite a complex process. If you don't have GDAL set up on your system, we recommend using a conda environment and installing it with `conda install -c conda-forge gdal`.
 

diff --git a/contribution_process.md b/contribution_process.md
@@ -6,4 +6,7 @@ If you want to contribute to this project, there are many valuable ways in doing
 1. Use / test TerraTorch and create an [Issue](https://github.com/IBM/terratorch/issues) if something is not working properly or if you have an idea for a feature request.
 1. Pick an [Issue](https://github.com/IBM/terratorch/issues) and start contributing
 
-Contributions are welcome as pull requests on a [fork](https://github.com/IBM/terratorch/fork) of this project. Ideally, pull requests are backed by an [Issue](https://github.com/IBM/terratorch/issues). You can also tag the [code owners](https://github.com/IBM/terratorch/blob/main/CODEOWNERS) in the issue before you start, so we can talk about the details (in case you can't join one of the community calls).
+Contributions are welcome as pull requests on a [fork](https://github.com/IBM/terratorch/fork) of this project. Ideally, pull requests are backed by an [Issue](https://github.com/IBM/terratorch/issues). You can also tag the [code owners](https://github.com/IBM/terratorch/blob/main/CODEOWNERS) in the issue before you start, so we can talk about the details (in case you can't join one of the community calls).
+
+After or during implementation on your branch, please create a PR to main. During development, please mark this PR as DRAFT and prefix its title with '[WIP]'.
+If you want us to merge the PR, remove 'draft' and '[WIP]'. Before that, please make sure that all tests are passing. Unit tests are automatically run on GitHub on the branch as well. The TerraTorch committers will review your code and will run integration tests on our GPU cluster before we merge to main.
diff --git a/pyproject.toml b/pyproject.toml
@@ -7,21 +7,24 @@ include = ["terratorch*"]
 
 [project]
 name = "terratorch"
-version = "0.99.5"
+version = "0.99.7"
 description = "TerraTorch - A model training toolkit for geospatial tasks"
 license = { "text" = "Apache License, Version 2.0" }
 readme = "README.md"
 requires-python = ">=3.10"
 keywords = ["fine-tuning", "geospatial foundation models", "artificial intelligence"]
 authors = [
   { name = "Carlos Gomes", email = "[email protected]" },
-  { name = "Joao Lucas de Sousa Almeida", email = "[email protected]"}
+  { name = "Joao Lucas de Sousa Almeida", email = "[email protected]"},
+  { name = "Romeo Kienzler"},
+  { name = "Benedikt Blumenstiel"}
 ]
 classifiers = [
   "Development Status :: 2 - Pre-Alpha",
   "Programming Language :: Python",
   "Programming Language :: Python :: 3.10",
   "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
   "Programming Language :: Python :: Implementation :: CPython",
 ]
 
@@ -65,8 +68,8 @@ mmseg = [
 ]
 
 wxc = [
-  "prithviwxc @ git+https://github.com/NASA-IMPACT/Prithvi-WxC.git",
-  "granitewxc @ git+https://github.com/IBM/granite-wxc.git"
+  "PrithviWxC",
+  "granitewxc"
 ]
 
 #geobench = [
@@ -171,7 +174,7 @@ exclude_lines = [
 ]
 
 [tool.bumpver]
-current_version = "0.99.4"
+# Keep in sync with [project].version.
+current_version = "0.99.7"
 version_pattern = "MAJOR.MINOR.PATCH[PYTAGNUM]"
 commit_message = "Bump version {old_version} -> {new_version}"
 commit = true

diff --git a/requirements/dev.txt b/requirements/dev.txt
@@ -1,4 +1,4 @@
 mkdocs-material==9.4.14
 mkdocstrings[python]
-h5py==3.10.0
+h5py==3.12.1
 jupyterlab
diff --git a/terratorch/cli_tools.py b/terratorch/cli_tools.py
@@ -56,7 +56,7 @@
     SemanticSegmentationTask,  # noqa: F401
 )
 
-logger = logging.getLogger(__name__)
+logger = logging.getLogger("terratorch")
 
 def flatten(list_of_lists):
     return list(itertools.chain.from_iterable(list_of_lists))
@@ -101,7 +101,8 @@ def save_prediction(prediction, input_file_name, out_dir, dtype:str="int16"):
     logger.info(f"Saving output to {out_file_name} ...")
     write_tiff(result, os.path.join(out_dir, out_file_name), metadata)
 
-def import_custom_modules(custom_modules_path:None | Path | str =None) -> None:
+
+def import_custom_modules(custom_modules_path: str | Path | None = None) -> None:
 
     if custom_modules_path:
 
@@ -123,7 +124,7 @@ def import_custom_modules(custom_modules_path:None | Path | str =None) -> None:
         else:
             raise ValueError(f"Modules path {custom_modules_path} isn't a directory. Check if you have defined it properly.")
     else:
-        logger.info("No custom module is being used.")
+        logger.debug("No custom module is being used.")
 
 class CustomWriter(BasePredictionWriter):
     """Callback class to write geospatial data to file."""
@@ -385,12 +386,16 @@ def instantiate_classes(self) -> None:
             self.trainer.deploy_config = config.deploy_config_file
 
         # Custom modules path
-        if hasattr(self.config.fit, "custom_modules_path"):
-
-            custom_modules_path =  self.config.fit.custom_modules_path
+        if hasattr(self.config, "fit") and hasattr(self.config.fit, "custom_modules_path"):
+            custom_modules_path = self.config.fit.custom_modules_path
+        elif hasattr(self.config, "validate") and hasattr(self.config.validate, "custom_modules_path"):
+            custom_modules_path = self.config.validate.custom_modules_path
+        elif hasattr(self.config, "test") and hasattr(self.config.test, "custom_modules_path"):
+            custom_modules_path = self.config.test.custom_modules_path
+        elif hasattr(self.config, "predict") and hasattr(self.config.predict, "custom_modules_path"):
+            custom_modules_path = self.config.predict.custom_modules_path
         else:
-            default_path = Path(".") / "custom_modules"
-            custom_modules_path = os.environ.get("TERRATORCH_CUSTOM_MODULE_PATH", default_path)
+            custom_modules_path = os.getenv("TERRATORCH_CUSTOM_MODULE_PATH", None)
 
         import_custom_modules(custom_modules_path)
 

diff --git a/terratorch/datamodules/__init__.py b/terratorch/datamodules/__init__.py
@@ -32,7 +32,8 @@
     wxc_present = True
     from terratorch.datamodules.merra2_downscale import Merra2DownscaleNonGeoDataModule 
 except ImportError as e:
-    print('wxc_downscaling not installed')
+    import logging
+    logging.getLogger("terratorch").debug("wxc_downscaling not installed")
     wxc_present = False
 
 # GenericNonGeoRegressionDataModule,
@@ -42,6 +43,15 @@
 from terratorch.datamodules.generic_multimodal_data_module import GenericMultiModalDataModule
 
 
+# miscellaneous datamodules
+from terratorch.datamodules.openearthmap import OpenEarthMapNonGeoDataModule
+
+from terratorch.datamodules.burn_intensity import BurnIntensityNonGeoDataModule
+from terratorch.datamodules.carbonflux import CarbonFluxNonGeoDataModule
+from terratorch.datamodules.landslide4sense import Landslide4SenseNonGeoDataModule
+from terratorch.datamodules.biomassters import BioMasstersNonGeoDataModule
+from terratorch.datamodules.forestnet import ForestNetNonGeoDataModule
+
 # miscellaneous datamodules
 from terratorch.datamodules.openearthmap import OpenEarthMapNonGeoDataModule
 
@@ -54,6 +64,11 @@
     "GenericNonGeoSegmentationDataModule",
     "GenericNonGeoClassificationDataModule",
     # "GenericNonGeoRegressionDataModule",
+    "BurnIntensityNonGeoDataModule",
+    "CarbonFluxNonGeoDataModule",
+    "Landslide4SenseNonGeoDataModule",
+    "ForestNetNonGeoDataModule",
+    "BioMasstersNonGeoDataModule",
     "Sen1Floods11NonGeoDataModule",
     "Sen4MapLucasDataModule",
     "FireScarsNonGeoDataModule",

diff --git a/terratorch/datamodules/biomassters.py b/terratorch/datamodules/biomassters.py
@@ -0,0 +1,190 @@
+from collections.abc import Sequence
+from typing import Any
+
+import albumentations as A
+from torch.utils.data import DataLoader
+
+from terratorch.datamodules.generic_multimodal_data_module import MultimodalNormalize, wrap_in_compose_is_list
+from terratorch.datamodules.generic_pixel_wise_data_module import Normalize
+from terratorch.datasets import BioMasstersNonGeo
+from torchgeo.datamodules import NonGeoDataModule
+from torchgeo.transforms import AugmentationSequential
+
+# Per-band mean values, keyed by sensor ("S1", "S2") and then by band name.
+# BioMasstersNonGeoDataModule looks these up to build its default
+# normalization; "AGBM" is the value it hands to the dataset as `mask_mean`.
+# NOTE(review): units and the data these statistics were computed on are not
+# stated here -- confirm before reusing them elsewhere.
+MEANS = {
+    "AGBM": 63.4584,
+    "S1": {
+        "VV_Asc": 0.08871397,
+        "VH_Asc": 0.02172604,
+        "VV_Desc": 0.08556002,
+        "VH_Desc": 0.02795591,
+        "RVI_Asc": 0.75507677,
+        "RVI_Desc": 0.6600374
+    },
+    "S2": {
+        "BLUE": 1633.0802,
+        "GREEN": 1610.0035,
+        "RED": 1599.557,
+        "RED_EDGE_1": 1916.7083,
+        "RED_EDGE_2": 2478.8325,
+        "RED_EDGE_3": 2591.326,
+        "NIR_BROAD": 2738.5837,
+        "NIR_NARROW": 2685.8281,
+        "SWIR_1": 1023.90204,
+        "SWIR_2": 696.48755,
+        "CLOUD_PROBABILITY": 21.177078
+    }
+}
+
+# Per-band standard deviations, with the same layout as MEANS: keyed by sensor
+# ("S1", "S2") and then by band name. BioMasstersNonGeoDataModule pairs them
+# with MEANS for its default normalization; "AGBM" is the value it hands to
+# the dataset as `mask_std`.
+STDS = {
+    "AGBM": 72.21242,
+    "S1": {
+        "VV_Asc": 0.16714208,
+        "VH_Asc": 0.04876742,
+        "VV_Desc": 0.19260046,
+        "VH_Desc": 0.10272296,
+        "RVI_Asc": 0.24945821,
+        "RVI_Desc": 0.3590119
+    },
+    "S2": {
+        "BLUE": 2499.7146,
+        "GREEN": 2308.5298,
+        "RED": 2388.2268,
+        "RED_EDGE_1": 2389.6375,
+        "RED_EDGE_2": 2209.6467,
+        "RED_EDGE_3": 2104.572,
+        "NIR_BROAD": 2194.209,
+        "NIR_NARROW": 2031.7762,
+        "SWIR_1": 934.0556,
+        "SWIR_2": 759.8444,
+        "CLOUD_PROBABILITY": 49.352486
+    }
+}
+
+class BioMasstersNonGeoDataModule(NonGeoDataModule):
+    """NonGeo datamodule implementation for BioMassters.
+
+    Instantiates `BioMasstersNonGeo` for the train/val/test stages and sets up
+    a normalization augmentation from the module-level `MEANS` / `STDS` tables.
+    """
+
+    default_metadata_filename = "The_BioMassters_-_features_metadata.csv.csv"
+
+    def __init__(
+        self,
+        data_root: str,
+        batch_size: int = 4,
+        num_workers: int = 0,
+        bands: dict[str, Sequence[str]] | Sequence[str] = BioMasstersNonGeo.all_band_names,
+        train_transform: A.Compose | None | list[A.BasicTransform] = None,
+        val_transform: A.Compose | None | list[A.BasicTransform] = None,
+        test_transform: A.Compose | None | list[A.BasicTransform] = None,
+        aug: AugmentationSequential = None,
+        drop_last: bool = True,
+        sensors: Sequence[str] = ["S1", "S2"],
+        as_time_series: bool = False,
+        metadata_filename: str = default_metadata_filename,
+        max_cloud_percentage: float | None = None,
+        max_red_mean: float | None = None,
+        include_corrupt: bool = True,
+        subset: float = 1,
+        seed: int = 42,
+        use_four_frames: bool = False,
+        **kwargs: Any,
+    ) -> None:
+        """Store the configuration and derive the normalization statistics.
+
+        Args:
+            data_root: Dataset root, forwarded to the dataset as `root`.
+            batch_size: Batch size handed to `NonGeoDataModule`.
+            num_workers: Worker count handed to `NonGeoDataModule` and used by
+                the dataloaders.
+            bands: Either a mapping sensor -> band names, or a flat sequence of
+                band names, which is then attributed to `sensors[0]`.
+            train_transform: Transform(s) for the train dataset, processed by
+                `wrap_in_compose_is_list`.
+            val_transform: Same as `train_transform`, for the validation dataset.
+            test_transform: Same as `train_transform`, for the test dataset.
+            aug: Augmentation stored as `self.aug`. When None, a `Normalize`
+                (single sensor) or `MultimodalNormalize` (several sensors) built
+                from `MEANS` / `STDS` is used instead.
+            drop_last: Whether the train dataloader drops its last incomplete
+                batch; the other dataloaders never drop it.
+            sensors: Sensors to load; each must be a key of `MEANS` / `STDS`
+                and of the resolved band mapping.
+            as_time_series: Forwarded unchanged to the dataset.
+            metadata_filename: Forwarded unchanged to the dataset.
+            max_cloud_percentage: Forwarded unchanged to the dataset.
+            max_red_mean: Forwarded unchanged to the dataset.
+            include_corrupt: Forwarded unchanged to the dataset.
+            subset: Forwarded unchanged to the dataset.
+            seed: Forwarded unchanged to the dataset.
+            use_four_frames: Forwarded unchanged to the dataset.
+            **kwargs: Extra keyword arguments for `NonGeoDataModule`.
+        """
+        super().__init__(BioMasstersNonGeo, batch_size, num_workers, **kwargs)
+        self.data_root = data_root
+        self.sensors = sensors
+        if isinstance(bands, dict):
+            self.bands = bands
+        else:
+            # A flat band list carries no sensor information, so it is
+            # attributed to the first requested sensor.
+            sens = sensors[0]
+            self.bands = {sens: bands}
+
+        self.means = {}
+        self.stds = {}
+        for sensor in self.sensors:
+            self.means[sensor] = [MEANS[sensor][band] for band in self.bands[sensor]]
+            self.stds[sensor] = [STDS[sensor][band] for band in self.bands[sensor]]
+
+        self.mask_mean = MEANS["AGBM"]
+        self.mask_std = STDS["AGBM"]
+        self.train_transform = wrap_in_compose_is_list(train_transform)
+        self.val_transform = wrap_in_compose_is_list(val_transform)
+        self.test_transform = wrap_in_compose_is_list(test_transform)
+        # `self.aug` must be assigned in both branches; otherwise neither the
+        # default normalization nor a caller-supplied `aug` takes effect when
+        # several sensors are requested.
+        if len(sensors) == 1:
+            self.aug = Normalize(self.means[sensors[0]], self.stds[sensors[0]]) if aug is None else aug
+        else:
+            self.aug = MultimodalNormalize(self.means, self.stds) if aug is None else aug
+        self.drop_last = drop_last
+        self.as_time_series = as_time_series
+        self.metadata_filename = metadata_filename
+        self.max_cloud_percentage = max_cloud_percentage
+        self.max_red_mean = max_red_mean
+        self.include_corrupt = include_corrupt
+        self.subset = subset
+        self.seed = seed
+        self.use_four_frames = use_four_frames
+
+    def _build_dataset(self, split: str, transform: Any):
+        """Instantiate `self.dataset_class` for `split` with the stored configuration."""
+        return self.dataset_class(
+            split=split,
+            root=self.data_root,
+            transform=transform,
+            bands=self.bands,
+            mask_mean=self.mask_mean,
+            mask_std=self.mask_std,
+            sensors=self.sensors,
+            as_time_series=self.as_time_series,
+            metadata_filename=self.metadata_filename,
+            max_cloud_percentage=self.max_cloud_percentage,
+            max_red_mean=self.max_red_mean,
+            include_corrupt=self.include_corrupt,
+            subset=self.subset,
+            seed=self.seed,
+            use_four_frames=self.use_four_frames,
+        )
+
+    def setup(self, stage: str) -> None:
+        """Create the datasets required for `stage`.
+
+        Args:
+            stage: "fit" builds the train and validation datasets, "validate"
+                builds the validation dataset, and "test" builds the test
+                dataset. Any other value builds nothing.
+        """
+        if stage in ["fit"]:
+            self.train_dataset = self._build_dataset("train", self.train_transform)
+        if stage in ["fit", "validate"]:
+            # Validation is drawn from the "test" split.
+            self.val_dataset = self._build_dataset("test", self.val_transform)
+        if stage in ["test"]:
+            self.test_dataset = self._build_dataset("test", self.test_transform)
+
+    def _dataloader_factory(self, split: str):
+        """Build the `DataLoader` for `split`.
+
+        Only the train loader shuffles, and only the train loader drops its
+        last batch (when `self.drop_last` is set).
+        """
+        dataset = self._valid_attribute(f"{split}_dataset", "dataset")
+        batch_size = self._valid_attribute(f"{split}_batch_size", "batch_size")
+        return DataLoader(
+            dataset=dataset,
+            batch_size=batch_size,
+            shuffle=split == "train",
+            num_workers=self.num_workers,
+            collate_fn=self.collate_fn,
+            drop_last=split =="train" and self.drop_last,
+        )