Adding padding at the input when necessary #342

Open · wants to merge 28 commits into base: main
Commits (28)
842941b
Adding padding at the input when necessary
Joao-L-S-Almeida Dec 23, 2024
6528861
patch_size as a explicit argument for PixelWiseModel
Joao-L-S-Almeida Jan 2, 2025
8376a5e
logging
Joao-L-S-Almeida Jan 2, 2025
8fa3bba
Cropping image
Joao-L-S-Almeida Jan 2, 2025
6fb8c95
cropping image for scaler model
Joao-L-S-Almeida Jan 2, 2025
5f37ba7
patch_size could be None
Joao-L-S-Almeida Jan 3, 2025
5cb27dc
Adapting the Clay factory to support patch_size and minor adjusts
Joao-L-S-Almeida Jan 3, 2025
ba43134
Trying to reduce the cost of these tests
Joao-L-S-Almeida Jan 3, 2025
9c26eab
pad_images must be in utils.py
Joao-L-S-Almeida Jan 3, 2025
c4fd736
Cropping images could be a necessary operation
Joao-L-S-Almeida Jan 6, 2025
b70a368
The cropping must be placed before the head in case of scalar models
Joao-L-S-Almeida Jan 6, 2025
ecca3aa
Creating extra images for tests
Joao-L-S-Almeida Jan 6, 2025
e09cd79
Minor changes
Joao-L-S-Almeida Jan 6, 2025
6fdf1b7
img_size also could be necessary
Joao-L-S-Almeida Jan 6, 2025
6178b47
conditional cropping
Joao-L-S-Almeida Jan 6, 2025
8cb6d26
config for testing nondivisible images
Joao-L-S-Almeida Jan 6, 2025
cb62e56
minor adjusts
Joao-L-S-Almeida Jan 6, 2025
a0cac1c
minor adjusts
Joao-L-S-Almeida Jan 6, 2025
ca881b4
Input files to be used for testing the padding for non-divisible images
Joao-L-S-Almeida Jan 6, 2025
62fa305
minor changes
Joao-L-S-Almeida Jan 6, 2025
1c409e8
more tests
Joao-L-S-Almeida Jan 6, 2025
0d79f8e
merging
Joao-L-S-Almeida Jan 6, 2025
42c3d98
merging
Joao-L-S-Almeida Jan 6, 2025
fd1599f
merging
Joao-L-S-Almeida Jan 6, 2025
41af8f7
merging
Joao-L-S-Almeida Jan 6, 2025
dde31bb
merging
Joao-L-S-Almeida Jan 6, 2025
e04e53e
argument not used
Joao-L-S-Almeida Jan 6, 2025
8cb2ff9
merging
Joao-L-S-Almeida Jan 8, 2025
52 changes: 52 additions & 0 deletions examples/scripts/create_images.py
@@ -0,0 +1,52 @@
from argparse import ArgumentParser
import os
import random

import numpy as np
import tifffile as tiff
from osgeo import gdal, osr

parser = ArgumentParser()
parser.add_argument("--input_file")
parser.add_argument("--output_dir")
parser.add_argument("--n_copies", type=int, default=2)

args = parser.parse_args()
input_file = args.input_file
output_dir = args.output_dir
n_copies = args.n_copies

# Maximum padding (in pixels) added around each copy
pad_limit = 4

# GDAL config
GDAL_DATA_TYPE = gdal.GDT_Int32
GEOTIFF_DRIVER_NAME = "GTiff"
NO_DATA = 15
SPATIAL_REFERENCE_SYSTEM_WKID = 4326

for c in range(n_copies):
    pad = random.randint(1, pad_limit)
    filename = os.path.split(input_file)[-1]
    output_file = os.path.join(output_dir, filename.replace(".tif", f"_{c}.tif"))

    # Embed the original image in a zero-filled canvas `pad` pixels larger on
    # each side, so the copy's H and W are no longer divisible by the patch size.
    imarray = tiff.imread(input_file)
    im_shape = imarray.shape
    im_shape_ext = tuple(i + 2 * pad for i in im_shape[:-1]) + (im_shape[-1],)
    output = np.zeros(im_shape_ext)
    output[pad:-pad, pad:-pad, :] = imarray

    # Write the padded array out as a GeoTIFF.
    driver = gdal.GetDriverByName(GEOTIFF_DRIVER_NAME)
    output_raster = driver.Create(output_file,
                                  output.shape[1],
                                  output.shape[0],
                                  output.shape[-1],
                                  eType=GDAL_DATA_TYPE)
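The hunk as captured ends at driver.Create, leaving the raster unwritten. A plausible completion, assuming the usual GDAL band-write pattern implied by the NO_DATA and SPATIAL_REFERENCE_SYSTEM_WKID constants above (hypothetical, not recovered from the PR):

    # Hypothetical completion (not in the captured diff): stamp the CRS and
    # write each channel of the padded array into its own band.
    srs = osr.SpatialReference()
    srs.ImportFromEPSG(SPATIAL_REFERENCE_SYSTEM_WKID)
    output_raster.SetProjection(srs.ExportToWkt())
    for b in range(output.shape[-1]):
        band = output_raster.GetRasterBand(b + 1)
        band.SetNoDataValue(NO_DATA)
        band.WriteArray(output[:, :, b])
    output_raster.FlushCache()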

15 changes: 1 addition & 14 deletions terratorch/models/backbones/prithvi_vit.py
@@ -10,6 +10,7 @@
from terratorch.models.backbones.select_patch_embed_weights import select_patch_embed_weights
from terratorch.datasets.utils import generate_bands_intervals
from terratorch.models.backbones.prithvi_mae import PrithviViT, PrithviMAE
from terratorch.models.utils import pad_images

logger = logging.getLogger(__name__)

@@ -153,20 +154,6 @@ def checkpoint_filter_fn_mae(

return state_dict


def pad_images(imgs: Tensor,patch_size: int, padding:str) -> Tensor:
p = patch_size
# h, w = imgs.shape[3], imgs.shape[4]
t, h, w = imgs.shape[-3:]
h_pad, w_pad = (p - h % p) % p, (p - w % p) % p # Ensure padding is within bounds
if h_pad > 0 or w_pad > 0:
imgs = torch.stack([
nn.functional.pad(img, (0, w_pad, 0, h_pad), mode=padding)
for img in imgs # Apply per image to avoid NotImplementedError from torch.nn.functional.pad
])
return imgs


def _create_prithvi(
variant: str,
pretrained: bool = False, # noqa: FBT001, FBT002
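For reference, the relocated helper pads only the bottom/right of the spatial dimensions up to the next multiple of the patch size. A minimal sketch of its effect, assuming the move to terratorch.models.utils preserves the signature shown in the removed hunk:

import torch

from terratorch.models.utils import pad_images

imgs = torch.randn(2, 6, 1, 50, 50)  # (B, C, T, H, W); 50 is not divisible by 16
padded = pad_images(imgs, 16, "constant")
print(padded.shape)  # torch.Size([2, 6, 1, 64, 64]): 50 is padded up to 4 * 16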
6 changes: 3 additions & 3 deletions terratorch/models/backbones/select_patch_embed_weights.py
@@ -1,6 +1,5 @@
# Copyright contributors to the Terratorch project


import logging
import warnings

@@ -13,7 +12,8 @@
def patch_embed_weights_are_compatible(model_patch_embed: torch.Tensor, checkpoint_patch_embed: torch.Tensor) -> bool:
# check all dimensions are the same except for channel dimension
if len(model_patch_embed.shape) != len(checkpoint_patch_embed.shape):
return False
return False

model_shape = [model_patch_embed.shape[i] for i in range(len(model_patch_embed.shape)) if i != 1]
checkpoint_shape = [checkpoint_patch_embed.shape[i] for i in range(len(checkpoint_patch_embed.shape)) if i != 1]
return model_shape == checkpoint_shape
@@ -82,5 +82,5 @@ def select_patch_embed_weights(
)

state_dict[patch_embed_proj_weight_key] = temp_weight

return state_dict
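The indentation fix above does not change the logic: compatibility ignores only the channel dimension (dim 1). A quick sketch, assuming the function stays importable from this module:

import torch

from terratorch.models.backbones.select_patch_embed_weights import patch_embed_weights_are_compatible

model_w = torch.zeros(1024, 6, 16, 16)   # (embed_dim, in_chans, pH, pW)
ckpt_w = torch.zeros(1024, 3, 16, 16)    # differs only in in_chans -> compatible
assert patch_embed_weights_are_compatible(model_w, ckpt_w)

ckpt_bad = torch.zeros(1024, 3, 14, 14)  # patch size differs -> incompatible
assert not patch_embed_weights_are_compatible(model_w, ckpt_bad)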
31 changes: 30 additions & 1 deletion terratorch/models/clay_model_factory.py
@@ -1,6 +1,7 @@
import importlib
import sys
from collections.abc import Callable
import logging

import timm
import torch
@@ -122,6 +123,26 @@ def build_model(

backbone_kwargs, kwargs = extract_prefix_keys(kwargs, "backbone_")

# Get parameters needed for input padding and output cropping.
if "patch_size" in backbone_kwargs:
    patch_size = backbone_kwargs["patch_size"]
else:
    # The model may still work when the config and the input image sizes
    # are already consistent, but without an explicit patch_size any shape
    # mismatch found during execution cannot be corrected.
    patch_size = None

if "img_size" in backbone_kwargs:
    img_size = backbone_kwargs["img_size"]
else:
    # Likewise, img_size is needed to crop outputs back to the expected
    # size whenever padding was applied; without it no corrective
    # cropping can be performed.
    img_size = None

# Trying to find the model on HuggingFace.
try:
backbone: nn.Module = timm.create_model(
@@ -157,7 +178,7 @@
head_kwargs["num_classes"] = num_classes
if aux_decoders is None:
return _build_appropriate_model(
task, backbone, decoder, head_kwargs, prepare_features_for_image_model, rescale=rescale
task, backbone, decoder, head_kwargs, prepare_features_for_image_model, patch_size=patch_size, img_size=img_size, rescale=rescale
)

to_be_aux_decoders: list[AuxiliaryHeadWithDecoderWithoutInstantiatedHead] = []
@@ -186,6 +207,8 @@
decoder,
head_kwargs,
prepare_features_for_image_model,
patch_size=patch_size,
img_size=img_size,
rescale=rescale,
auxiliary_heads=to_be_aux_decoders,
)
@@ -197,6 +220,8 @@ def _build_appropriate_model(
decoder: nn.Module,
head_kwargs: dict,
prepare_features_for_image_model: Callable,
patch_size: int | None = None,
img_size: int | None = None,
rescale: bool = True, # noqa: FBT001, FBT002
auxiliary_heads: dict | None = None,
):
@@ -206,6 +231,8 @@ def _build_appropriate_model(
backbone,
decoder,
head_kwargs,
patch_size=patch_size,
img_size=img_size,
rescale=rescale,
auxiliary_heads=auxiliary_heads,
)
@@ -215,6 +242,8 @@ def _build_appropriate_model(
backbone,
decoder,
head_kwargs,
patch_size=patch_size,
img_size=img_size,
auxiliary_heads=auxiliary_heads,
)

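The patch_size/img_size lookup above relies on extract_prefix_keys having already stripped the backbone_ prefix. A small sketch of that mechanic (the import path is an assumption, not confirmed by the diff):

from terratorch.models.model import extract_prefix_keys  # assumed location

kwargs = {"backbone_patch_size": 8, "backbone_img_size": 224, "decoder_channels": 256}
backbone_kwargs, rest = extract_prefix_keys(kwargs, "backbone_")
# backbone_kwargs == {"patch_size": 8, "img_size": 224}
# rest == {"decoder_channels": 256}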
34 changes: 33 additions & 1 deletion terratorch/models/encoder_decoder_factory.py
@@ -2,7 +2,7 @@


import warnings

import logging
from torch import nn

from terratorch.models.model import (
@@ -65,6 +65,8 @@ def _check_all_args_used(kwargs):
msg = f"arguments {kwargs} were passed but not used."
raise ValueError(msg)

def _get_argument_from_instance(model, name):
return getattr(model._timm_module.patch_embed, name)[-1]

@MODEL_FACTORY_REGISTRY.register
class EncoderDecoderFactory(ModelFactory):
@@ -128,6 +130,26 @@ def build_model(
backbone_kwargs, kwargs = extract_prefix_keys(kwargs, "backbone_")
backbone = _get_backbone(backbone, **backbone_kwargs)

# Get parameters needed for input padding and output cropping.
if "patch_size" in backbone_kwargs:
    patch_size = backbone_kwargs["patch_size"]
else:
    # The model may still work when the config and the input image sizes
    # are already consistent, but without an explicit patch_size any shape
    # mismatch found during execution cannot be corrected.
    patch_size = None

if "img_size" in backbone_kwargs:
    img_size = backbone_kwargs["img_size"]
else:
    # Likewise, img_size is needed to crop outputs back to the expected
    # size whenever padding was applied; without it no corrective
    # cropping can be performed.
    img_size = None

if peft_config is not None:
if not backbone_kwargs.get("pretrained", False):
msg = (
@@ -166,6 +188,8 @@
backbone,
decoder,
head_kwargs,
patch_size=patch_size,
img_size=img_size,
necks=neck_list,
decoder_includes_head=decoder_includes_head,
rescale=rescale,
@@ -191,6 +215,8 @@
backbone,
decoder,
head_kwargs,
patch_size=patch_size,
img_size=img_size,
necks=neck_list,
decoder_includes_head=decoder_includes_head,
rescale=rescale,
@@ -203,6 +229,8 @@ def _build_appropriate_model(
backbone: nn.Module,
decoder: nn.Module,
head_kwargs: dict,
patch_size: int | None,
img_size: int | None,
decoder_includes_head: bool = False,
necks: list[Neck] | None = None,
rescale: bool = True, # noqa: FBT001, FBT002
@@ -218,6 +246,8 @@ def _build_appropriate_model(
backbone,
decoder,
head_kwargs,
patch_size=patch_size,
img_size=img_size,
decoder_includes_head=decoder_includes_head,
neck=neck_module,
rescale=rescale,
@@ -229,6 +259,8 @@ def _build_appropriate_model(
backbone,
decoder,
head_kwargs,
patch_size=patch_size,
img_size=img_size,
decoder_includes_head=decoder_includes_head,
neck=neck_module,
auxiliary_heads=auxiliary_heads,
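From the user's side, the new parameters travel through the factory as prefixed backbone kwargs. A hedged usage sketch (the backbone and decoder names are illustrative, and the exact set of required kwargs may differ):

from terratorch.models import EncoderDecoderFactory  # assumed public import path

factory = EncoderDecoderFactory()
model = factory.build_model(
    task="segmentation",
    backbone="prithvi_eo_v2_300",  # illustrative backbone name
    decoder="FCNDecoder",          # illustrative decoder name
    backbone_patch_size=16,        # forwarded as patch_size for input padding
    backbone_img_size=224,         # forwarded as img_size for output cropping
    num_classes=2,
)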
45 changes: 39 additions & 6 deletions terratorch/models/pixel_wise_model.py
@@ -1,13 +1,14 @@
# Copyright contributors to the Terratorch project

import logging
import torch
import torch.nn.functional as F # noqa: N812
import torchvision.transforms as transforms
from segmentation_models_pytorch.base import SegmentationModel
from torch import nn

from terratorch.models.heads import RegressionHead, SegmentationHead
from terratorch.models.model import AuxiliaryHeadWithDecoderWithoutInstantiatedHead, Model, ModelOutput

from terratorch.models.utils import pad_images

def freeze_module(module: nn.Module):
for param in module.parameters():
@@ -26,6 +27,8 @@ def __init__(
encoder: nn.Module,
decoder: nn.Module,
head_kwargs: dict,
patch_size: int | None = None,
img_size: int | None = None,
decoder_includes_head: bool = False,
auxiliary_heads: list[AuxiliaryHeadWithDecoderWithoutInstantiatedHead] | None = None,
neck: nn.Module | None = None,
@@ -69,6 +72,8 @@ def __init__(

self.neck = neck
self.rescale = rescale
self.patch_size = patch_size
self.img_size = (img_size, img_size)

def freeze_encoder(self):
freeze_module(self.encoder)
@@ -77,9 +82,31 @@ def freeze_decoder(self):
freeze_module(self.decoder)
freeze_module(self.head)

# TODO: do this properly
def check_input_shape(self, x: torch.Tensor) -> bool: # noqa: ARG002
return True
def check_input_shape(self, x: torch.Tensor) -> torch.Tensor:

    if self.patch_size:
        # Only the spatial dimensions (H, W) must be divisible by the
        # patch size; pad_images pads just those two.
        if all(i % self.patch_size == 0 for i in x.shape[-2:]):
            return x
        return pad_images(x, self.patch_size, "constant")
    else:
        # If patch_size is not provided, the user must guarantee that the
        # dataset is properly configured for the model being used.
        return x

def _crop_image_when_necessary(self, x: torch.Tensor, size: tuple) -> torch.Tensor:

    if all(self.img_size):
        # Crop the (possibly padded) output back to the configured size.
        return transforms.CenterCrop(self.img_size)(x)
    else:
        logging.getLogger("terratorch").info(
            "Cropping may be needed to adjust the output size; define `img_size` "
            "in your config file if you get a shape mismatch."
        )
        return x

@staticmethod
def _check_for_single_channel_and_squeeze(x):
Expand All @@ -89,7 +116,7 @@ def _check_for_single_channel_and_squeeze(x):

def forward(self, x: torch.Tensor, **kwargs) -> ModelOutput:
"""Sequentially pass `x` through model`s encoder, decoder and heads"""
self.check_input_shape(x)

if isinstance(x, torch.Tensor):
input_size = x.shape[-2:]
elif "image_size" in kwargs:
@@ -99,6 +126,9 @@ def forward(self, x: torch.Tensor, **kwargs) -> ModelOutput:
input_size = list(x.values())[0].shape[-2:]
else:
raise ValueError('Could not infer input shape.')

# TODO make this verification optional to avoid unnecessary repetition
x = self.check_input_shape(x)
features = self.encoder(x, **kwargs)

## only for backwards compatibility with pre-neck times.
@@ -114,13 +144,16 @@ def forward(self, x: torch.Tensor, **kwargs) -> ModelOutput:
if self.rescale and mask.shape[-2:] != input_size:
mask = F.interpolate(mask, size=input_size, mode="bilinear")
mask = self._check_for_single_channel_and_squeeze(mask)

aux_outputs = {}
for name, decoder in self.aux_heads.items():
aux_output = decoder([f.clone() for f in features])
if self.rescale and aux_output.shape[-2:] != input_size:
aux_output = F.interpolate(aux_output, size=input_size, mode="bilinear")
aux_output = self._check_for_single_channel_and_squeeze(aux_output)
aux_outputs[name] = aux_output

mask = self._crop_image_when_necessary(mask, input_size)
return ModelOutput(output=mask, auxiliary_heads=aux_outputs)
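Putting the two hooks together, PixelWiseModel now pads non-divisible inputs before the encoder and center-crops the decoded mask afterwards. A standalone sketch of the shape arithmetic (plain torch, with an identity stand-in for the network):

import torch
import torch.nn.functional as F
import torchvision.transforms as transforms

patch_size, img_size = 16, 224
x = torch.randn(1, 6, 230, 230)      # 230 is not divisible by 16

# check_input_shape: pad bottom/right to the next multiple of patch_size.
h, w = x.shape[-2:]
h_pad = (patch_size - h % patch_size) % patch_size
w_pad = (patch_size - w % patch_size) % patch_size
x_padded = F.pad(x, (0, w_pad, 0, h_pad), mode="constant")
print(x_padded.shape)                # torch.Size([1, 6, 240, 240])

mask = x_padded                      # identity stand-in for encoder/decoder/head

# _crop_image_when_necessary: center-crop back to the configured img_size.
mask = transforms.CenterCrop((img_size, img_size))(mask)
print(mask.shape)                    # torch.Size([1, 6, 224, 224])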

def _get_head(self, task: str, input_embed_dim: int, head_kwargs):