From 64714366fe757afcdab5ccd9124e01df3050c135 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Delhome?=
Date: Wed, 6 May 2020 12:58:54 +0200
Subject: [PATCH 1/2] io: include every output path in the
 prepare_output_folder() function

---
 CHANGELOG.md                      |  1 +
 deeposlandia/datasets/__init__.py |  2 +-
 deeposlandia/inference.py         | 13 ++---
 deeposlandia/postprocess.py       | 82 ++++++++-----------------------
 deeposlandia/train.py             | 20 +++-----
 deeposlandia/utils.py             | 45 +++++++++++------
 tests/test_path.py                | 19 ++++---
 7 files changed, 75 insertions(+), 107 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e5d7ec38..8257352f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,7 @@ Les sections conserveront leur nom en anglais.
 
 ### Changed
+- `utils.prepare_output_folder()` now returns a dictionary of all useful output paths
 - Some dependency updates (Tensorflow, opencv, pillow, keras, daiquiri)
 - The preprocessing has been modified for geographic datasets: `-t`, `-v` and
   `-T` now refer to raw images, the amount of preprocessed tiles being obtained
   by a combination
diff --git a/deeposlandia/datasets/__init__.py b/deeposlandia/datasets/__init__.py
index f10a8a4a..4b5cb252 100644
--- a/deeposlandia/datasets/__init__.py
+++ b/deeposlandia/datasets/__init__.py
@@ -16,7 +16,7 @@
 from osgeo import gdal
 from PIL import Image
 
-from deeposlandia import geometries, utils
+from deeposlandia import geometries
 
 logger = daiquiri.getLogger(__name__)
diff --git a/deeposlandia/inference.py b/deeposlandia/inference.py
index f5ae15c0..44f89639 100644
--- a/deeposlandia/inference.py
+++ b/deeposlandia/inference.py
@@ -167,9 +167,8 @@ def predict(
             "Please generate a valid dataset before calling the program."
         )
 
-    output_folder = utils.prepare_output_folder(datapath, dataset, problem)
-    instance_filename = "best-instance-" + str(model_input_size) + ".json"
-    instance_path = os.path.join(output_folder, instance_filename)
+    output_folder = utils.prepare_output_folder(datapath, dataset, model_input_size, problem)
+    instance_path = output_folder["best-instance"]
     dropout, network = utils.recover_instance(instance_path)
     model = init_model(
         problem,
@@ -179,13 +178,11 @@ def predict(
         dropout,
         network,
     )
-    checkpoint_filename = "best-model-" + str(model_input_size) + ".h5"
-    checkpoint_full_path = os.path.join(output_folder, checkpoint_filename)
-    if os.path.isfile(checkpoint_full_path):
-        model.load_weights(checkpoint_full_path)
+    if os.path.isfile(output_folder["best-model"]):
+        model.load_weights(output_folder["best-model"])
         logger.info(
             "Model weights have been recovered from %s",
-            checkpoint_full_path,
+            output_folder["best-model"],
         )
     else:
         logger.info(
diff --git a/deeposlandia/postprocess.py b/deeposlandia/postprocess.py
index a8ffb2a1..236c080e 100644
--- a/deeposlandia/postprocess.py
+++ b/deeposlandia/postprocess.py
@@ -25,19 +25,15 @@
 logger = daiquiri.getLogger(__name__)
 
 
-def get_image_paths(datapath, dataset, image_size, image_basename):
+def get_image_paths(testing_folder, image_basename):
     """Returns a list with the path of every image that belongs to the
     `dataset`, preprocessed in `image_size`-pixelled images, that were
     extracted from an original image named as `image_basename`.
 
     Parameters
     ----------
-    datapath : str
-        Path of the data on the file system
-    dataset : str
-        Name of the dataset
-    image_size : int
-        Size of preprocessed images, in pixels
+    testing_folder : str
+        Path of the testing image folder
     image_basename : str
         Original image name
 
@@ -46,15 +42,7 @@ def get_image_paths(
     Returns
     -------
     list
        List of image full paths
     """
-    image_raw_paths = os.path.join(
-        datapath,
-        dataset,
-        "preprocessed",
-        str(image_size),
-        "testing",
-        "images",
-        image_basename + "*",
-    )
+    image_raw_paths = os.path.join(testing_folder, "images", image_basename + "*")
     return [glob.glob(f) for f in [image_raw_paths]][0]
@@ -121,16 +109,14 @@ def get_labels(datapath, dataset, tile_size):
     return [l for l in test_config["labels"] if l["is_evaluate"]]
 
 
-def get_trained_model(datapath, dataset, image_size, nb_labels):
+def get_trained_model(model_filepath, image_size, nb_labels):
     """Recover model weights stored on the file system, and assign them into
     the `model` structure
 
     Parameters
     ----------
-    datapath : str
-        Path of the data on the file system
-    dataset : str
-        Name of the dataset
+    model_filepath : str
+        Path of the model on the file system
     image_size : int
         Image size, in pixels (height=width)
     nb_labels : int
@@ -150,16 +136,9 @@ def get_trained_model(
         architecture="unet",
     )
     model = Model(net.X, net.Y)
-    output_folder = utils.prepare_output_folder(
-        datapath, dataset, "semseg"
-    )
-    checkpoint_filename = "best-model-" + str(image_size) + ".h5"
-    checkpoint_full_path = os.path.join(output_folder, checkpoint_filename)
-    if os.path.isfile(checkpoint_full_path):
-        model.load_weights(checkpoint_full_path)
-        logger.info(
-            "Model weights have been recovered from %s" % checkpoint_full_path
-        )
+    if os.path.isfile(model_filepath):
+        model.load_weights(model_filepath)
+        logger.info("Model weights have been recovered from %s" % model_filepath)
     else:
         logger.info(
             (
@@ -392,6 +371,7 @@ def get_image_features(datapath, dataset, filename):
 
 
 def main(args):
+    logger.info("Postprocess %s...", args.image_basename)
     features = get_image_features(
         args.datapath, args.dataset, args.image_basename
     )
@@ -399,16 +379,18 @@ def main(args):
     img_width, img_height = features["width"], features["height"]
     logger.info("Raw image size: %s, %s" % (img_width, img_height))
 
-    image_paths = get_image_paths(
-        args.datapath, args.dataset, args.image_size, args.image_basename
-    )
+    prepro_folder = utils.prepare_preprocessed_folder(args.datapath, args.dataset, args.image_size)
+    image_paths = get_image_paths(prepro_folder["testing"], args.image_basename)
     logger.info("The image will be splitted into %s tiles" % len(image_paths))
     images = extract_images(image_paths)
     coordinates = extract_coordinates_from_filenames(image_paths)
     labels = get_labels(args.datapath, args.dataset, args.image_size)
 
+    output_folder = utils.prepare_output_folder(
+        args.datapath, args.dataset, args.image_size, "semseg"
+    )
     model = get_trained_model(
-        args.datapath, args.dataset, args.image_size, len(labels)
+        output_folder["best-model"], args.image_size, len(labels)
     )
 
     logger.info("Predict labels for input images...")
@@ -429,16 +411,8 @@ def main(args):
     colored_data = draw_grid(
         colored_data, img_width, img_height, args.image_size
     )
-    predicted_label_folder = os.path.join(
-        args.datapath,
-        args.dataset,
-        "output",
-        "semseg",
-        "predicted_labels"
-    )
-    os.makedirs(predicted_label_folder, exist_ok=True)
     predicted_label_file = os.path.join(
-        predicted_label_folder,
+        output_folder["labels"],
         args.image_basename + "_" + str(args.image_size) + ".png",
     )
     Image.fromarray(colored_data).save(predicted_label_file)
@@ -449,16 +423,8 @@ def main(args):
     gdf = gpd.GeoDataFrame(
         {"labels": vectorized_labels, "geometry": vectorized_data}
     )
-    predicted_geom_folder = os.path.join(
-        args.datapath,
-        args.dataset,
-        "output",
-        "semseg",
-        "predicted_geometries",
-    )
-    os.makedirs(predicted_geom_folder, exist_ok=True)
     predicted_geom_file = os.path.join(
-        predicted_geom_folder,
+        output_folder["geometries"],
         args.image_basename + "_" + str(args.image_size) + ".geojson",
     )
     if os.path.isfile(predicted_geom_file):
@@ -473,16 +439,8 @@ def main(args):
     colored_raster_data = draw_grid(
         colored_raster_data, img_width, img_height, args.image_size
     )
-    predicted_raster_folder = os.path.join(
-        args.datapath,
-        args.dataset,
-        "output",
-        "semseg",
-        "predicted_rasters",
-    )
-    os.makedirs(predicted_raster_folder, exist_ok=True)
     predicted_raster_file = os.path.join(
-        predicted_raster_folder,
+        output_folder["rasters"],
         args.image_basename + "_" + str(args.image_size) + ".png",
     )
     Image.fromarray(colored_raster_data).save(predicted_raster_file)
diff --git a/deeposlandia/train.py b/deeposlandia/train.py
index 295ab72d..6a7346e7 100644
--- a/deeposlandia/train.py
+++ b/deeposlandia/train.py
@@ -251,6 +251,9 @@ def run_model(
 def main(args):
     # Grid search
     model_output = []
+    output_folder = utils.prepare_output_folder(
+        args.datapath, args.dataset, args.image_size, args.model
+    )
     for batch_size in args.batch_size:
         logger.info("Generating data with batch of %s images...", batch_size)
         # Data generator building
@@ -283,16 +286,14 @@ def main(args):
             learning_rate_decay,
         ]
         instance_name = utils.list_to_str(instance_args, "_")
-        output_folder = utils.prepare_output_folder(
-            args.datapath, args.dataset, args.model, instance_name
-        )
+        instance_folder = os.path.join(output_folder["checkpoints"], instance_name)
         # Model running
         model_output.append(
             run_model(
                 train_gen,
                 valid_gen,
                 args.model,
-                output_folder,
+                instance_folder,
                 instance_name,
                 args.image_size,
                 nb_labels,
@@ -309,15 +310,8 @@ def main(args):
     best_instance = max(model_output, key=lambda x: x["val_acc"])
 
     # Save best model
-    output_folder = utils.prepare_output_folder(
-        args.datapath, args.dataset, args.model
-    )
-    instance_name = os.path.join(
-        output_folder,
-        "best-{}-" + str(args.image_size) + ".{}",
-    )
-    best_instance["model"].save(instance_name.format("model", "h5"))
-    with open(instance_name.format("instance", "json"), "w") as fobj:
+    best_instance["model"].save(output_folder["best-model"])
+    with open(output_folder["best-instance"], "w") as fobj:
         json.dump(
             {
                 key: best_instance[key]
diff --git a/deeposlandia/utils.py b/deeposlandia/utils.py
index ba6613e1..bf49799f 100644
--- a/deeposlandia/utils.py
+++ b/deeposlandia/utils.py
@@ -9,6 +9,8 @@
 import pandas as pd
 from PIL import Image
 
+from deeposlandia.datasets import GEOGRAPHIC_DATASETS
+
 logger = daiquiri.getLogger(__name__)
@@ -186,7 +188,7 @@ def prepare_preprocessed_folder(
     }
 
 
-def prepare_output_folder(datapath, dataset, model, instance_name=None):
+def prepare_output_folder(datapath, dataset, image_size, model):
     """Dataset and repository management; create and return the dataset
     output path
 
@@ -196,27 +198,38 @@ def prepare_output_folder(
     Parameters
     ----------
     datapath : str
         Data root directory, contain all used the datasets
     dataset : str
         Dataset name, *e.g.* `mapillary` or `shapes`
+    image_size : int
+        Size of the considered images (height and width are equal)
     model : str
-        Research problem that is tackled, *e.g.* `feature_detection` or
-        `semantic_segmentation`
-    instance_name : str
-        Instance name, used to create the accurate output folders
+        Research problem that is tackled, *e.g.* `featdet` or `semseg`
 
     Returns
     -------
-    str
-        Dataset output path
+    dict
+        Dataset output paths
     """
-    if instance_name is not None:
-        output_folder = os.path.join(
-            datapath, dataset, "output", model, "checkpoints", instance_name
-        )
-    else:
-        output_folder = os.path.join(
-            datapath, dataset, "output", model, "checkpoints"
-        )
+    output_folder = os.path.join(datapath, dataset, "output", model)
     os.makedirs(output_folder, exist_ok=True)
-    return output_folder
+    checkpoint_folder = os.path.join(output_folder, "checkpoints")
+    os.makedirs(checkpoint_folder, exist_ok=True)
+    best_model_filename = "best-model-" + str(image_size) + ".h5"
+    best_instance_filename = "best-instance-" + str(image_size) + ".json"
+    label_folder = os.path.join(output_folder, "predicted_labels")
+    os.makedirs(label_folder, exist_ok=True)
+    geometry_folder = raster_folder = None
+    if dataset in GEOGRAPHIC_DATASETS:
+        geometry_folder = os.path.join(output_folder, "predicted_geometries")
+        raster_folder = os.path.join(output_folder, "predicted_rasters")
+        os.makedirs(geometry_folder, exist_ok=True)
+        os.makedirs(raster_folder, exist_ok=True)
+    return {
+        "best-model": os.path.join(checkpoint_folder, best_model_filename),
+        "best-instance": os.path.join(checkpoint_folder, best_instance_filename),
+        "checkpoints": checkpoint_folder,
+        "geometries": geometry_folder,
+        "labels": label_folder,
+        "rasters": raster_folder,
+    }
 
 
 def recover_instance(instance_path):
diff --git a/tests/test_path.py b/tests/test_path.py
index 92e1b755..c2d68c3c 100644
--- a/tests/test_path.py
+++ b/tests/test_path.py
@@ -127,16 +127,21 @@ def test_output_folder(datapath_repo):
     """
     datapath = str(datapath_repo)
     dataset = "shapes"
+    image_size = 100
     model = "feature_detection"
-    instance_name = "test_instance"
-    prepare_output_folder(datapath, dataset, model, instance_name)
-    assert os.path.isdir(os.path.join(datapath, dataset, "output"))
-    assert os.path.isdir(os.path.join(datapath, dataset, "output", model))
+    output_folder = prepare_output_folder(datapath, dataset, image_size, model)
+    assert len(output_folder.keys()) == 6
     assert os.path.isdir(
         os.path.join(datapath, dataset, "output", model, "checkpoints")
     )
     assert os.path.isdir(
-        os.path.join(
-            datapath, dataset, "output", model, "checkpoints", instance_name
-        )
+        os.path.join(datapath, dataset, "output", model, "predicted_labels")
+    )
+    dataset = "aerial"
+    prepare_output_folder(datapath, dataset, image_size, model)
+    assert os.path.isdir(
+        os.path.join(datapath, dataset, "output", model, "predicted_geometries")
+    )
+    assert os.path.isdir(
+        os.path.join(datapath, dataset, "output", model, "predicted_rasters")
     )

From bbd34ed14529aed0800b87450c109e691dd91eb1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Delhome?=
Date: Wed, 6 May 2020 19:42:40 +0200
Subject: [PATCH 2/2] tests: fix broken tests

---
 deeposlandia/datasets/shapes.py |  2 +-
 deeposlandia/geometries.py      |  4 ++--
 tests/conftest.py               |  5 +++++
 tests/test_dataset.py           |  4 ++++
 tests/test_postprocess.py       | 18 +++++++++++++-----
 5 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/deeposlandia/datasets/shapes.py b/deeposlandia/datasets/shapes.py
index 0d69c189..b425b9cb 100644
--- a/deeposlandia/datasets/shapes.py
+++ b/deeposlandia/datasets/shapes.py
@@ -98,7 +98,7 @@ def populate(
         output_dir=None,
         input_dir=None,
         nb_images=10000,
-        nb_tiles_per_images=None,
+        nb_tiles_per_image=None,
         aggregate=False,
         labelling=True,
         buf=8,
diff --git a/deeposlandia/geometries.py b/deeposlandia/geometries.py
index be6f7269..8b716dfe 100644
--- a/deeposlandia/geometries.py
+++ b/deeposlandia/geometries.py
@@ -10,7 +10,7 @@
 import cv2
 import daiquiri
-import fiona
+from fiona.crs import from_epsg
 import geopandas as gpd
 import numpy as np
 import shapely.geometry as shgeom
@@ -265,7 +265,7 @@ def extract_tile_items(
         raster_features, min_x, min_y, tile_width, tile_height
     )
     bdf = gpd.GeoDataFrame(
-        crs=fiona.crs.from_epsg(raster_features["srid"]), geometry=[area]
+        crs=from_epsg(raster_features["srid"]), geometry=[area]
     )
     reproj_labels = labels.to_crs(epsg=raster_features["srid"])
     tile_items = gpd.sjoin(reproj_labels, bdf)
diff --git a/tests/conftest.py b/tests/conftest.py
index 8ceed8ce..77fe8853 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -170,6 +170,11 @@ def aerial_nb_images():
     return 1
 
 
+@pytest.fixture
+def nb_tiles_per_image():
+    return 10
+
+
 @pytest.fixture
 def aerial_nb_output_training_images():
     return 10
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
index 7e67086c..60fef1d0 100644
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@@ -145,6 +145,7 @@ def test_aerial_training_dataset_population(
     aerial_training_temp_dir,
     aerial_raw_sample,
     aerial_nb_images,
+    nb_tiles_per_image,
     aerial_training_config,
     aerial_nb_labels,
     aerial_nb_output_training_images,
@@ -156,6 +157,7 @@ def test_aerial_training_dataset_population(
         str(aerial_training_temp_dir),
         aerial_raw_sample,
         nb_images=aerial_nb_images,
+        nb_tiles_per_image=nb_tiles_per_image,
     )
     d.save(str(aerial_training_config))
     assert d.get_nb_labels() == aerial_nb_labels
@@ -223,6 +225,7 @@ def test_tanzania_training_dataset_population(
     tanzania_training_temp_dir,
     tanzania_raw_sample,
     tanzania_nb_images,
+    nb_tiles_per_image,
     tanzania_training_config,
     tanzania_nb_labels,
     tanzania_nb_output_training_images,
@@ -234,6 +237,7 @@ def test_tanzania_training_dataset_population(
         str(tanzania_training_temp_dir),
         tanzania_raw_sample,
         nb_images=tanzania_nb_images,
+        nb_tiles_per_image=nb_tiles_per_image,
     )
     d.save(str(tanzania_training_config))
     assert d.get_nb_labels() == tanzania_nb_labels
diff --git a/tests/test_postprocess.py b/tests/test_postprocess.py
index 82659d6f..86f93d5b 100644
--- a/tests/test_postprocess.py
+++ b/tests/test_postprocess.py
@@ -1,6 +1,8 @@
 """Unit tests dedicated to predicted label postprocessing
 """
 
+import os
+
 import numpy as np
 import pytest
 
@@ -13,7 +15,7 @@ def test_get_image_paths(tanzania_image_size):
     """
     Preprocessed image filenames must end with ".png"
     """
     filenames = postprocess.get_image_paths(
-        "./tests/data", "tanzania", tanzania_image_size, "grid_066"
+        f"./tests/data/tanzania/preprocessed/{tanzania_image_size}/testing/", "tanzania_sample"
     )
     assert np.all([f.endswith(".png") for f in filenames])
@@ -28,7 +30,7 @@ def test_extract_images(
     3).
""" filenames = postprocess.get_image_paths( - "./tests/data", "tanzania", tanzania_image_size, "tanzania_sample" + f"./tests/data/tanzania/preprocessed/{tanzania_image_size}/testing/", "tanzania_sample" ) images = postprocess.extract_images(filenames) assert len(images.shape) == 4 @@ -64,7 +66,9 @@ def test_get_trained_model(tanzania_image_size, tanzania_nb_labels): + 4 are related to pooling """ model = postprocess.get_trained_model( - "./tests.data", "tanzania", tanzania_image_size, tanzania_nb_labels + "./tests/data/tanzania/output/semseg/checkpoints/", + tanzania_image_size, + tanzania_nb_labels ) assert model.input_shape[1:] == ( tanzania_image_size, @@ -233,12 +237,16 @@ def test_build_full_labelled_image( datapath = "./tests/data" dataset = "tanzania" image_paths = postprocess.get_image_paths( - datapath, dataset, tanzania_image_size, "tanzania_sample" + os.path.join(datapath, dataset, "preprocessed", str(tanzania_image_size), "testing"), + "tanzania_sample" ) images = postprocess.extract_images(image_paths) coordinates = postprocess.extract_coordinates_from_filenames(image_paths) + model_filename = f"best-model-{tanzania_image_size}.h5" model = postprocess.get_trained_model( - datapath, dataset, tanzania_image_size, tanzania_nb_labels + os.path.join(datapath, dataset, "output/semseg/checkpoints/", model_filename), + tanzania_image_size, + tanzania_nb_labels ) labelled_image = postprocess.build_full_labelled_image( images,