This repository has been archived by the owner on Jun 3, 2020. It is now read-only.

Merge pull request #158 from Oslandia/rde-refactor-output-folder-management

Include every output path in the prepare_output_folder() function
delhomer authored May 14, 2020
2 parents 8b8108e + bbd34ed commit 1cf5142
Showing 12 changed files with 100 additions and 115 deletions.
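
In short, `utils.prepare_output_folder()` no longer returns a single checkpoint directory: it now takes the image size as an extra argument and returns a dictionary of ready-made output paths, which the inference, post-processing and training modules consume directly. A minimal before/after sketch, reconstructed from the diff below (variable names are illustrative):

# Before this commit: callers rebuilt file names by hand
output_folder = utils.prepare_output_folder(datapath, dataset, "semseg")
checkpoint_path = os.path.join(output_folder, "best-model-" + str(image_size) + ".h5")

# After this commit: every useful path comes straight out of the helper
output_folder = utils.prepare_output_folder(datapath, dataset, image_size, "semseg")
checkpoint_path = output_folder["best-model"]
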
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -14,6 +14,7 @@ Les sections conserveront leur nom en anglais.

### Changed

- `utils.prepare_output_folder()` returns now a dictionary of all useful output paths
- Some dependency updates (Tensorflow, opencv, pillow, keras, daiquiri)
- The preprocessing has been modified for geographic datasets: `-t`, `-v` and `-T` now
refer to raw images, the amount of preprocessed tiles being obtained by a combination
2 changes: 1 addition & 1 deletion deeposlandia/datasets/__init__.py
@@ -16,7 +16,7 @@
from osgeo import gdal
from PIL import Image

from deeposlandia import geometries, utils
from deeposlandia import geometries

logger = daiquiri.getLogger(__name__)

2 changes: 1 addition & 1 deletion deeposlandia/datasets/shapes.py
@@ -98,7 +98,7 @@ def populate(
output_dir=None,
input_dir=None,
nb_images=10000,
nb_tiles_per_images=None,
nb_tiles_per_image=None,
aggregate=False,
labelling=True,
buf=8,
4 changes: 2 additions & 2 deletions deeposlandia/geometries.py
@@ -10,7 +10,7 @@

import cv2
import daiquiri
import fiona
from fiona.crs import from_epsg
import geopandas as gpd
import numpy as np
import shapely.geometry as shgeom
@@ -265,7 +265,7 @@ def extract_tile_items(
raster_features, min_x, min_y, tile_width, tile_height
)
bdf = gpd.GeoDataFrame(
crs=fiona.crs.from_epsg(raster_features["srid"]), geometry=[area]
crs=from_epsg(raster_features["srid"]), geometry=[area]
)
reproj_labels = labels.to_crs(epsg=raster_features["srid"])
tile_items = gpd.sjoin(reproj_labels, bdf)
13 changes: 5 additions & 8 deletions deeposlandia/inference.py
@@ -167,9 +167,8 @@ def predict(
"Please generate a valid dataset before calling the program."
)

output_folder = utils.prepare_output_folder(datapath, dataset, problem)
instance_filename = "best-instance-" + str(model_input_size) + ".json"
instance_path = os.path.join(output_folder, instance_filename)
output_folder = utils.prepare_output_folder(datapath, dataset, model_input_size, problem)
instance_path = output_folder["best-instance"]
dropout, network = utils.recover_instance(instance_path)
model = init_model(
problem,
@@ -179,13 +178,11 @@
dropout,
network,
)
checkpoint_filename = "best-model-" + str(model_input_size) + ".h5"
checkpoint_full_path = os.path.join(output_folder, checkpoint_filename)
if os.path.isfile(checkpoint_full_path):
model.load_weights(checkpoint_full_path)
if os.path.isfile(output_folder["best-model"]):
model.load_weights(output_folder["best-model"])
logger.info(
"Model weights have been recovered from %s",
checkpoint_full_path,
output_folder["best-model"],
)
else:
logger.info(
82 changes: 20 additions & 62 deletions deeposlandia/postprocess.py
@@ -25,19 +25,15 @@
logger = daiquiri.getLogger(__name__)


def get_image_paths(datapath, dataset, image_size, image_basename):
def get_image_paths(testing_folder, image_basename):
"""Returns a list with the path of every image that belongs to the
`dataset`, preprocessed in `image_size`-pixelled images, that were
extracted from an original image named as `image_basename`.
Parameters
----------
datapath : str
Path of the data on the file system
dataset : str
Name of the dataset
image_size : int
Size of preprocessed images, in pixels
testing_folder : str
Path of the testing image folder
image_basename : str
Original image name
@@ -46,15 +42,7 @@ def get_image_paths(datapath, dataset, image_size, image_basename):
list
List of image full paths
"""
image_raw_paths = os.path.join(
datapath,
dataset,
"preprocessed",
str(image_size),
"testing",
"images",
image_basename + "*",
)
image_raw_paths = os.path.join(testing_folder, "images", image_basename + "*")
return [glob.glob(f) for f in [image_raw_paths]][0]
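
A hedged usage sketch of the slimmed-down helper, mirroring the call added to `main()` further down; the "testing" key comes from the dictionary returned by `utils.prepare_preprocessed_folder()`, and the basename is illustrative:

prepro_folder = utils.prepare_preprocessed_folder(datapath, dataset, image_size)
# collect every preprocessed tile cut from the raw test image "some_raw_image"
image_paths = get_image_paths(prepro_folder["testing"], "some_raw_image")
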


@@ -121,16 +109,14 @@ def get_labels(datapath, dataset, tile_size):
return [l for l in test_config["labels"] if l["is_evaluate"]]


def get_trained_model(datapath, dataset, image_size, nb_labels):
def get_trained_model(model_filepath, image_size, nb_labels):
"""Recover model weights stored on the file system, and assign them into
the `model` structure
Parameters
----------
datapath : str
Path of the data on the file system
dataset : str
Name of the dataset
model_filepath : str
Path of the model on the file system
image_size : int
Image size, in pixels (height=width)
nb_labels : int
@@ -150,16 +136,9 @@ def get_trained_model(datapath, dataset, image_size, nb_labels):
architecture="unet",
)
model = Model(net.X, net.Y)
output_folder = utils.prepare_output_folder(
datapath, dataset, "semseg"
)
checkpoint_filename = "best-model-" + str(image_size) + ".h5"
checkpoint_full_path = os.path.join(output_folder, checkpoint_filename)
if os.path.isfile(checkpoint_full_path):
model.load_weights(checkpoint_full_path)
logger.info(
"Model weights have been recovered from %s" % checkpoint_full_path
)
if os.path.isfile(model_filepath):
model.load_weights(model_filepath)
logger.info("Model weights have been recovered from %s" % model_filepath)
else:
logger.info(
(
@@ -392,23 +371,26 @@ def get_image_features(datapath, dataset, filename):

def main(args):

logger.info("Postprocess %s...", args.image_basename)
features = get_image_features(
args.datapath, args.dataset, args.image_basename
)

img_width, img_height = features["width"], features["height"]
logger.info("Raw image size: %s, %s" % (img_width, img_height))

image_paths = get_image_paths(
args.datapath, args.dataset, args.image_size, args.image_basename
)
prepro_folder = utils.prepare_preprocessed_folder(args.datapath, args.dataset, args.image_size)
image_paths = get_image_paths(prepro_folder["testing"], args.image_basename)
logger.info("The image will be splitted into %s tiles" % len(image_paths))
images = extract_images(image_paths)
coordinates = extract_coordinates_from_filenames(image_paths)
labels = get_labels(args.datapath, args.dataset, args.image_size)

output_folder = utils.prepare_output_folder(
args.datapath, args.dataset, args.image_size, "semseg"
)
model = get_trained_model(
args.datapath, args.dataset, args.image_size, len(labels)
output_folder["best-model"], args.image_size, len(labels)
)

logger.info("Predict labels for input images...")
@@ -429,16 +411,8 @@ def main(args):
colored_data = draw_grid(
colored_data, img_width, img_height, args.image_size
)
predicted_label_folder = os.path.join(
args.datapath,
args.dataset,
"output",
"semseg",
"predicted_labels"
)
os.makedirs(predicted_label_folder, exist_ok=True)
predicted_label_file = os.path.join(
predicted_label_folder,
output_folder["labels"],
args.image_basename + "_" + str(args.image_size) + ".png",
)
Image.fromarray(colored_data).save(predicted_label_file)
Expand All @@ -449,16 +423,8 @@ def main(args):
gdf = gpd.GeoDataFrame(
{"labels": vectorized_labels, "geometry": vectorized_data}
)
predicted_geom_folder = os.path.join(
args.datapath,
args.dataset,
"output",
"semseg",
"predicted_geometries",
)
os.makedirs(predicted_geom_folder, exist_ok=True)
predicted_geom_file = os.path.join(
predicted_geom_folder,
output_folder["geometries"],
args.image_basename + "_" + str(args.image_size) + ".geojson",
)
if os.path.isfile(predicted_geom_file):
Expand All @@ -473,16 +439,8 @@ def main(args):
colored_raster_data = draw_grid(
colored_raster_data, img_width, img_height, args.image_size
)
predicted_raster_folder = os.path.join(
args.datapath,
args.dataset,
"output",
"semseg",
"predicted_rasters",
)
os.makedirs(predicted_raster_folder, exist_ok=True)
predicted_raster_file = os.path.join(
predicted_raster_folder,
output_folder["rasters"],
args.image_basename + "_" + str(args.image_size) + ".png",
)
Image.fromarray(colored_raster_data).save(predicted_raster_file)
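
With these changes, the three prediction artefacts are written into folders taken straight from the path dictionary. A hedged sketch of the resulting layout, assuming `aerial` is one of the geographic datasets and using illustrative values:

output_folder = utils.prepare_output_folder(datapath, "aerial", 256, "semseg")
# <datapath>/aerial/output/semseg/predicted_labels/<basename>_256.png
# <datapath>/aerial/output/semseg/predicted_geometries/<basename>_256.geojson
# <datapath>/aerial/output/semseg/predicted_rasters/<basename>_256.png
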
20 changes: 7 additions & 13 deletions deeposlandia/train.py
@@ -251,6 +251,9 @@ def run_model(
def main(args):
# Grid search
model_output = []
output_folder = utils.prepare_output_folder(
args.datapath, args.dataset, args.image_size, args.model
)
for batch_size in args.batch_size:
logger.info("Generating data with batch of %s images...", batch_size)
# Data generator building
@@ -283,16 +286,14 @@ def main(args):
learning_rate_decay,
]
instance_name = utils.list_to_str(instance_args, "_")
output_folder = utils.prepare_output_folder(
args.datapath, args.dataset, args.model, instance_name
)
instance_folder = os.path.join(output_folder["checkpoints"], instance_name)
# Model running
model_output.append(
run_model(
train_gen,
valid_gen,
args.model,
output_folder,
instance_folder,
instance_name,
args.image_size,
nb_labels,
@@ -309,15 +310,8 @@
best_instance = max(model_output, key=lambda x: x["val_acc"])

# Save best model
output_folder = utils.prepare_output_folder(
args.datapath, args.dataset, args.model
)
instance_name = os.path.join(
output_folder,
"best-{}-" + str(args.image_size) + ".{}",
)
best_instance["model"].save(instance_name.format("model", "h5"))
with open(instance_name.format("instance", "json"), "w") as fobj:
best_instance["model"].save(output_folder["best-model"])
with open(output_folder["best-instance"], "w") as fobj:
json.dump(
{
key: best_instance[key]
45 changes: 29 additions & 16 deletions deeposlandia/utils.py
@@ -9,6 +9,8 @@
import pandas as pd
from PIL import Image

from deeposlandia.datasets import GEOGRAPHIC_DATASETS


logger = daiquiri.getLogger(__name__)

@@ -186,7 +188,7 @@ def prepare_preprocessed_folder(
}


def prepare_output_folder(datapath, dataset, model, instance_name=None):
def prepare_output_folder(datapath, dataset, image_size, model):
"""Dataset and repository management; create and return the dataset output
path
@@ -196,27 +198,38 @@
Data root directory, contain all used the datasets
dataset : str
Dataset name, *e.g.* `mapillary` or `shapes`
image_size : int
Size of the considered images (height and width are equal)
model : str
Research problem that is tackled, *e.g.* `feature_detection` or
`semantic_segmentation`
instance_name : str
Instance name, used to create the accurate output folders
Research problem that is tackled, *e.g.* `featdet` or `semseg`
Returns
-------
str
Dataset output path
dict
Dataset output paths
"""
if instance_name is not None:
output_folder = os.path.join(
datapath, dataset, "output", model, "checkpoints", instance_name
)
else:
output_folder = os.path.join(
datapath, dataset, "output", model, "checkpoints"
)
output_folder = os.path.join(datapath, dataset, "output", model)
os.makedirs(output_folder, exist_ok=True)
return output_folder
checkpoint_folder = os.path.join(output_folder, "checkpoints")
os.makedirs(checkpoint_folder, exist_ok=True)
best_model_filename = "best-model-" + str(image_size) + ".h5"
best_instance_filename = "best-instance-" + str(image_size) + ".json"
label_folder = os.path.join(output_folder, "predicted_labels")
os.makedirs(label_folder, exist_ok=True)
geometry_folder = raster_folder = None
if dataset in GEOGRAPHIC_DATASETS:
geometry_folder = os.path.join(output_folder, "predicted_geometries")
raster_folder = os.path.join(output_folder, "predicted_rasters")
os.makedirs(geometry_folder, exist_ok=True)
os.makedirs(raster_folder, exist_ok=True)
return {
"best-model": os.path.join(checkpoint_folder, best_model_filename),
"best-instance": os.path.join(checkpoint_folder, best_instance_filename),
"checkpoints": checkpoint_folder,
"geometries": geometry_folder,
"labels": label_folder,
"rasters": raster_folder,
}


def recover_instance(instance_path):
Expand Down
5 changes: 5 additions & 0 deletions tests/conftest.py
@@ -170,6 +170,11 @@ def aerial_nb_images():
return 1


@pytest.fixture
def nb_tiles_per_image():
return 10


@pytest.fixture
def aerial_nb_output_training_images():
return 10