From 64714366fe757afcdab5ccd9124e01df3050c135 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Delhome?=
Date: Wed, 6 May 2020 12:58:54 +0200
Subject: [PATCH 1/2] io: include every output path in the
 prepare_output_folder() function

---
 CHANGELOG.md                      |  1 +
 deeposlandia/datasets/__init__.py |  2 +-
 deeposlandia/inference.py         | 13 ++---
 deeposlandia/postprocess.py       | 82 ++++++++-----------------------
 deeposlandia/train.py             | 20 +++-----
 deeposlandia/utils.py             | 45 +++++++++++------
 tests/test_path.py                | 19 ++++---
 7 files changed, 75 insertions(+), 107 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e5d7ec38..8257352f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,7 @@ Les sections conserveront leur nom en anglais.
 
 ### Changed
+- `utils.prepare_output_folder()` now returns a dictionary of all useful output paths
 - Some dependency updates (Tensorflow, opencv, pillow, keras, daiquiri)
 - The preprocessing has been modified for geographic datasets: `-t`, `-v` and
   `-T` now refer to raw images, the amount of preprocessed tiles being obtained
   by a combination
diff --git a/deeposlandia/datasets/__init__.py b/deeposlandia/datasets/__init__.py
index f10a8a4a..4b5cb252 100644
--- a/deeposlandia/datasets/__init__.py
+++ b/deeposlandia/datasets/__init__.py
@@ -16,7 +16,7 @@
 from osgeo import gdal
 from PIL import Image
 
-from deeposlandia import geometries, utils
+from deeposlandia import geometries
 
 logger = daiquiri.getLogger(__name__)
diff --git a/deeposlandia/inference.py b/deeposlandia/inference.py
index f5ae15c0..44f89639 100644
--- a/deeposlandia/inference.py
+++ b/deeposlandia/inference.py
@@ -167,9 +167,8 @@ def predict(
             "Please generate a valid dataset before calling the program."
         )
 
-    output_folder = utils.prepare_output_folder(datapath, dataset, problem)
-    instance_filename = "best-instance-" + str(model_input_size) + ".json"
-    instance_path = os.path.join(output_folder, instance_filename)
+    output_folder = utils.prepare_output_folder(datapath, dataset, model_input_size, problem)
+    instance_path = output_folder["best-instance"]
     dropout, network = utils.recover_instance(instance_path)
     model = init_model(
         problem,
@@ -179,13 +178,11 @@ def predict(
         dropout,
         network,
     )
-    checkpoint_filename = "best-model-" + str(model_input_size) + ".h5"
-    checkpoint_full_path = os.path.join(output_folder, checkpoint_filename)
-    if os.path.isfile(checkpoint_full_path):
-        model.load_weights(checkpoint_full_path)
+    if os.path.isfile(output_folder["best-model"]):
+        model.load_weights(output_folder["best-model"])
         logger.info(
             "Model weights have been recovered from %s",
-            checkpoint_full_path,
+            output_folder["best-model"],
         )
     else:
         logger.info(
diff --git a/deeposlandia/postprocess.py b/deeposlandia/postprocess.py
index a8ffb2a1..236c080e 100644
--- a/deeposlandia/postprocess.py
+++ b/deeposlandia/postprocess.py
@@ -25,19 +25,15 @@
 logger = daiquiri.getLogger(__name__)
 
 
-def get_image_paths(datapath, dataset, image_size, image_basename):
+def get_image_paths(testing_folder, image_basename):
     """Returns a list with the path of every image that belongs to the
     `dataset`, preprocessed in `image_size`-pixelled images, that were
     extracted from an original image named as `image_basename`.
 
     Parameters
     ----------
-    datapath : str
-        Path of the data on the file system
-    dataset : str
-        Name of the dataset
-    image_size : int
-        Size of preprocessed images, in pixels
+    testing_folder : str
+        Path of the testing image folder
     image_basename : str
         Original image name
 
@@ -46,15 +42,7 @@ def get_image_paths(
     Returns
     -------
     list
        List of image full paths
     """
-    image_raw_paths = os.path.join(
-        datapath,
-        dataset,
-        "preprocessed",
-        str(image_size),
-        "testing",
-        "images",
-        image_basename + "*",
-    )
+    image_raw_paths = os.path.join(testing_folder, "images", image_basename + "*")
     return [glob.glob(f) for f in [image_raw_paths]][0]
@@ -121,16 +109,14 @@ def get_labels(datapath, dataset, tile_size):
     return [l for l in test_config["labels"] if l["is_evaluate"]]
 
 
-def get_trained_model(datapath, dataset, image_size, nb_labels):
+def get_trained_model(model_filepath, image_size, nb_labels):
     """Recover model weights stored on the file system, and assign them into
     the `model` structure
 
     Parameters
     ----------
-    datapath : str
-        Path of the data on the file system
-    dataset : str
-        Name of the dataset
+    model_filepath : str
+        Path of the model on the file system
     image_size : int
         Image size, in pixels (height=width)
     nb_labels : int
@@ -150,16 +136,9 @@ def get_trained_model(
         architecture="unet",
     )
     model = Model(net.X, net.Y)
-    output_folder = utils.prepare_output_folder(
-        datapath, dataset, "semseg"
-    )
-    checkpoint_filename = "best-model-" + str(image_size) + ".h5"
-    checkpoint_full_path = os.path.join(output_folder, checkpoint_filename)
-    if os.path.isfile(checkpoint_full_path):
-        model.load_weights(checkpoint_full_path)
-        logger.info(
-            "Model weights have been recovered from %s" % checkpoint_full_path
-        )
+    if os.path.isfile(model_filepath):
+        model.load_weights(model_filepath)
+        logger.info("Model weights have been recovered from %s" % model_filepath)
     else:
         logger.info(
             (
@@ -392,6 +371,7 @@ def get_image_features(datapath, dataset, filename):
 
 
 def main(args):
+    logger.info("Postprocess %s...", args.image_basename)
     features = get_image_features(
         args.datapath, args.dataset, args.image_basename
     )
@@ -399,16 +379,18 @@ def main(args):
     img_width, img_height = features["width"], features["height"]
     logger.info("Raw image size: %s, %s" % (img_width, img_height))
 
-    image_paths = get_image_paths(
-        args.datapath, args.dataset, args.image_size, args.image_basename
-    )
+    prepro_folder = utils.prepare_preprocessed_folder(args.datapath, args.dataset, args.image_size)
+    image_paths = get_image_paths(prepro_folder["testing"], args.image_basename)
     logger.info("The image will be splitted into %s tiles" % len(image_paths))
     images = extract_images(image_paths)
     coordinates = extract_coordinates_from_filenames(image_paths)
     labels = get_labels(args.datapath, args.dataset, args.image_size)
 
+    output_folder = utils.prepare_output_folder(
+        args.datapath, args.dataset, args.image_size, "semseg"
+    )
     model = get_trained_model(
-        args.datapath, args.dataset, args.image_size, len(labels)
+        output_folder["best-model"], args.image_size, len(labels)
     )
 
     logger.info("Predict labels for input images...")
@@ -429,16 +411,8 @@ def main(args):
     colored_data = draw_grid(
         colored_data, img_width, img_height, args.image_size
     )
-    predicted_label_folder = os.path.join(
-        args.datapath,
-        args.dataset,
-        "output",
-        "semseg",
-        "predicted_labels"
-    )
-    os.makedirs(predicted_label_folder, exist_ok=True)
     predicted_label_file = os.path.join(
-        predicted_label_folder,
+        output_folder["labels"],
         args.image_basename + "_" + str(args.image_size) + ".png",
     )
     Image.fromarray(colored_data).save(predicted_label_file)
@@ -449,16 +423,8 @@ def main(args):
     gdf = gpd.GeoDataFrame(
         {"labels": vectorized_labels, "geometry": vectorized_data}
     )
-    predicted_geom_folder = os.path.join(
-        args.datapath,
-        args.dataset,
-        "output",
-        "semseg",
-        "predicted_geometries",
-    )
-    os.makedirs(predicted_geom_folder, exist_ok=True)
     predicted_geom_file = os.path.join(
-        predicted_geom_folder,
+        output_folder["geometries"],
         args.image_basename + "_" + str(args.image_size) + ".geojson",
     )
     if os.path.isfile(predicted_geom_file):
@@ -473,16 +439,8 @@ def main(args):
     colored_raster_data = draw_grid(
         colored_raster_data, img_width, img_height, args.image_size
     )
-    predicted_raster_folder = os.path.join(
-        args.datapath,
-        args.dataset,
-        "output",
-        "semseg",
-        "predicted_rasters",
-    )
-    os.makedirs(predicted_raster_folder, exist_ok=True)
     predicted_raster_file = os.path.join(
-        predicted_raster_folder,
+        output_folder["rasters"],
         args.image_basename + "_" + str(args.image_size) + ".png",
     )
     Image.fromarray(colored_raster_data).save(predicted_raster_file)
diff --git a/deeposlandia/train.py b/deeposlandia/train.py
index 295ab72d..6a7346e7 100644
--- a/deeposlandia/train.py
+++ b/deeposlandia/train.py
@@ -251,6 +251,9 @@ def run_model(
 def main(args):
     # Grid search
     model_output = []
+    output_folder = utils.prepare_output_folder(
+        args.datapath, args.dataset, args.image_size, args.model
+    )
     for batch_size in args.batch_size:
         logger.info("Generating data with batch of %s images...", batch_size)
         # Data generator building
@@ -283,16 +286,14 @@ def main(args):
             learning_rate_decay,
         ]
         instance_name = utils.list_to_str(instance_args, "_")
-        output_folder = utils.prepare_output_folder(
-            args.datapath, args.dataset, args.model, instance_name
-        )
+        instance_folder = os.path.join(output_folder["checkpoints"], instance_name)
         # Model running
         model_output.append(
             run_model(
                 train_gen,
                 valid_gen,
                 args.model,
-                output_folder,
+                instance_folder,
                 instance_name,
                 args.image_size,
                 nb_labels,
@@ -309,15 +310,8 @@ def main(args):
     best_instance = max(model_output, key=lambda x: x["val_acc"])
 
     # Save best model
-    output_folder = utils.prepare_output_folder(
-        args.datapath, args.dataset, args.model
-    )
-    instance_name = os.path.join(
-        output_folder,
-        "best-{}-" + str(args.image_size) + ".{}",
-    )
-    best_instance["model"].save(instance_name.format("model", "h5"))
-    with open(instance_name.format("instance", "json"), "w") as fobj:
+    best_instance["model"].save(output_folder["best-model"])
+    with open(output_folder["best-instance"], "w") as fobj:
         json.dump(
             {
                 key: best_instance[key]
diff --git a/deeposlandia/utils.py b/deeposlandia/utils.py
index ba6613e1..bf49799f 100644
--- a/deeposlandia/utils.py
+++ b/deeposlandia/utils.py
@@ -9,6 +9,8 @@
 import pandas as pd
 from PIL import Image
 
+from deeposlandia.datasets import GEOGRAPHIC_DATASETS
+
 logger = daiquiri.getLogger(__name__)
@@ -186,7 +188,7 @@ def prepare_preprocessed_folder(
     }
 
 
-def prepare_output_folder(datapath, dataset, model, instance_name=None):
+def prepare_output_folder(datapath, dataset, image_size, model):
     """Dataset and repository management; create and return the dataset
     output path
 
@@ -196,27 +198,38 @@ def prepare_output_folder(
     Parameters
     ----------
     datapath : str
         Data root directory, contain all used the datasets
     dataset : str
         Dataset name, *e.g.* `mapillary` or `shapes`
+    image_size : int
+        Size of the considered images (height and width are equal)
     model : str
-        Research problem that is tackled, *e.g.* `feature_detection` or
-        `semantic_segmentation`
-    instance_name : str
-        Instance name, used to create the accurate output folders
+        Research problem that is tackled, *e.g.* `featdet` or `semseg`
 
     Returns
     -------
-    str
-        Dataset output path
+    dict
+        Dataset output paths
     """
-    if instance_name is not None:
-        output_folder = os.path.join(
-            datapath, dataset, "output", model, "checkpoints", instance_name
-        )
-    else:
-        output_folder = os.path.join(
-            datapath, dataset, "output", model, "checkpoints"
-        )
+    output_folder = os.path.join(datapath, dataset, "output", model)
     os.makedirs(output_folder, exist_ok=True)
-    return output_folder
+    checkpoint_folder = os.path.join(output_folder, "checkpoints")
+    os.makedirs(checkpoint_folder, exist_ok=True)
+    best_model_filename = "best-model-" + str(image_size) + ".h5"
+    best_instance_filename = "best-instance-" + str(image_size) + ".json"
+    label_folder = os.path.join(output_folder, "predicted_labels")
+    os.makedirs(label_folder, exist_ok=True)
+    geometry_folder = raster_folder = None
+    if dataset in GEOGRAPHIC_DATASETS:
+        geometry_folder = os.path.join(output_folder, "predicted_geometries")
+        raster_folder = os.path.join(output_folder, "predicted_rasters")
+        os.makedirs(geometry_folder, exist_ok=True)
+        os.makedirs(raster_folder, exist_ok=True)
+    return {
+        "best-model": os.path.join(checkpoint_folder, best_model_filename),
+        "best-instance": os.path.join(checkpoint_folder, best_instance_filename),
+        "checkpoints": checkpoint_folder,
+        "geometries": geometry_folder,
+        "labels": label_folder,
+        "rasters": raster_folder,
+    }
 
 
 def recover_instance(instance_path):
diff --git a/tests/test_path.py b/tests/test_path.py
index 92e1b755..c2d68c3c 100644
--- a/tests/test_path.py
+++ b/tests/test_path.py
@@ -127,16 +127,21 @@ def test_output_folder(datapath_repo):
     """
     datapath = str(datapath_repo)
     dataset = "shapes"
+    image_size = 100
     model = "feature_detection"
-    instance_name = "test_instance"
-    prepare_output_folder(datapath, dataset, model, instance_name)
-    assert os.path.isdir(os.path.join(datapath, dataset, "output"))
-    assert os.path.isdir(os.path.join(datapath, dataset, "output", model))
+    output_folder = prepare_output_folder(datapath, dataset, image_size, model)
+    assert len(output_folder.keys()) == 6
     assert os.path.isdir(
         os.path.join(datapath, dataset, "output", model, "checkpoints")
     )
     assert os.path.isdir(
-        os.path.join(
-            datapath, dataset, "output", model, "checkpoints", instance_name
-        )
+        os.path.join(datapath, dataset, "output", model, "predicted_labels")
+    )
+    dataset = "aerial"
+    prepare_output_folder(datapath, dataset, image_size, model)
+    assert os.path.isdir(
+        os.path.join(datapath, dataset, "output", model, "predicted_geometries")
+    )
+    assert os.path.isdir(
+        os.path.join(datapath, dataset, "output", model, "predicted_rasters")
     )

From bbd34ed14529aed0800b87450c109e691dd91eb1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Delhome?=
Date: Wed, 6 May 2020 19:42:40 +0200
Subject: [PATCH 2/2] tests: fix broken tests

---
 deeposlandia/datasets/shapes.py |  2 +-
 deeposlandia/geometries.py      |  4 ++--
 tests/conftest.py               |  5 +++++
 tests/test_dataset.py           |  4 ++++
 tests/test_postprocess.py       | 18 +++++++++++++-----
 5 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/deeposlandia/datasets/shapes.py b/deeposlandia/datasets/shapes.py
index 0d69c189..b425b9cb 100644
--- a/deeposlandia/datasets/shapes.py
+++ b/deeposlandia/datasets/shapes.py
@@ -98,7 +98,7 @@ def populate(
         output_dir=None,
         input_dir=None,
         nb_images=10000,
-        nb_tiles_per_images=None,
+        nb_tiles_per_image=None,
         aggregate=False,
         labelling=True,
         buf=8,
diff --git a/deeposlandia/geometries.py b/deeposlandia/geometries.py
index be6f7269..8b716dfe 100644
--- a/deeposlandia/geometries.py
+++ b/deeposlandia/geometries.py
@@ -10,7 +10,7 @@
 import cv2
 import daiquiri
-import fiona
+from fiona.crs import from_epsg
 import geopandas as gpd
 import numpy as np
 import shapely.geometry as shgeom
@@ -265,7 +265,7 @@ def extract_tile_items(
         raster_features, min_x, min_y, tile_width, tile_height
     )
     bdf = gpd.GeoDataFrame(
-        crs=fiona.crs.from_epsg(raster_features["srid"]), geometry=[area]
+        crs=from_epsg(raster_features["srid"]), geometry=[area]
     )
     reproj_labels = labels.to_crs(epsg=raster_features["srid"])
     tile_items = gpd.sjoin(reproj_labels, bdf)
diff --git a/tests/conftest.py b/tests/conftest.py
index 8ceed8ce..77fe8853 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -170,6 +170,11 @@ def aerial_nb_images():
     return 1
 
 
+@pytest.fixture
+def nb_tiles_per_image():
+    return 10
+
+
 @pytest.fixture
 def aerial_nb_output_training_images():
     return 10
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
index 7e67086c..60fef1d0 100644
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@@ -145,6 +145,7 @@ def test_aerial_training_dataset_population(
     aerial_training_temp_dir,
     aerial_raw_sample,
     aerial_nb_images,
+    nb_tiles_per_image,
     aerial_training_config,
     aerial_nb_labels,
     aerial_nb_output_training_images,
@@ -156,6 +157,7 @@ def test_aerial_training_dataset_population(
         str(aerial_training_temp_dir),
         aerial_raw_sample,
         nb_images=aerial_nb_images,
+        nb_tiles_per_image=nb_tiles_per_image,
     )
     d.save(str(aerial_training_config))
     assert d.get_nb_labels() == aerial_nb_labels
@@ -223,6 +225,7 @@ def test_tanzania_training_dataset_population(
     tanzania_training_temp_dir,
     tanzania_raw_sample,
     tanzania_nb_images,
+    nb_tiles_per_image,
     tanzania_training_config,
     tanzania_nb_labels,
     tanzania_nb_output_training_images,
@@ -234,6 +237,7 @@ def test_tanzania_training_dataset_population(
         str(tanzania_training_temp_dir),
         tanzania_raw_sample,
         nb_images=tanzania_nb_images,
+        nb_tiles_per_image=nb_tiles_per_image,
     )
     d.save(str(tanzania_training_config))
     assert d.get_nb_labels() == tanzania_nb_labels
diff --git a/tests/test_postprocess.py b/tests/test_postprocess.py
index 82659d6f..86f93d5b 100644
--- a/tests/test_postprocess.py
+++ b/tests/test_postprocess.py
@@ -1,6 +1,8 @@
 """Unit tests dedicated to predicted label postprocessing
 """
 
+import os
+
 import numpy as np
 import pytest
 
@@ -13,7 +15,7 @@ def test_get_image_paths(tanzania_image_size):
     """
     Preprocessed image filenames must end with ".png"
     """
     filenames = postprocess.get_image_paths(
-        "./tests/data", "tanzania", tanzania_image_size, "grid_066"
+        f"./tests/data/tanzania/preprocessed/{tanzania_image_size}/testing/", "tanzania_sample"
     )
     assert np.all([f.endswith(".png") for f in filenames])
@@ -28,7 +30,7 @@ def test_extract_images(
     3).
""" filenames = postprocess.get_image_paths( - "./tests/data", "tanzania", tanzania_image_size, "tanzania_sample" + f"./tests/data/tanzania/preprocessed/{tanzania_image_size}/testing/", "tanzania_sample" ) images = postprocess.extract_images(filenames) assert len(images.shape) == 4 @@ -64,7 +66,9 @@ def test_get_trained_model(tanzania_image_size, tanzania_nb_labels): + 4 are related to pooling """ model = postprocess.get_trained_model( - "./tests.data", "tanzania", tanzania_image_size, tanzania_nb_labels + "./tests/data/tanzania/output/semseg/checkpoints/", + tanzania_image_size, + tanzania_nb_labels ) assert model.input_shape[1:] == ( tanzania_image_size, @@ -233,12 +237,16 @@ def test_build_full_labelled_image( datapath = "./tests/data" dataset = "tanzania" image_paths = postprocess.get_image_paths( - datapath, dataset, tanzania_image_size, "tanzania_sample" + os.path.join(datapath, dataset, "preprocessed", str(tanzania_image_size), "testing"), + "tanzania_sample" ) images = postprocess.extract_images(image_paths) coordinates = postprocess.extract_coordinates_from_filenames(image_paths) + model_filename = f"best-model-{tanzania_image_size}.h5" model = postprocess.get_trained_model( - datapath, dataset, tanzania_image_size, tanzania_nb_labels + os.path.join(datapath, dataset, "output/semseg/checkpoints/", model_filename), + tanzania_image_size, + tanzania_nb_labels ) labelled_image = postprocess.build_full_labelled_image( images,