Skip to content
This repository has been archived by the owner on Jun 3, 2020. It is now read-only.

Commit

Permalink
Merge pull request #101 from Oslandia/tanzania_datagen
Browse files Browse the repository at this point in the history
Tanzania datagen
  • Loading branch information
delhomer authored Jan 24, 2019
2 parents d8e613a + 8c95603 commit ceef570
Show file tree
Hide file tree
Showing 5 changed files with 237 additions and 86 deletions.
305 changes: 229 additions & 76 deletions deeposlandia/datasets/tanzania.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,76 @@ def __init__(self, img_size):
color=self.FOUNDATION_COLOR, is_evaluate=True)


def _generate_preprocessed_filenames(
        self, image_filename, output_dir, x, y, suffix=""
):
    """Generate preprocessed image and label filenames on the file system,
    starting from a raw image filename

    The generated file name is
    ``<raw basename>_<size>_<size>_<x>_<y>[_<suffix>].png``, where ``size``
    is ``self.image_size``. The image path lies in the ``images`` subfolder
    of `output_dir`, the label path in its ``labels`` subfolder.

    Parameters
    ----------
    image_filename : str
        Original image filename
    output_dir : str
        Output folder for preprocessed material
    x : int
        Extracted image west coordinates
    y : int
        Extracted image north coordinates
    suffix : str
        Preprocessed filename complement; when empty (the default), no
        complement is appended to the file name

    Returns
    -------
    dict
        Preprocessed image (key ``"image"``) and corresponding label (key
        ``"labels"``) filenames
    """
    raw_basename = os.path.splitext(os.path.basename(image_filename))[0]
    name_parts = [
        raw_basename,
        str(self.image_size),
        str(self.image_size),
        str(x),
        str(y),
    ]
    # The suffix is optional so that callers which do not produce several
    # variants of the same tile can omit it without leaving a dangling "_"
    if suffix:
        name_parts.append(suffix)
    new_filename = "_".join(name_parts) + ".png"
    out_image_name = os.path.join(output_dir, 'images', new_filename)
    # Build the label path explicitly rather than substituting "images" in
    # the image path: a substitution would also rewrite any other
    # occurrence of "images" in `output_dir` or in the raw file name
    out_label_name = os.path.join(output_dir, 'labels', new_filename)
    return {"image": out_image_name, "labels": out_label_name}


def _serialize(
        self, tile_image, labelled_image, label_dict,
        image_filename, output_dir, x, y, suffix
):
    """Write a tile and its labelled counterpart on the file system

    The destination paths are provided by
    `_generate_preprocessed_filenames`; the tile is saved first, then its
    labelled version.

    Parameters
    ----------
    tile_image : PIL.Image
        Tile extracted from the original raster
    labelled_image : PIL.Image
        Labelled version of the tile
    label_dict : dict
        Labels associated to the tile, returned as is in the output
    image_filename : str
        Original (raw) image filename
    output_dir : str
        Output folder for preprocessed material
    x : int
        Tile west coordinate
    y : int
        Tile north coordinate
    suffix : str
        Preprocessed filename complement

    Returns
    -------
    dict
        Information related to the serialized tile (raw filename, image and
        label file paths, encountered labels)
    """
    paths = self._generate_preprocessed_filenames(
        image_filename, output_dir, x, y, suffix
    )
    image_path = paths["image"]
    label_path = paths["labels"]
    outputs = ((tile_image, image_path), (labelled_image, label_path))
    for picture, destination in outputs:
        picture.save(destination)
    return dict(
        raw_filename=image_filename,
        image_filename=image_path,
        label_filename=label_path,
        labels=label_dict,
    )


def _preprocess_tile(self, x, y, image_filename, output_dir,
raster, labels=None):
"""Preprocess one single tile built from `image_filename`, with respect
Expand All @@ -94,51 +164,24 @@ def _preprocess_tile(self, x, y, image_filename, output_dir,
Key/values with the filenames and label ids
"""
basename_decomp = os.path.splitext(
os.path.basename(image_filename))
img_id_str = (str(self.image_size) + '_'
+ str(self.image_size) + '_'
+ str(x) + '_' + str(y))
new_in_filename = (basename_decomp[0] + '_'
+ img_id_str + ".png")
new_in_path = os.path.join(output_dir, 'images',
new_in_filename)
gdal.Translate(new_in_path, raster,
dirs = self._generate_preprocessed_filenames(
image_filename, output_dir, x, y
)
gdal.Translate(dirs["image"], raster,
format="PNG",
srcWin=[x, y, self.image_size, self.image_size])
if not labels is None:
raster_features = get_image_features(raster)
tile_items = extract_tile_items(raster_features, labels,
x, y,
self.image_size,
self.image_size,
tile_srid=32737)
out_labelname = (new_in_path
.replace("images", "labels"))
mask = self.load_mask(tile_items, raster_features, x, y)
label_dict = utils.build_labels(mask,
range(self.get_nb_labels()),
"tanzania")
labelled_image = utils.build_image_from_config(mask,
self.labels)
labelled_image.save(out_labelname)
return {"raw_filename": image_filename,
"image_filename": new_in_path,
"label_filename": out_labelname,
"labels": label_dict}
else:
return {"raw_filename": image_filename,
"image_filename": new_in_path}
return {"raw_filename": image_filename,
"image_filename": dirs["image"]}


def _preprocess(self, image_filename, output_dir, labelling):
def _preprocess_for_inference(self, image_filename, output_dir):
"""Resize/crop then save the training & label images
Parameters
----------
image_filename : str
Full path towards the image on the disk
datadir : str
output_dir : str
Output path where preprocessed image must be saved
Returns
Expand All @@ -150,29 +193,121 @@ def _preprocess(self, image_filename, output_dir, labelling):
raw_img_width = raster.RasterXSize
raw_img_height = raster.RasterYSize
result_dicts = []
logger.info("Raw image size: %s, %s" % (raw_img_width, raw_img_height))
logger.info("Image filename: %s" % image_filename)

labels = None
if labelling:
label_filename = (image_filename
.replace("images", "labels")
.replace(".tif", ".geojson"))
labels = gpd.read_file(label_filename)
labels = labels.loc[~labels.geometry.isna(), ["condition", "geometry"]]
none_mask = [lc is None for lc in labels.condition]
labels.loc[none_mask, "condition"] = "Complete"
logger.info("Raw image size: %s, %s" % (raw_img_width, raw_img_height))

for x in range(0, raw_img_width, self.image_size):
for y in range(0, raw_img_height, self.image_size):
tile_results = self._preprocess_tile(x, y, image_filename,
output_dir,
raster, labels)
output_dir, raster)
result_dicts.append(tile_results)
del raster
return result_dicts


def _preprocess_for_training(self, image_filename, output_dir, nb_images):
    """Randomly extract labelled tiles from a raw raster and save them

    Tiles of ``self.image_size`` pixels are picked at random positions in
    the raster, until `nb_images` tiles have been saved or ``2 *
    nb_images`` attempts have been done. A tile that contains labelled
    items is saved four times (as is, and flipped horizontally, vertically
    and in both directions, with suffixes "nw", "ne", "sw" and "se"), so
    the final amount of images may exceed `nb_images` by up to three. A
    tile without any item is saved once, as long as fewer than ``0.1 *
    nb_images`` empty tiles have been saved.

    The labels are read from the geojson file deduced from
    `image_filename` by replacing "images" with "labels" and ".tif" with
    ".geojson"; items without geometry are dropped and missing conditions
    are set to "Complete".

    Parameters
    ----------
    image_filename : str
        Full path towards the image on the disk
    output_dir : str
        Output path where preprocessed image must be saved
    nb_images : int
        Amount of tiles to generate from the raster

    Returns
    -------
    list of dict
        One dict per saved tile, with the filenames and label ids
    """
    raster = gdal.Open(image_filename)
    raw_img_width = raster.RasterXSize
    raw_img_height = raster.RasterYSize
    image_data = raster.ReadAsArray()
    # Swap first and last axes so that the array is sliced with (x, y)
    # offsets below (assumes a band-first array -- TODO confirm)
    image_data = np.swapaxes(image_data, 0, 2)
    # The raster features do not depend on the tile: compute them once
    raster_features = get_image_features(raster)
    result_dicts = []
    logger.info("Image filename: %s", image_filename)
    logger.info("Raw image size: %s, %s", raw_img_width, raw_img_height)

    label_filename = (image_filename
                      .replace("images", "labels")
                      .replace(".tif", ".geojson"))
    labels = gpd.read_file(label_filename)
    labels = labels.loc[~labels.geometry.isna(), ["condition", "geometry"]]
    none_mask = [lc is None for lc in labels.condition]
    labels.loc[none_mask, "condition"] = "Complete"

    nb_attempts = 0
    image_counter = 0
    empty_image_counter = 0
    while image_counter < nb_images and nb_attempts < 2 * nb_images:
        # Randomly pick a tile; the upper bound is exclusive, hence the
        # "+ 1" that makes the last valid offset reachable (and supports a
        # raster which is exactly one tile wide or high)
        x = np.random.randint(0, raw_img_width - self.image_size + 1)
        y = np.random.randint(0, raw_img_height - self.image_size + 1)

        tile_data = image_data[x:(x + self.image_size),
                               y:(y + self.image_size)]
        tile_image = Image.fromarray(tile_data)
        tile_items = extract_tile_items(raster_features, labels,
                                        x, y,
                                        self.image_size,
                                        self.image_size,
                                        tile_srid=32737)
        mask = self.load_mask(tile_items, raster_features, x, y)
        label_dict = utils.build_labels(mask,
                                        range(self.get_nb_labels()),
                                        "tanzania")
        labelled_image = utils.build_image_from_config(mask, self.labels)
        if len(tile_items) > 0:
            # Data augmentation: save the tile and its three flipped
            # versions, each of them sharing the same label dictionary
            tile_image_sw = tile_image.transpose(Image.FLIP_TOP_BOTTOM)
            labelled_image_sw = labelled_image.transpose(
                Image.FLIP_TOP_BOTTOM
            )
            variants = [
                ("nw", tile_image, labelled_image),
                ("ne",
                 tile_image.transpose(Image.FLIP_LEFT_RIGHT),
                 labelled_image.transpose(Image.FLIP_LEFT_RIGHT)),
                ("sw", tile_image_sw, labelled_image_sw),
                ("se",
                 tile_image_sw.transpose(Image.FLIP_LEFT_RIGHT),
                 labelled_image_sw.transpose(Image.FLIP_LEFT_RIGHT)),
            ]
            for suffix, variant_image, variant_labels in variants:
                tiled_results = self._serialize(
                    variant_image, variant_labels, label_dict,
                    image_filename, output_dir, x, y, suffix
                )
                result_dicts.append(tiled_results)
                image_counter += 1
        elif empty_image_counter < 0.1 * nb_images:
            # Keep a limited share of empty tiles in the dataset
            tiled_results = self._serialize(
                tile_image, labelled_image, label_dict,
                image_filename, output_dir, x, y, "nw"
            )
            result_dicts.append(tiled_results)
            image_counter += 1
            empty_image_counter += 1
        nb_attempts += 1
    del raster
    logger.info("Generate %s images after %s attempts.",
                image_counter, nb_attempts)
    return result_dicts


def populate(self, output_dir, input_dir, nb_images=None,
aggregate=False, labelling=True):
""" Populate the dataset with images contained into `datadir` directory
Expand All @@ -195,15 +330,24 @@ class method genericity
image_list = os.listdir(os.path.join(input_dir, "images"))
image_list_longname = [os.path.join(input_dir, "images", l)
for l in image_list
if not l.startswith('.')][:nb_images]
if not l.startswith('.')]
nb_image_files = len(image_list_longname)

logger.info("Getting %s images to preprocess..."
% len(image_list_longname))
% nb_image_files)
logger.info(image_list_longname)
with Pool() as p:
self.image_info = p.starmap(self._preprocess,
[(x, output_dir, labelling)
for x in image_list_longname])
if labelling:
nb_tile_per_image = int(nb_images/nb_image_files)
with Pool(processes=3) as p:
self.image_info = p.starmap(self._preprocess_for_training,
[(x, output_dir, nb_tile_per_image)
for x in image_list_longname])
else:
with Pool(processes=3) as p:
self.image_info = p.starmap(self._preprocess_for_inference,
[(x, output_dir)
for x in image_list_longname])

self.image_info = [item for sublist in self.image_info
for item in sublist]
logger.info("Saved %s images in the preprocessed dataset."
Expand Down Expand Up @@ -242,36 +386,45 @@ def load_mask(self, buildings, raster_features, min_x, min_y):
if buildings.shape[0] == 0:
return mask
for idx, row in buildings.iterrows():
points = self.extract_points_from_polygon(row["geometry"],
raster_features)
points[:, 0] -= min_x
points[:, 1] -= min_y
points = extract_points_from_polygon(row["geometry"],
raster_features,
min_x, min_y)
label_id = [label["id"] for label in self.labels
if label["name"] == row["condition"].lower()][0]
mask = cv2.fillPoly(mask, [points], label_id)
return mask


def extract_points_from_polygon(self, p, features):
"""Extract points from a polygon
def extract_points_from_polygon(p, features, min_x, min_y):
"""Extract points from a polygon
Parameters
----------
p : shapely.geometry.Polygon
Polygon to detail
features : dict
Geographical features associated to the image
Returns
-------
np.array
Polygon vertices
Parameters
----------
p : shapely.geometry.Polygon
Polygon to detail
features : dict
Geographical features associated to the image
min_x : int
Minimal x-coordinate (west)
min_y : int
Minimal y-coordinate (north)
Returns
-------
np.array
Polygon vertices
"""
raw_xs, raw_ys = p.exterior.xy
xs = get_x_pixel(raw_xs, features["east"], features["west"], features["width"])
ys = get_y_pixel(raw_ys, features["south"], features["north"], features["height"])
points = np.array([[x, y] for x, y in zip(xs, ys)], dtype=np.int32)
return points
"""
raw_xs, raw_ys = p.exterior.xy
xs = get_x_pixel(
raw_xs, features["east"], features["west"], features["width"]
)
ys = get_y_pixel(
raw_ys, features["south"], features["north"], features["height"]
)
points = np.array([[y, x] for x, y in zip(xs, ys)], dtype=np.int32)
points[:, 0] -= min_y
points[:, 1] -= min_x
return points


def get_x_pixel(coord, east, west, width):
Expand Down
3 changes: 1 addition & 2 deletions deeposlandia/paramoptim.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,8 +292,7 @@ def run_model(train_generator, validation_generator, dl_model, output_folder,
save_weights_only=False,
mode='auto', period=1)
terminate_on_nan = callbacks.TerminateOnNaN()
earlystop = callbacks.EarlyStopping(monitor='val_acc',
min_delta=0.001,
earlystop = callbacks.EarlyStopping(monitor='val_loss',
patience=10,
verbose=1,
mode='max')
Expand Down
3 changes: 1 addition & 2 deletions deeposlandia/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,8 +241,7 @@ def add_training_arguments(parser):
save_weights_only=False,
mode='auto', period=1)
terminate_on_nan = callbacks.TerminateOnNaN()
earlystop = callbacks.EarlyStopping(monitor='val_acc',
min_delta=0.001,
earlystop = callbacks.EarlyStopping(monitor='val_loss',
patience=10,
verbose=1,
mode='max')
Expand Down
2 changes: 1 addition & 1 deletion deeposlandia/webapp/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def recover_image_info(dataset, filename):
elif dataset == "aerial":
size_aggregation = "250_full"
elif dataset == "tanzania":
size_aggregation = "384_full"
size_aggregation = "512_full"
elif dataset == "shapes":
size_aggregation = "64_full"
else:
Expand Down
Loading

0 comments on commit ceef570

Please sign in to comment.