From f2475f87c0e1c6829fb8d0c567ac5a9948be615f Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Thu, 7 Mar 2024 12:00:43 -0900 Subject: [PATCH 01/13] Update to_points() functionality and add load_mask from rasterio --- examples/handling/interface/topoints.py | 2 +- geoutils/raster/raster.py | 178 ++++++++++++++++++------ tests/test_raster.py | 20 +-- 3 files changed, 147 insertions(+), 53 deletions(-) diff --git a/examples/handling/interface/topoints.py b/examples/handling/interface/topoints.py index f7e5dfc6..fded01f6 100644 --- a/examples/handling/interface/topoints.py +++ b/examples/handling/interface/topoints.py @@ -21,7 +21,7 @@ # %% # We convert the raster to points. By default, this returns a vector with columb geometry burned. -pts_rast = rast.to_points() +pts_rast = rast.to_pointcloud() pts_rast # %% diff --git a/geoutils/raster/raster.py b/geoutils/raster/raster.py index 1d092276..598b4f2a 100644 --- a/geoutils/raster/raster.py +++ b/geoutils/raster/raster.py @@ -192,6 +192,7 @@ def _default_nodata(dtype: DTypeLike) -> int: def _load_rio( dataset: rio.io.DatasetReader, + only_mask: bool = False, indexes: int | list[int] | None = None, masked: bool = False, transform: Affine | None = None, @@ -205,6 +206,7 @@ def _load_rio( Ensure that ``self.data.ndim=3`` for ease of use (needed e.g. in show). :param dataset: Dataset to read (opened with :func:`rasterio.open`). + :param only_mask: Read only the dataset mask. :param indexes: Band(s) to load. Note that rasterio begins counting at 1, not 0. :param masked: Whether the mask should be read (if any exists) to use the nodata to mask values. :param transform: Create a window from the given transform (to read only parts of the raster) @@ -242,12 +244,17 @@ def _load_rio( window = None if indexes is None: - data = dataset.read(masked=masked, window=window, **kwargs) + if only_mask: + data = dataset.read_masks(masked=masked, window=window, **kwargs) + else: + data = dataset.read(masked=masked, window=window, **kwargs) else: - data = dataset.read(indexes=indexes, masked=masked, window=window, **kwargs) + if only_mask: + data = dataset.read_masks(indexes=indexes, masked=masked, window=window, **kwargs) + else: + data = dataset.read(indexes=indexes, masked=masked, window=window, **kwargs) return data - def _get_reproject_params( raster: RasterType, crs: CRS | str | int | None = None, @@ -691,12 +698,64 @@ def driver(self) -> str | None: """Driver used to read a file on disk.""" return self._driver + def _load_only_mask(self, bands: int | list[int] | None = None, **kwargs: Any) -> NDArrayBool: + """ + Load only the raster mask from disk and return as independent array (not stored in any class attributes). + + :param bands: Band(s) to load. Note that rasterio begins counting at 1, not 0. + :param kwargs: Optional keyword arguments sent to '_load_rio()'. + + :raises ValueError: If the data are already loaded. + :raises AttributeError: If no 'filename' attribute exists. + """ + if self.is_loaded: + raise ValueError("Data are already loaded.") + + if self.filename is None: + raise AttributeError( + "Cannot load as filename is not set anymore. Did you manually update the filename attribute?" + ) + + out_count = self._out_count + # If no index is passed, use all of them + if bands is None: + valid_bands = self.bands + # If a new index was pass, redefine out_shape + elif isinstance(bands, (int, list)): + # Rewrite properly as a tuple + if isinstance(bands, int): + valid_bands = (bands,) + else: + valid_bands = tuple(bands) + # Update out_count if out_shape exists (when a downsampling has been passed) + if self._out_shape is not None: + out_count = len(valid_bands) + + # If a downsampled out_shape was defined during instantiation + with rio.open(self.filename) as dataset: + mask = _load_rio( + dataset, + only_mask=True, + indexes=list(valid_bands), + masked=self._masked, + transform=self.transform, + shape=self.shape, + out_shape=self._out_shape, + out_count=out_count, + **kwargs, + ) + + # Valid data is equal to 255, invalid is equal to zero + mask_bool = mask == 255 + + return mask_bool + def load(self, bands: int | list[int] | None = None, **kwargs: Any) -> None: """ Load the raster array from disk. - :param kwargs: Optional keyword arguments sent to '_load_rio()'. :param bands: Band(s) to load. Note that rasterio begins counting at 1, not 0. + :param kwargs: Optional keyword arguments sent to '_load_rio()'. :raises ValueError: If the data are already loaded. :raises AttributeError: If no 'filename' attribute exists. @@ -3364,86 +3423,117 @@ def split_bands(self: RasterType, copy: bool = False, bands: list[int] | int | N return raster_bands @overload - def to_points( + def to_pointcloud( self, - subsample: float | int, + data_columns: str | list[str] = None, + subsample: float | int = 1, + *, as_array: Literal[False] = False, - pixel_offset: Literal["center", "corner"] = "center", + random_state: np.random.RandomState | int | None = None, + force_pixel_offset: Literal["center", "ul", "ur", "ll", "lr"] = "ul", ) -> NDArrayNum: ... @overload - def to_points( + def to_pointcloud( self, - subsample: float | int, + data_columns: str | list[str] = None, + subsample: float | int = 1, + *, as_array: Literal[True], - pixel_offset: Literal["center", "corner"] = "center", + random_state: np.random.RandomState | int | None = None, + force_pixel_offset: Literal["center", "ul", "ur", "ll", "lr"] = "ul", ) -> Vector: ... - def to_points( + @overload + def to_pointcloud( + self, + data_column_names: str | list[str] = None, + subsample: float | int = 1, + *, + as_array: bool = False, + random_state: np.random.RandomState | int | None = None, + force_pixel_offset: Literal["center", "ul", "ur", "ll", "lr"] = "ul", + ) -> NDArrayNum | Vector: + ... + + def to_pointcloud( self, + data_column_names: str | list[str] = None, subsample: float | int = 1, as_array: bool = False, - pixel_offset: Literal["center", "corner"] = "center", + random_state: np.random.RandomState | int | None = None, + force_pixel_offset: Literal["center", "ul", "ur", "ll", "lr"] = "ul", ) -> NDArrayNum | Vector: """ - Convert raster to a table of coordinates and their corresponding values. - - Optionally, randomly sample the raster. + Convert raster to point cloud with one data column per band (names default to "b1", "b2", etc). - If 'sample' is either 1, or is equal to the pixel count, all points are returned in order. - If 'sample' is smaller than 1 (for fractions), or smaller than the pixel count, a random sample - of points is returned. + Optionally, randomly subsample valid pixels (nodata values are skipped). + If 'subsample' is either 1, or is equal to the pixel count, all valid points are returned. + If 'subsample' is smaller than 1 (for fractions), or smaller than the pixel count, a random subsample + of valid points is returned. - If the raster is not loaded, sampling will be done from disk without loading the entire Raster. + If the raster is not loaded, sampling will be done from disk using rasterio.sample after loading only the masks + of the dataset. Formats: * `as_array` == False: A vector with dataframe columns ["b1", "b2", ..., "geometry"], * `as_array` == True: A numpy ndarray of shape (N, 2 + count) with the columns [x, y, b1, b2..]. + :param data_column_names: Names of data columns to use for band values. :param subsample: Subsample size. If > 1, parsed as a count, otherwise a fraction. :param as_array: Return an array instead of a vector. - :param pixel_offset: The point at which to associate the pixel coordinate with ('corner' == upper left). + :param random_state: Random state or seed number. + :param force_pixel_offset: Force offset to derive point coordinate with. Raster coordinates normally only + associate to upper-left corner "ul" ("Area" definition) or center ("Point" definition). :raises ValueError: If the sample count or fraction is poorly formatted. - :returns: A GeoDataFrame, or ndarray of the shape (N, 2 + count) where N is the sample count. + :returns: A vector, or ndarray of the shape (N, 2 + count) where N is the sample count. """ - data_size = self.width * self.height - # Validate the sample argument. - if subsample <= 0.0: - raise ValueError(f"subsample cannot be zero or negative (given value: {subsample})") - # If the sample is equal to or less than 1, it is assumed to be a fraction. - if subsample <= 1.0: - sample = int(data_size * subsample) - else: - sample = int(subsample) - if sample > data_size: - raise ValueError(f"sample cannot exceed the size of the dataset ({sample} vs {data_size})") + # Format data columns input into list if it is a string + if isinstance(data_column_names, str): + data_column_names = [data_column_names] + if not self.count == len(data_column_names): + raise ValueError(f"Data column names {data_column_names} must be a list of same length as " + f"raster band count {self.count}.") + + if self.is_loaded: - # If the sample is smaller than the max size, take a random sample of indices, otherwise take the whole. - choice = np.random.randint(0, data_size - 1, sample) if sample != data_size else np.arange(data_size) + # Get subsample indices on raster directly + indices = self.subsample(subsample=subsample, random_state=random_state, return_indices=True) - cols = choice % self.width - rows = (choice / self.width).astype(int) + else: + # Load only mask of valid data from disk + valid_mask = self._load_only_mask() + + # Get subsample on valid mask + indices = subsample_array(array=valid_mask, subsample=subsample, random_state=random_state, + return_indices=True) - # Extract the coordinates of the pixels and filter by the chosen pixels. - x_coords, y_coords = (np.array(a) for a in self.ij2xy(rows, cols, offset=pixel_offset)) + # Extract the coordinates at subsampled pixels with valid data + # To extract data, we always use "upper left" which rasterio interprets as the exact coordinates of the raster + x_coords, y_coords = (np.array(a) for a in self.ij2xy(indices[0], indices[1], offset="ul")) - # If the Raster is loaded, pick from the data, otherwise use the disk-sample method from rasterio. + # If the Raster is loaded, pick from the data while ignoring the mask if self.is_loaded: if self.count == 1: - pixel_data = self.data[rows, cols] + pixel_data = self.data[indices[0], indices[1]] else: - pixel_data = self.data[:, rows, cols] + pixel_data = self.data[:, indices[0], indices[1]] + + # Otherwise use rasterio.sample to load only requested pixels else: with rio.open(self.filename) as raster: pixel_data = np.array(list(raster.sample(zip(x_coords, y_coords)))).T - if isinstance(pixel_data, np.ma.masked_array): - pixel_data = np.where(pixel_data.mask, np.nan, pixel_data.data) + # Fill masked array with NaNs before writing to points + pixel_data = pixel_data.filled(np.nan) + + # Now we force the coordinates we define for the point cloud, according to pixel interpretation + x_coords, y_coords = (np.array(a) for a in self.ij2xy(indices[0], indices[1], offset=force_pixel_offset)) # Merge the coordinates and pixel data into a point cloud. points_arr = np.vstack((x_coords.reshape(1, -1), y_coords.reshape(1, -1), pixel_data)).T @@ -3452,7 +3542,7 @@ def to_points( points = Vector( gpd.GeoDataFrame( points_arr[:, 2:], - columns=[f"b{i}" for i in range(1, self.count + 1)], + columns=data_column_names if not None else [f"b{i}" for i in range(1, self.count + 1)], geometry=gpd.points_from_xy(points_arr[:, 0], points_arr[:, 1]), crs=self.crs, ) diff --git a/tests/test_raster.py b/tests/test_raster.py index 97a68d86..86812e7a 100644 --- a/tests/test_raster.py +++ b/tests/test_raster.py @@ -2966,8 +2966,12 @@ def test_proximity_parameters(self) -> None: # With only inside proximity raster1.proximity(vector=vector, in_or_out="in") - def test_to_points(self) -> None: - """Test the outputs of the to_points method and that it doesn't load if not needed.""" + @pytest.mark.parametrize("example", [landsat_b4_path, aster_dem_path]) # type: ignore + def test_to_pointcloud(self, example: str) -> None: + """Test to_pointcloud with real data.""" + + def test_to_pointcloud__synthetic(self) -> None: + """Test to_pointcloud method with synthetic data.""" # Create a small raster to test point sampling on img0 = gu.Raster.from_array( @@ -2975,24 +2979,24 @@ def test_to_points(self) -> None: ) # Sample the whole raster (fraction==1) - points = img0.to_points(1, as_array=True) + points = img0.to_pointcloud(subsample=1, as_array=True) # Validate that 25 points were sampled (equating to img1.height * img1.width) with x, y, and band0 values. assert isinstance(points, np.ndarray) assert points.shape == (25, 3) assert np.array_equal(np.asarray(points[:, 0]), np.tile(np.linspace(0.5, 4.5, 5), 5)) - assert img0.to_points(0.2, as_array=True).shape == (5, 3) + assert img0.to_pointcloud(subsample=0.2, as_array=True).shape == (5, 3) # Try with a single-band raster img1 = gu.Raster(self.aster_dem_path) - points = img1.to_points(10, as_array=True) + points = img1.to_pointcloud(subsample=10, as_array=True) assert points.shape == (10, 3) assert not img1.is_loaded - points_frame = img1.to_points(10) + points_frame = img1.to_pointcloud(subsample=10) assert isinstance(points_frame, gu.Vector) assert np.array_equal(points_frame.ds.columns, ["b1", "geometry"]) @@ -3001,12 +3005,12 @@ def test_to_points(self) -> None: # Try with a multi-band raster img2 = gu.Raster(self.landsat_rgb_path) - points = img2.to_points(10, as_array=True) + points = img2.to_pointcloud(subsample=10, as_array=True) assert points.shape == (10, 5) assert not img2.is_loaded - points_frame = img2.to_points(10) + points_frame = img2.to_pointcloud(subsample=10) assert isinstance(points_frame, gu.Vector) assert np.array_equal(points_frame.ds.columns, ["b1", "b2", "b3", "geometry"]) From 1d6ef326f61c5e5925e894d385d5796eff843109 Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Thu, 7 Mar 2024 14:42:19 -0900 Subject: [PATCH 02/13] Fix behaviour to be consistent with point cloud definition --- geoutils/raster/raster.py | 95 ++++++++++++++++++++++++++++----------- 1 file changed, 69 insertions(+), 26 deletions(-) diff --git a/geoutils/raster/raster.py b/geoutils/raster/raster.py index 598b4f2a..b0b1c305 100644 --- a/geoutils/raster/raster.py +++ b/geoutils/raster/raster.py @@ -3425,7 +3425,9 @@ def split_bands(self: RasterType, copy: bool = False, bands: list[int] | int | N @overload def to_pointcloud( self, - data_columns: str | list[str] = None, + data_column_name: str = "b1", + data_band: int = 1, + store_auxiliary_bands: bool = False, subsample: float | int = 1, *, as_array: Literal[False] = False, @@ -3437,7 +3439,9 @@ def to_pointcloud( @overload def to_pointcloud( self, - data_columns: str | list[str] = None, + data_column_name: str = "b1", + data_band: int = 1, + store_auxiliary_bands: bool = False, subsample: float | int = 1, *, as_array: Literal[True], @@ -3448,26 +3452,40 @@ def to_pointcloud( @overload def to_pointcloud( - self, - data_column_names: str | list[str] = None, - subsample: float | int = 1, - *, - as_array: bool = False, - random_state: np.random.RandomState | int | None = None, - force_pixel_offset: Literal["center", "ul", "ur", "ll", "lr"] = "ul", + self, + data_column_name: str = "b1", + data_band: int = 1, + store_auxiliary_bands: bool = False, + subsample: float | int = 1, + *, + as_array: bool = False, + random_state: np.random.RandomState | int | None = None, + force_pixel_offset: Literal["center", "ul", "ur", "ll", "lr"] = "ul", ) -> NDArrayNum | Vector: ... def to_pointcloud( self, - data_column_names: str | list[str] = None, + data_column_name: str = "b1", + data_band: int = 1, + store_auxiliary_bands: bool = False, subsample: float | int = 1, as_array: bool = False, random_state: np.random.RandomState | int | None = None, force_pixel_offset: Literal["center", "ul", "ur", "ll", "lr"] = "ul", ) -> NDArrayNum | Vector: """ - Convert raster to point cloud with one data column per band (names default to "b1", "b2", etc). + Convert raster to point cloud. + + A point cloud is a vector of point geometries associated to a data column, and possibly other auxiliary data + columns, see geoutils.PointCloud. + + For a single band raster, the main data column name of the point cloud defaults to "b1" and stores values of + that single band. + For a multi-band raster, the main data column name of the point cloud defaults to "bX" where X is the data band + index chosen by the user (defaults to 1, the first band). + Optionally, all other bands can also be stored in columns "b1", "b2", etc. For more specific band selection, + use Raster.split_bands previous to converting to point cloud. Optionally, randomly subsample valid pixels (nodata values are skipped). If 'subsample' is either 1, or is equal to the pixel count, all valid points are returned. @@ -3481,7 +3499,11 @@ def to_pointcloud( * `as_array` == False: A vector with dataframe columns ["b1", "b2", ..., "geometry"], * `as_array` == True: A numpy ndarray of shape (N, 2 + count) with the columns [x, y, b1, b2..]. - :param data_column_names: Names of data columns to use for band values. + :param data_column_name: Name of point cloud data column to use, defaults to "b1". + :param data_band: (Only for multi-band rasters) Band to use for data column, defaults to first. Band counting + starts at 1. + :param store_auxiliary_bands: (Only for multi-band rasters) Whether to save other bands as auxiliary data + columns, defaults to False. :param subsample: Subsample size. If > 1, parsed as a count, otherwise a fraction. :param as_array: Return an array instead of a vector. :param random_state: Random state or seed number. @@ -3490,15 +3512,24 @@ def to_pointcloud( :raises ValueError: If the sample count or fraction is poorly formatted. - :returns: A vector, or ndarray of the shape (N, 2 + count) where N is the sample count. + :returns: A point cloud, or ndarray of the shape (N, 2 + count) where N is the sample count. """ - # Format data columns input into list if it is a string - if isinstance(data_column_names, str): - data_column_names = [data_column_names] - if not self.count == len(data_column_names): - raise ValueError(f"Data column names {data_column_names} must be a list of same length as " - f"raster band count {self.count}.") + # Input checks + if not isinstance(data_column_name, str): + raise ValueError("Data column name must be a string.") + if not isinstance(data_band, int) and (1 < data_band or self.count < data_band): + raise ValueError(f"Data band number must be between 1 and the total number of bands of {self.count}.") + + # Rename data column if a different band is selected but the name is still default + if data_band != 1 and data_column_name == "b1": + data_column_name = "b" + str(data_band) + + # Extract all bands if storing, otherwise the single one + if store_auxiliary_bands: + bands_to_extract = None + else: + bands_to_extract = data_band if self.is_loaded: @@ -3507,7 +3538,7 @@ def to_pointcloud( else: # Load only mask of valid data from disk - valid_mask = self._load_only_mask() + valid_mask = self._load_only_mask(bands=bands_to_extract) # Get subsample on valid mask indices = subsample_array(array=valid_mask, subsample=subsample, random_state=random_state, @@ -3515,6 +3546,7 @@ def to_pointcloud( # Extract the coordinates at subsampled pixels with valid data # To extract data, we always use "upper left" which rasterio interprets as the exact coordinates of the raster + # Further below we redefine output coordinates based on point interpretation x_coords, y_coords = (np.array(a) for a in self.ij2xy(indices[0], indices[1], offset="ul")) # If the Raster is loaded, pick from the data while ignoring the mask @@ -3522,27 +3554,38 @@ def to_pointcloud( if self.count == 1: pixel_data = self.data[indices[0], indices[1]] else: - pixel_data = self.data[:, indices[0], indices[1]] + if store_auxiliary_bands: + pixel_data = self.data[:, indices[0], indices[1]] + else: + pixel_data = self.data[data_band - 1, indices[0], indices[1]] # Otherwise use rasterio.sample to load only requested pixels else: with rio.open(self.filename) as raster: - pixel_data = np.array(list(raster.sample(zip(x_coords, y_coords)))).T + pixel_data = np.array(list(raster.sample(zip(x_coords, y_coords))), indexes=bands_to_extract).T # Fill masked array with NaNs before writing to points - pixel_data = pixel_data.filled(np.nan) + if np.ma.is_masked(pixel_data): + pixel_data = pixel_data.filled(np.nan) # Now we force the coordinates we define for the point cloud, according to pixel interpretation - x_coords, y_coords = (np.array(a) for a in self.ij2xy(indices[0], indices[1], offset=force_pixel_offset)) + x_coords_2, y_coords_2 = (np.array(a) for a in self.ij2xy(indices[0], indices[1], offset=force_pixel_offset)) # Merge the coordinates and pixel data into a point cloud. - points_arr = np.vstack((x_coords.reshape(1, -1), y_coords.reshape(1, -1), pixel_data)).T + points_arr = np.vstack((x_coords_2.reshape(1, -1), y_coords_2.reshape(1, -1), pixel_data)).T + + # Define column names + if store_auxiliary_bands: + all_column_names = [f"b{i}" for i in range(1, self.count + 1)] + all_column_names[data_band - 1] = data_column_name + else: + all_column_names = [data_column_name] if not as_array: points = Vector( gpd.GeoDataFrame( points_arr[:, 2:], - columns=data_column_names if not None else [f"b{i}" for i in range(1, self.count + 1)], + columns=all_column_names, geometry=gpd.points_from_xy(points_arr[:, 0], points_arr[:, 1]), crs=self.crs, ) From abae0973ac0938f59bd9948dd90f3e9c5beb9672 Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Thu, 7 Mar 2024 18:38:34 -0900 Subject: [PATCH 03/13] Add tests and adjust subsampling logic for subsample=1 --- geoutils/raster/raster.py | 49 ++++++++------ geoutils/raster/sampling.py | 9 ++- tests/test_raster.py | 123 ++++++++++++++++++++++++++++-------- 3 files changed, 132 insertions(+), 49 deletions(-) diff --git a/geoutils/raster/raster.py b/geoutils/raster/raster.py index b0b1c305..4baa753e 100644 --- a/geoutils/raster/raster.py +++ b/geoutils/raster/raster.py @@ -46,6 +46,7 @@ _get_utm_ups_crs, ) from geoutils.raster.sampling import subsample_array +from geoutils.raster.array import get_mask from geoutils.vector import Vector # If python38 or above, Literal is builtin. Otherwise, use typing_extensions @@ -245,12 +246,12 @@ def _load_rio( if indexes is None: if only_mask: - data = dataset.read_masks(masked=masked, window=window, **kwargs) + data = dataset.read_masks(window=window, **kwargs) else: data = dataset.read(masked=masked, window=window, **kwargs) else: if only_mask: - data = dataset.read_masks(indexes=indexes, masked=masked, window=window, **kwargs) + data = dataset.read_masks(indexes=indexes, window=window, **kwargs) else: data = dataset.read(indexes=indexes, masked=masked, window=window, **kwargs) return data @@ -817,7 +818,9 @@ def from_array( ) -> RasterType: """Create a raster from a numpy array and the georeferencing information. - :param data: Input array. + Expects a 2D (single band) or 3D (multi-band) array of the raster. The first axis corresponds to bands. + + :param data: Input array, 2D for single band or 3D for multi-band (bands should be first axis). :param transform: Affine 2D transform. Either a tuple(x_res, 0.0, top_left_x, 0.0, y_res, top_left_y) or an affine.Affine object. :param crs: Coordinate reference system. Either a rasterio CRS, @@ -3487,7 +3490,7 @@ def to_pointcloud( Optionally, all other bands can also be stored in columns "b1", "b2", etc. For more specific band selection, use Raster.split_bands previous to converting to point cloud. - Optionally, randomly subsample valid pixels (nodata values are skipped). + Optionally, randomly subsample valid pixels of the data band (nodata values are skipped). If 'subsample' is either 1, or is equal to the pixel count, all valid points are returned. If 'subsample' is smaller than 1 (for fractions), or smaller than the pixel count, a random subsample of valid points is returned. @@ -3525,24 +3528,24 @@ def to_pointcloud( if data_band != 1 and data_column_name == "b1": data_column_name = "b" + str(data_band) - # Extract all bands if storing, otherwise the single one - if store_auxiliary_bands: - bands_to_extract = None - else: - bands_to_extract = data_band - + # The valid mask is considered only for the data band if self.is_loaded: + if self.count == 1: + self_mask = get_mask(self.data) # This is to avoid the case where the mask is just "False" + valid_mask = ~self_mask + else: + self_mask = get_mask(self.data[data_band - 1, :, :]) # This is to avoid the case where the mask is just "False" + valid_mask = ~self_mask - # Get subsample indices on raster directly - indices = self.subsample(subsample=subsample, random_state=random_state, return_indices=True) - + # Load only mask of valid data from disk if array not loaded else: - # Load only mask of valid data from disk - valid_mask = self._load_only_mask(bands=bands_to_extract) + valid_mask = self._load_only_mask(bands=data_band) - # Get subsample on valid mask - indices = subsample_array(array=valid_mask, subsample=subsample, random_state=random_state, - return_indices=True) + # Get subsample on valid mask + # Build a low memory boolean masked array with invalid values masked to pass to subsampling + ma_valid = np.ma.masked_array(data=np.ones(np.shape(valid_mask), dtype=bool), mask=~valid_mask) + # Take a subsample within the valid values + indices = subsample_array(array=ma_valid, subsample=subsample, random_state=random_state, return_indices=True) # Extract the coordinates at subsampled pixels with valid data # To extract data, we always use "upper left" which rasterio interprets as the exact coordinates of the raster @@ -3561,11 +3564,17 @@ def to_pointcloud( # Otherwise use rasterio.sample to load only requested pixels else: + # Sample all bands if storing auxiliary, otherwise the single one + if store_auxiliary_bands: + bands_to_extract = None + else: + bands_to_extract = data_band + with rio.open(self.filename) as raster: - pixel_data = np.array(list(raster.sample(zip(x_coords, y_coords))), indexes=bands_to_extract).T + pixel_data = np.array(list(raster.sample(zip(x_coords, y_coords), indexes=bands_to_extract))).T # Fill masked array with NaNs before writing to points - if np.ma.is_masked(pixel_data): + if np.ma.isMaskedArray(pixel_data): pixel_data = pixel_data.filled(np.nan) # Now we force the coordinates we define for the point cloud, according to pixel interpretation diff --git a/geoutils/raster/sampling.py b/geoutils/raster/sampling.py index d0abcc8e..45ed9996 100644 --- a/geoutils/raster/sampling.py +++ b/geoutils/raster/sampling.py @@ -72,6 +72,14 @@ def subsample_array( valids = np.argwhere(~mask.flatten()).squeeze() # Get number of points to extract + # If subsample is one, we don't perform subsampling any operation, we return the valid array or indices + if subsample == 1: + unraveled_indices = np.unravel_index(valids, array.shape) + if return_indices: + return unraveled_indices + + else: + return array[unraveled_indices] if (subsample <= 1) & (subsample > 0): npoints = int(subsample * np.count_nonzero(~mask)) elif subsample > 1: @@ -90,7 +98,6 @@ def subsample_array( if return_indices: return unraveled_indices - else: return array[unraveled_indices] diff --git a/tests/test_raster.py b/tests/test_raster.py index 86812e7a..f64cf597 100644 --- a/tests/test_raster.py +++ b/tests/test_raster.py @@ -2966,55 +2966,122 @@ def test_proximity_parameters(self) -> None: # With only inside proximity raster1.proximity(vector=vector, in_or_out="in") - @pytest.mark.parametrize("example", [landsat_b4_path, aster_dem_path]) # type: ignore - def test_to_pointcloud(self, example: str) -> None: - """Test to_pointcloud with real data.""" + def test_to_pointcloud(self) -> None: + """Test to_pointcloud method.""" - def test_to_pointcloud__synthetic(self) -> None: - """Test to_pointcloud method with synthetic data.""" + # 1/ Single band synthetic data # Create a small raster to test point sampling on - img0 = gu.Raster.from_array( - np.arange(25, dtype="int32").reshape(5, 5), transform=rio.transform.from_origin(0, 5, 1, 1), crs=4326 - ) + img_arr = np.arange(25, dtype="int32").reshape(5, 5) + img0 = gu.Raster.from_array(img_arr, transform=rio.transform.from_origin(0, 5, 1, 1), crs=4326) # Sample the whole raster (fraction==1) - points = img0.to_pointcloud(subsample=1, as_array=True) + points = img0.to_pointcloud() + points_arr = img0.to_pointcloud(as_array=True) - # Validate that 25 points were sampled (equating to img1.height * img1.width) with x, y, and band0 values. - assert isinstance(points, np.ndarray) - assert points.shape == (25, 3) - assert np.array_equal(np.asarray(points[:, 0]), np.tile(np.linspace(0.5, 4.5, 5), 5)) + # Check output types + assert isinstance(points, gu.Vector) + assert isinstance(points_arr, np.ndarray) + + # Check that both outputs (array or vector) are fully consistent, order matters here + assert np.array_equal(points.ds.geometry.x.values, points_arr[:, 0]) + assert np.array_equal(points.ds.geometry.y.values, points_arr[:, 1]) + assert np.array_equal(points.ds["b1"].values, points_arr[:, 2]) - assert img0.to_pointcloud(subsample=0.2, as_array=True).shape == (5, 3) + # Validate that 25 points were sampled (equating to img1.height * img1.width) with x, y, and band0 values. + assert isinstance(points_arr, np.ndarray) + assert points_arr.shape == (25, 3) + assert points.ds.shape == (25, 2) # One less column here due to geometry storing X and Y + # Check that X, Y and Z arrays are equal to raster array input independently of value order + x_coords, y_coords = img0.ij2xy(i=np.arange(0, 5), j=np.arange(0, 5)) + assert np.array_equal(np.sort(np.asarray(points_arr[:, 0])), np.sort(np.tile(x_coords, 5))) + assert np.array_equal(np.sort(np.asarray(points_arr[:, 1])), np.sort(np.tile(y_coords, 5))) + assert np.array_equal(np.sort(np.asarray(points_arr[:, 2])), np.sort(img_arr.ravel())) + + # Check that subsampling works properly + points_arr = img0.to_pointcloud(subsample=0.2, as_array=True) + assert points_arr.shape == (5, 3) + + # All values should be between 0 and 25 + assert all(0 <= points_arr[:, 2]) and all(points_arr[:, 2] < 25) + + # 2/ Multi-band synthetic data + img_arr = np.arange(25, dtype="int32").reshape(5, 5) + img_3d_arr = np.stack((img_arr, 25 + img_arr, 50 + img_arr), axis=0) + img3d = gu.Raster.from_array(img_3d_arr, transform=rio.transform.from_origin(0, 5, 1, 1), crs=4326) - # Try with a single-band raster + # Sample the whole raster (fraction==1) + points = img3d.to_pointcloud(store_auxiliary_bands=True) + points_arr = img3d.to_pointcloud(as_array=True, store_auxiliary_bands=True) + + # Check equality between both output types + assert np.array_equal(points.ds.geometry.x.values, points_arr[:, 0]) + assert np.array_equal(points.ds.geometry.y.values, points_arr[:, 1]) + assert np.array_equal(points.ds["b1"].values, points_arr[:, 2]) + assert np.array_equal(points.ds["b2"].values, points_arr[:, 3]) + assert np.array_equal(points.ds["b3"].values, points_arr[:, 4]) + + # Check it is the right data + assert np.array_equal(np.sort(np.asarray(points_arr[:, 0])), np.sort(np.tile(x_coords, 5))) + assert np.array_equal(np.sort(np.asarray(points_arr[:, 1])), np.sort(np.tile(y_coords, 5))) + assert np.array_equal(np.sort(np.asarray(points_arr[:, 2])), np.sort(img_3d_arr[0, :, :].ravel())) + assert np.array_equal(np.sort(np.asarray(points_arr[:, 3])), np.sort(img_3d_arr[1, :, :].ravel())) + assert np.array_equal(np.sort(np.asarray(points_arr[:, 4])), np.sort(img_3d_arr[2, :, :].ravel())) + + # With a subsample + points_arr = img3d.to_pointcloud(as_array=True, subsample=10, store_auxiliary_bands=True) + assert points_arr.shape == (10, 5) + + # Check the values are still good + assert all(0 <= points_arr[:, 2]) and all(points_arr[:, 2] < 25) + assert all(25 <= points_arr[:, 3]) and all(points_arr[:, 3] < 50) + assert all(50 <= points_arr[:, 4]) and all(points_arr[:, 4] < 75) + + # 3/ Single-band real raster with nodata values img1 = gu.Raster(self.aster_dem_path) - points = img1.to_pointcloud(subsample=10, as_array=True) + # Get a large sample to ensure they should be some NaNs normally + points_arr = img1.to_pointcloud(subsample=10000, as_array=True, random_state=42) + points = img1.to_pointcloud(subsample=10000, random_state=42) - assert points.shape == (10, 3) + # This should not load the image assert not img1.is_loaded - points_frame = img1.to_pointcloud(subsample=10) + # The subsampled values should be valid and the right shape + assert points_arr.shape == (10000, 3) + assert points.ds.shape == (10000, 2) # One less column here due to geometry storing X and Y + assert all(np.isfinite(points_arr[:, 2])) - assert isinstance(points_frame, gu.Vector) - assert np.array_equal(points_frame.ds.columns, ["b1", "geometry"]) - assert points_frame.crs == img1.crs + # The output should respect the default band naming and the input CRS + assert np.array_equal(points.ds.columns, ["b1", "geometry"]) + assert points.crs == img1.crs - # Try with a multi-band raster + # Try setting the band name + points = img1.to_pointcloud(data_column_name="lol", subsample=10) + assert np.array_equal(points.ds.columns, ["lol", "geometry"]) + + # 4/ Multi-band real raster img2 = gu.Raster(self.landsat_rgb_path) - points = img2.to_pointcloud(subsample=10, as_array=True) + # By default only loads a single band without loading + points_arr = img2.to_pointcloud(subsample=10, as_array=True) + points = img2.to_pointcloud(subsample=10) - assert points.shape == (10, 5) + assert points_arr.shape == (10, 3) + assert points.ds.shape == (10, 2) # One less column here due to geometry storing X and Y assert not img2.is_loaded - points_frame = img2.to_pointcloud(subsample=10) + # Storing auxiliary bands + points_arr = img2.to_pointcloud(subsample=10, as_array=True, store_auxiliary_bands=True) + points = img2.to_pointcloud(subsample=10, store_auxiliary_bands=True) + assert points_arr.shape == (10, 5) + assert points.ds.shape == (10, 4) # One less column here due to geometry storing X and Y + assert not img2.is_loaded + assert np.array_equal(points.ds.columns, ["b1", "b2", "b3", "geometry"]) - assert isinstance(points_frame, gu.Vector) - assert np.array_equal(points_frame.ds.columns, ["b1", "b2", "b3", "geometry"]) - assert points_frame.crs == img2.crs + # Try setting the column name of a specific band while storing all + points = img2.to_pointcloud(subsample=10, data_column_name="yes", data_band=2, store_auxiliary_bands=True) + assert np.array_equal(points.ds.columns, ["b1", "yes", "b3", "geometry"]) class TestMask: From 1c2be36bb22e9dcf5c03847ae19cbbf0588de1e3 Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Thu, 7 Mar 2024 18:41:21 -0900 Subject: [PATCH 04/13] Fix comment --- geoutils/raster/sampling.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/geoutils/raster/sampling.py b/geoutils/raster/sampling.py index 45ed9996..8cdfa0ef 100644 --- a/geoutils/raster/sampling.py +++ b/geoutils/raster/sampling.py @@ -72,12 +72,11 @@ def subsample_array( valids = np.argwhere(~mask.flatten()).squeeze() # Get number of points to extract - # If subsample is one, we don't perform subsampling any operation, we return the valid array or indices + # If subsample is one, we don't perform any subsampling operation, we return the valid array or indices directly if subsample == 1: unraveled_indices = np.unravel_index(valids, array.shape) if return_indices: return unraveled_indices - else: return array[unraveled_indices] if (subsample <= 1) & (subsample > 0): From ce9c5e363df2f6ed03b6db630e6a055c15eb3c38 Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Thu, 7 Mar 2024 18:46:43 -0900 Subject: [PATCH 05/13] Linting --- geoutils/raster/array.py | 11 ++++++----- geoutils/raster/raster.py | 7 +++++-- tests/test_raster.py | 2 +- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/geoutils/raster/array.py b/geoutils/raster/array.py index 55de4d7c..a838a2e6 100644 --- a/geoutils/raster/array.py +++ b/geoutils/raster/array.py @@ -7,10 +7,10 @@ import numpy as np import geoutils as gu -from geoutils._typing import MArrayNum, NDArrayNum +from geoutils._typing import MArrayNum, NDArrayBool, NDArrayNum -def get_mask(array: NDArrayNum | MArrayNum) -> NDArrayNum: +def get_mask(array: NDArrayNum | NDArrayBool | MArrayNum) -> NDArrayBool: """ Return the mask of invalid values, whether array is a ndarray with NaNs or a np.ma.masked_array. @@ -24,7 +24,7 @@ def get_mask(array: NDArrayNum | MArrayNum) -> NDArrayNum: def get_array_and_mask( array: NDArrayNum | MArrayNum, check_shape: bool = True, copy: bool = True -) -> tuple[NDArrayNum, NDArrayNum]: +) -> tuple[NDArrayNum, NDArrayBool]: """ Return array with masked values set to NaN and the associated mask. Works whether array is a ndarray with NaNs or a np.ma.masked_array. @@ -66,14 +66,15 @@ def get_array_and_mask( return array_data, invalid_mask -def get_valid_extent(array: NDArrayNum | MArrayNum) -> tuple[int, ...]: +def get_valid_extent(array: NDArrayNum | NDArrayBool | MArrayNum) -> tuple[int, ...]: """ Return (rowmin, rowmax, colmin, colmax), the first/last row/column of array with valid pixels """ if not array.dtype == "bool": valid_mask = ~get_mask(array) else: - valid_mask = array + # Not sure why Mypy is not recognizing that the type of the array can only be bool here + valid_mask = array # type: ignore cols_nonzero = np.where(np.count_nonzero(valid_mask, axis=0) > 0)[0] rows_nonzero = np.where(np.count_nonzero(valid_mask, axis=1) > 0)[0] return rows_nonzero[0], rows_nonzero[-1], cols_nonzero[0], cols_nonzero[-1] diff --git a/geoutils/raster/raster.py b/geoutils/raster/raster.py index 4baa753e..a3fa2ee8 100644 --- a/geoutils/raster/raster.py +++ b/geoutils/raster/raster.py @@ -45,8 +45,8 @@ _get_footprint_projected, _get_utm_ups_crs, ) -from geoutils.raster.sampling import subsample_array from geoutils.raster.array import get_mask +from geoutils.raster.sampling import subsample_array from geoutils.vector import Vector # If python38 or above, Literal is builtin. Otherwise, use typing_extensions @@ -256,6 +256,7 @@ def _load_rio( data = dataset.read(indexes=indexes, masked=masked, window=window, **kwargs) return data + def _get_reproject_params( raster: RasterType, crs: CRS | str | int | None = None, @@ -3534,7 +3535,9 @@ def to_pointcloud( self_mask = get_mask(self.data) # This is to avoid the case where the mask is just "False" valid_mask = ~self_mask else: - self_mask = get_mask(self.data[data_band - 1, :, :]) # This is to avoid the case where the mask is just "False" + self_mask = get_mask( + self.data[data_band - 1, :, :] + ) # This is to avoid the case where the mask is just "False" valid_mask = ~self_mask # Load only mask of valid data from disk if array not loaded diff --git a/tests/test_raster.py b/tests/test_raster.py index f64cf597..1a1ed33f 100644 --- a/tests/test_raster.py +++ b/tests/test_raster.py @@ -2991,7 +2991,7 @@ def test_to_pointcloud(self) -> None: # Validate that 25 points were sampled (equating to img1.height * img1.width) with x, y, and band0 values. assert isinstance(points_arr, np.ndarray) assert points_arr.shape == (25, 3) - assert points.ds.shape == (25, 2) # One less column here due to geometry storing X and Y + assert points.ds.shape == (25, 2) # One less column here due to geometry storing X and Y # Check that X, Y and Z arrays are equal to raster array input independently of value order x_coords, y_coords = img0.ij2xy(i=np.arange(0, 5), j=np.arange(0, 5)) assert np.array_equal(np.sort(np.asarray(points_arr[:, 0])), np.sort(np.tile(x_coords, 5))) From 7e2bf8586fba42e98ed87c92a258ec96a661a36b Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Thu, 7 Mar 2024 19:06:15 -0900 Subject: [PATCH 06/13] Update documentation --- doc/source/api.md | 2 +- doc/source/raster_class.md | 4 ++-- examples/handling/interface/topoints.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/api.md b/doc/source/api.md index 0f3606a3..9e397e08 100644 --- a/doc/source/api.md +++ b/doc/source/api.md @@ -118,7 +118,7 @@ documentation. Raster.load Raster.save - Raster.to_points + Raster.to_pointcloud Raster.to_rio_dataset Raster.to_xarray ``` diff --git a/doc/source/raster_class.md b/doc/source/raster_class.md index f4bc6e60..637030b3 100644 --- a/doc/source/raster_class.md +++ b/doc/source/raster_class.md @@ -332,7 +332,7 @@ A {class}`~geoutils.Raster` can be exported to different formats, to facilitate Those include exporting to: - a {class}`xarray.Dataset` with {class}`~geoutils.Raster.to_xarray`, - a {class}`rasterio.io.DatasetReader` with {class}`~geoutils.Raster.to_rio_dataset`, -- a {class}`numpy.ndarray` or {class}`geoutils.Vector` as a point cloud with {class}`~geoutils.Raster.to_points`. +- a {class}`numpy.ndarray` or {class}`geoutils.Vector` as a point cloud with {class}`~geoutils.Raster.to_pointcloud`. ```{code-cell} ipython3 # Export to rasterio dataset-reader through a memoryfile @@ -341,7 +341,7 @@ rast_reproj.to_rio_dataset() ```{code-cell} ipython3 # Export to geopandas dataframe -rast_reproj.to_points() +rast_reproj.to_pointcloud() ``` ```{code-cell} ipython3 diff --git a/examples/handling/interface/topoints.py b/examples/handling/interface/topoints.py index fded01f6..aa3f39b2 100644 --- a/examples/handling/interface/topoints.py +++ b/examples/handling/interface/topoints.py @@ -27,4 +27,4 @@ # %% # We plot the point vector. -pts_rast.plot(column="b1", cmap="terrain", legend=True) +pts_rast.ds.plot(column="b1", cmap="terrain", legend=True) From 044f294630f1cb15ce74b1298a441a10ecac61ec Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Fri, 8 Mar 2024 10:55:01 -0900 Subject: [PATCH 07/13] Fix masked array bug --- geoutils/raster/raster.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/geoutils/raster/raster.py b/geoutils/raster/raster.py index a3fa2ee8..864448fb 100644 --- a/geoutils/raster/raster.py +++ b/geoutils/raster/raster.py @@ -3576,9 +3576,9 @@ def to_pointcloud( with rio.open(self.filename) as raster: pixel_data = np.array(list(raster.sample(zip(x_coords, y_coords), indexes=bands_to_extract))).T - # Fill masked array with NaNs before writing to points + # At this point there should not be any nodata anymore, so we can transform everything to normal array if np.ma.isMaskedArray(pixel_data): - pixel_data = pixel_data.filled(np.nan) + pixel_data = pixel_data.data # Now we force the coordinates we define for the point cloud, according to pixel interpretation x_coords_2, y_coords_2 = (np.array(a) for a in self.ij2xy(indices[0], indices[1], offset=force_pixel_offset)) From 09b1d56aa625f27cd2b7ed72e8cae61dab5f3c01 Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Fri, 8 Mar 2024 11:48:22 -0900 Subject: [PATCH 08/13] Add get_mask function and tests for loading mask --- geoutils/raster/array.py | 6 ++-- geoutils/raster/raster.py | 56 +++++++++++++++++++++++-------------- geoutils/raster/sampling.py | 4 +-- tests/test_raster.py | 18 ++++++++++++ tests/test_sampling.py | 4 +++ 5 files changed, 62 insertions(+), 26 deletions(-) diff --git a/geoutils/raster/array.py b/geoutils/raster/array.py index a838a2e6..9e7e9a9e 100644 --- a/geoutils/raster/array.py +++ b/geoutils/raster/array.py @@ -10,7 +10,7 @@ from geoutils._typing import MArrayNum, NDArrayBool, NDArrayNum -def get_mask(array: NDArrayNum | NDArrayBool | MArrayNum) -> NDArrayBool: +def get_mask_from_array(array: NDArrayNum | NDArrayBool | MArrayNum) -> NDArrayBool: """ Return the mask of invalid values, whether array is a ndarray with NaNs or a np.ma.masked_array. @@ -59,7 +59,7 @@ def get_array_and_mask( array_data = np.array(array).squeeze() if copy else np.asarray(array).squeeze() # Get the mask of invalid pixels and set nans if it is occupied. - invalid_mask = get_mask(array) + invalid_mask = get_mask_from_array(array) if np.any(invalid_mask): array_data[invalid_mask] = np.nan @@ -71,7 +71,7 @@ def get_valid_extent(array: NDArrayNum | NDArrayBool | MArrayNum) -> tuple[int, Return (rowmin, rowmax, colmin, colmax), the first/last row/column of array with valid pixels """ if not array.dtype == "bool": - valid_mask = ~get_mask(array) + valid_mask = ~get_mask_from_array(array) else: # Not sure why Mypy is not recognizing that the type of the array can only be bool here valid_mask = array # type: ignore diff --git a/geoutils/raster/raster.py b/geoutils/raster/raster.py index 864448fb..ec02ed6c 100644 --- a/geoutils/raster/raster.py +++ b/geoutils/raster/raster.py @@ -45,7 +45,7 @@ _get_footprint_projected, _get_utm_ups_crs, ) -from geoutils.raster.array import get_mask +from geoutils.raster.array import get_mask_from_array from geoutils.raster.sampling import subsample_array from geoutils.vector import Vector @@ -747,8 +747,11 @@ def _load_only_mask(self, bands: int | list[int] | None = None, **kwargs: Any) - **kwargs, ) - # Valid data is equal to 255, invalid is equal to zero - mask_bool = mask == 255 + # Rasterio says the mask should be returned in 2D for a single band but it seems not + mask = mask.squeeze() + + # Valid data is equal to 255, invalid is equal to zero (see Rasterio doc) + mask_bool = mask == 0 return mask_bool @@ -1048,23 +1051,13 @@ def raster_equal(self, other: RasterType) -> bool: - The raster's transform, crs and nodata values. """ - # If the mask is just "False", it is equivalent to being equal to an array of False - if isinstance(self.data.mask, np.bool_): - self_mask = np.zeros(np.shape(self.data), dtype=bool) - else: - self_mask = self.data.mask - - if isinstance(other.data.mask, np.bool_): - other_mask = np.zeros(np.shape(other.data), dtype=bool) - else: - other_mask = other.data.mask - if not isinstance(other, Raster): # TODO: Possibly add equals to SatelliteImage? raise NotImplementedError("Equality with other object than Raster not supported by raster_equal.") return all( [ np.array_equal(self.data.data, other.data.data, equal_nan=True), - np.array_equal(self_mask, other_mask), + # Use getmaskarray to avoid comparing boolean with array when mask=False + np.array_equal(np.ma.getmaskarray(self.data.mask), np.ma.getmaskarray(other.data.mask)), self.data.fill_value == other.data.fill_value, self.data.dtype == other.data.dtype, self.transform == other.transform, @@ -1909,7 +1902,7 @@ def get_nanarray(self, return_mask: bool = False) -> NDArrayNum | tuple[NDArrayN :param return_mask: Whether to return the mask of valid data. - :returns Array with masked data as NaNs, (Optional) Mask of valid data. + :returns Array with masked data as NaNs, (Optional) Mask of invalid data. """ # Cast array to float32 is its dtype is integer (cannot be filled with NaNs otherwise) @@ -1930,6 +1923,28 @@ def get_nanarray(self, return_mask: bool = False) -> NDArrayNum | tuple[NDArrayN else: return nanarray + def get_mask(self) -> NDArrayBool: + """ + Get mask from the raster. + + The mask is always returned as a boolean array, even if there is no mask to .data and thus .data.mask = a + single False value, nomask property of masked arrays. + + If the raster is not loaded, reads only the mask from disk to optimize memory usage. + + :return: + """ + # If it is loaded, use NumPy's getmaskarray function to deal with False values + if self.is_loaded: + mask = np.ma.getmaskarray(self.data) + # Otherwise, load from Rasterio and deal with the possibility of having a single value "False" mask manually + else: + mask = self._load_only_mask() + if isinstance(mask, np.bool_): + mask = np.zeros(self.shape, dtype=bool) + + return mask + # This is interfering with __array_ufunc__ and __array_function__, so better to leave out and specify # behaviour directly in those. # def __array__(self) -> np.ndarray: @@ -3532,17 +3547,16 @@ def to_pointcloud( # The valid mask is considered only for the data band if self.is_loaded: if self.count == 1: - self_mask = get_mask(self.data) # This is to avoid the case where the mask is just "False" - valid_mask = ~self_mask + self_mask = get_mask_from_array(self.data) # This is to avoid the case where the mask is just "False" else: - self_mask = get_mask( + self_mask = get_mask_from_array( self.data[data_band - 1, :, :] ) # This is to avoid the case where the mask is just "False" - valid_mask = ~self_mask + valid_mask = ~self_mask # Load only mask of valid data from disk if array not loaded else: - valid_mask = self._load_only_mask(bands=data_band) + valid_mask = ~self._load_only_mask(bands=data_band) # Get subsample on valid mask # Build a low memory boolean masked array with invalid values masked to pass to subsampling diff --git a/geoutils/raster/sampling.py b/geoutils/raster/sampling.py index 8cdfa0ef..7fd76f91 100644 --- a/geoutils/raster/sampling.py +++ b/geoutils/raster/sampling.py @@ -7,7 +7,7 @@ import numpy as np from geoutils._typing import MArrayNum, NDArrayNum -from geoutils.raster.array import get_mask +from geoutils.raster.array import get_mask_from_array @overload @@ -68,7 +68,7 @@ def subsample_array( rnd = np.random.RandomState(np.random.MT19937(np.random.SeedSequence(random_state))) # Remove invalid values and flatten array - mask = get_mask(array) # -> need to remove .squeeze in get_mask + mask = get_mask_from_array(array) # -> need to remove .squeeze in get_mask valids = np.argwhere(~mask.flatten()).squeeze() # Get number of points to extract diff --git a/tests/test_raster.py b/tests/test_raster.py index 1a1ed33f..016d5f14 100644 --- a/tests/test_raster.py +++ b/tests/test_raster.py @@ -339,6 +339,24 @@ def test_load(self) -> None: r.filename = None r.load() + @pytest.mark.parametrize("example", [landsat_b4_path, aster_dem_path, landsat_rgb_path]) # type: ignore + def test_load_only_mask(self, example: str) -> None: + """ + Test that loading only mask works properly. + """ + + # Load raster with and without loading + r_loaded = gu.Raster(example, load_data=True) + r_notloaded = gu.Raster(example) + + # Get the mask for the two options + mask_loaded = np.ma.getmaskarray(r_loaded.data) + mask_notloaded = r_notloaded._load_only_mask() + + # Data should not be loaded and masks should be equal + assert not r_notloaded.is_loaded + assert np.array_equal(mask_notloaded, mask_loaded) + @pytest.mark.parametrize("example", [landsat_b4_path, aster_dem_path]) # type: ignore def test_to_rio_dataset(self, example: str): """Test the export to a rasterio dataset""" diff --git a/tests/test_sampling.py b/tests/test_sampling.py index 2fdd47b7..ca8ce47b 100644 --- a/tests/test_sampling.py +++ b/tests/test_sampling.py @@ -53,6 +53,10 @@ def test_subsample(self, array: NDArrayNum) -> None: random_values = gu.raster.subsample_array(array, subsample=1) assert np.all(np.sort(random_values) == array[~array.mask]) + # Check that order is preserved for subsample = 1 (no random sampling, simply returns valid mask) + random_values_2 = gu.raster.subsample_array(array, subsample=1) + assert np.array_equal(random_values, random_values_2) + # Test if subsample < 1 random_values = gu.raster.subsample_array(array, subsample=0.5) assert np.size(random_values) == int(np.count_nonzero(~array.mask) * 0.5) From cb521f24f7ae86dc75caf738b80a903b641aced5 Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Fri, 8 Mar 2024 12:55:23 -0900 Subject: [PATCH 09/13] Fix raster_equal --- doc/source/api.md | 1 + geoutils/raster/raster.py | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/doc/source/api.md b/doc/source/api.md index 9e397e08..76e43e70 100644 --- a/doc/source/api.md +++ b/doc/source/api.md @@ -107,6 +107,7 @@ documentation. Raster.set_mask Raster.set_nodata Raster.get_nanarray + Raster.get_mask Raster.subsample ``` diff --git a/geoutils/raster/raster.py b/geoutils/raster/raster.py index ec02ed6c..3debf9ce 100644 --- a/geoutils/raster/raster.py +++ b/geoutils/raster/raster.py @@ -1057,7 +1057,7 @@ def raster_equal(self, other: RasterType) -> bool: [ np.array_equal(self.data.data, other.data.data, equal_nan=True), # Use getmaskarray to avoid comparing boolean with array when mask=False - np.array_equal(np.ma.getmaskarray(self.data.mask), np.ma.getmaskarray(other.data.mask)), + np.array_equal(np.ma.getmaskarray(self.data), np.ma.getmaskarray(other.data)), self.data.fill_value == other.data.fill_value, self.data.dtype == other.data.dtype, self.transform == other.transform, @@ -3506,7 +3506,8 @@ def to_pointcloud( Optionally, all other bands can also be stored in columns "b1", "b2", etc. For more specific band selection, use Raster.split_bands previous to converting to point cloud. - Optionally, randomly subsample valid pixels of the data band (nodata values are skipped). + Optionally, randomly subsample valid pixels for the data band (nodata values are skipped, but only for the band + that will be used as data column of the point cloud). If 'subsample' is either 1, or is equal to the pixel count, all valid points are returned. If 'subsample' is smaller than 1 (for fractions), or smaller than the pixel count, a random subsample of valid points is returned. @@ -3544,7 +3545,7 @@ def to_pointcloud( if data_band != 1 and data_column_name == "b1": data_column_name = "b" + str(data_band) - # The valid mask is considered only for the data band + # We do 2D subsampling on the data band only, regardless of valid masks on other bands if self.is_loaded: if self.count == 1: self_mask = get_mask_from_array(self.data) # This is to avoid the case where the mask is just "False" From 9a10c65edf866036422abea718998023f2bb1d02 Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Fri, 8 Mar 2024 13:08:06 -0900 Subject: [PATCH 10/13] Add deprecation notice --- geoutils/raster/raster.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/geoutils/raster/raster.py b/geoutils/raster/raster.py index 3debf9ce..7d1ded62 100644 --- a/geoutils/raster/raster.py +++ b/geoutils/raster/raster.py @@ -11,6 +11,7 @@ from contextlib import ExitStack from math import floor from typing import IO, Any, Callable, TypeVar, overload +from packaging.version import Version import affine import geopandas as gpd @@ -48,6 +49,7 @@ from geoutils.raster.array import get_mask_from_array from geoutils.raster.sampling import subsample_array from geoutils.vector import Vector +from geoutils.misc import deprecate # If python38 or above, Literal is builtin. Otherwise, use typing_extensions try: @@ -3441,6 +3443,12 @@ def split_bands(self: RasterType, copy: bool = False, bands: list[int] | int | N return raster_bands + @deprecate(Version("0.3.0"), "Raster.to_points() is deprecated in favor of Raster.to_pointcloud() and " + "will be removed in v0.3.") + def to_points(self, **kwargs): # type: ignore + + self.to_pointcloud(**kwargs) # type: ignore + @overload def to_pointcloud( self, From 640e6ceb653724d3b7c7ca3d321db8cc4ff7ad6a Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Fri, 8 Mar 2024 13:08:29 -0900 Subject: [PATCH 11/13] Linting --- geoutils/raster/raster.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/geoutils/raster/raster.py b/geoutils/raster/raster.py index 7d1ded62..19e10eb4 100644 --- a/geoutils/raster/raster.py +++ b/geoutils/raster/raster.py @@ -11,7 +11,6 @@ from contextlib import ExitStack from math import floor from typing import IO, Any, Callable, TypeVar, overload -from packaging.version import Version import affine import geopandas as gpd @@ -24,6 +23,7 @@ import rasterio.windows from affine import Affine from mpl_toolkits.axes_grid1 import make_axes_locatable +from packaging.version import Version from rasterio.crs import CRS from rasterio.enums import Resampling from rasterio.features import shapes @@ -41,6 +41,7 @@ NDArrayNum, Number, ) +from geoutils.misc import deprecate from geoutils.projtools import ( _get_bounds_projected, _get_footprint_projected, @@ -49,7 +50,6 @@ from geoutils.raster.array import get_mask_from_array from geoutils.raster.sampling import subsample_array from geoutils.vector import Vector -from geoutils.misc import deprecate # If python38 or above, Literal is builtin. Otherwise, use typing_extensions try: @@ -3443,8 +3443,10 @@ def split_bands(self: RasterType, copy: bool = False, bands: list[int] | int | N return raster_bands - @deprecate(Version("0.3.0"), "Raster.to_points() is deprecated in favor of Raster.to_pointcloud() and " - "will be removed in v0.3.") + @deprecate( + Version("0.3.0"), + "Raster.to_points() is deprecated in favor of Raster.to_pointcloud() and " "will be removed in v0.3.", + ) def to_points(self, **kwargs): # type: ignore self.to_pointcloud(**kwargs) # type: ignore From dc35a36b7133f07464b71dad04daa10e7fb716ad Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Tue, 19 Mar 2024 13:28:11 -0800 Subject: [PATCH 12/13] Amaury comments --- examples/handling/interface/topoints.py | 4 +- geoutils/raster/raster.py | 94 ++++++++++++++++--------- geoutils/vector.py | 7 +- tests/test_raster.py | 36 +++++++--- 4 files changed, 91 insertions(+), 50 deletions(-) diff --git a/examples/handling/interface/topoints.py b/examples/handling/interface/topoints.py index aa3f39b2..56e8b071 100644 --- a/examples/handling/interface/topoints.py +++ b/examples/handling/interface/topoints.py @@ -19,7 +19,7 @@ rast.plot(cmap="terrain") # %% -# We convert the raster to points. By default, this returns a vector with columb geometry burned. +# We convert the raster to points. By default, this returns a vector with column geometry burned. pts_rast = rast.to_pointcloud() pts_rast @@ -27,4 +27,4 @@ # %% # We plot the point vector. -pts_rast.ds.plot(column="b1", cmap="terrain", legend=True) +pts_rast.plot(ax="new", column="b1", cmap="terrain", legend=True) diff --git a/geoutils/raster/raster.py b/geoutils/raster/raster.py index 19e10eb4..371d9a30 100644 --- a/geoutils/raster/raster.py +++ b/geoutils/raster/raster.py @@ -10,7 +10,7 @@ from collections import abc from contextlib import ExitStack from math import floor -from typing import IO, Any, Callable, TypeVar, overload +from typing import IO, Any, Callable, TypeVar, overload, Iterable import affine import geopandas as gpd @@ -3456,7 +3456,8 @@ def to_pointcloud( self, data_column_name: str = "b1", data_band: int = 1, - store_auxiliary_bands: bool = False, + auxiliary_data_bands: list[int] | None = None, + auxiliary_column_names: list[str] | None = None, subsample: float | int = 1, *, as_array: Literal[False] = False, @@ -3470,7 +3471,8 @@ def to_pointcloud( self, data_column_name: str = "b1", data_band: int = 1, - store_auxiliary_bands: bool = False, + auxiliary_data_bands: list[int] | None = None, + auxiliary_column_names: list[str] | None = None, subsample: float | int = 1, *, as_array: Literal[True], @@ -3484,7 +3486,8 @@ def to_pointcloud( self, data_column_name: str = "b1", data_band: int = 1, - store_auxiliary_bands: bool = False, + auxiliary_data_bands: list[int] | None = None, + auxiliary_column_names: list[str] | None = None, subsample: float | int = 1, *, as_array: bool = False, @@ -3497,7 +3500,8 @@ def to_pointcloud( self, data_column_name: str = "b1", data_band: int = 1, - store_auxiliary_bands: bool = False, + auxiliary_data_bands: list[int] | None = None, + auxiliary_column_names: list[str] | None = None, subsample: float | int = 1, as_array: bool = False, random_state: np.random.RandomState | int | None = None, @@ -3529,11 +3533,14 @@ def to_pointcloud( * `as_array` == False: A vector with dataframe columns ["b1", "b2", ..., "geometry"], * `as_array` == True: A numpy ndarray of shape (N, 2 + count) with the columns [x, y, b1, b2..]. - :param data_column_name: Name of point cloud data column to use, defaults to "b1". + :param data_column_name: Name to use for point cloud data column, defaults to "bX" where X is the data band + number. :param data_band: (Only for multi-band rasters) Band to use for data column, defaults to first. Band counting starts at 1. - :param store_auxiliary_bands: (Only for multi-band rasters) Whether to save other bands as auxiliary data - columns, defaults to False. + :param auxiliary_data_bands: (Only for multi-band rasters) Whether to save other band numbers as auxiliary data + columns, defaults to none. + :param auxiliary_column_names: (Only for multi-band rasters) Names to use for auxiliary data bands, only if + auxiliary data bands is not none, defaults to "b1", "b2", etc. :param subsample: Subsample size. If > 1, parsed as a count, otherwise a fraction. :param as_array: Return an array instead of a vector. :param random_state: Random state or seed number. @@ -3542,19 +3549,54 @@ def to_pointcloud( :raises ValueError: If the sample count or fraction is poorly formatted. - :returns: A point cloud, or ndarray of the shape (N, 2 + count) where N is the sample count. + :returns: A point cloud, or array of the shape (N, 2 + count) where N is the sample count. """ # Input checks + + # Main data column checks if not isinstance(data_column_name, str): raise ValueError("Data column name must be a string.") - if not isinstance(data_band, int) and (1 < data_band or self.count < data_band): - raise ValueError(f"Data band number must be between 1 and the total number of bands of {self.count}.") + if not (isinstance(data_band, int) and data_band >= 1 and data_band <= self.count): + raise ValueError(f"Data band number must be an integer between 1 and the total number of bands ({self.count}).") # Rename data column if a different band is selected but the name is still default if data_band != 1 and data_column_name == "b1": data_column_name = "b" + str(data_band) + # Auxiliary data columns checks + if auxiliary_column_names is not None and auxiliary_data_bands is None: + raise ValueError("Passing auxiliary column names requires passing auxiliary data band numbers as well.") + if auxiliary_data_bands is not None: + if not (isinstance(auxiliary_data_bands, Iterable) and all(isinstance(b, int) for b in auxiliary_data_bands)): + raise ValueError("Auxiliary data band number must be an iterable containing only integers.") + if any((1 > b or self.count < b) for b in auxiliary_data_bands): + raise ValueError(f"Auxiliary data band numbers must be between 1 and the total number of bands ({self.count}).") + if data_band in auxiliary_data_bands: + raise ValueError(f"Main data band {data_band} should not be listed in auxiliary data bands {auxiliary_data_bands}.") + + # Ensure auxiliary column name is defined if auxiliary data bands is not None + if auxiliary_column_names is not None: + if not (isinstance(auxiliary_column_names, Iterable) and all(isinstance(b, str) for b in auxiliary_column_names)): + raise ValueError("Auxiliary column names must be an iterable containing only strings.") + if not len(auxiliary_column_names) == len(auxiliary_data_bands): + raise ValueError(f"Length of auxiliary column name and data band numbers should be the same, " + f"found {len(auxiliary_column_names)} and {len(auxiliary_data_bands)} respectively.") + + else: + auxiliary_column_names = [f"b{i}" for i in auxiliary_data_bands] + + # Define bigger list with all bands and names + all_bands = [data_band] + auxiliary_data_bands + all_column_names = [data_column_name] + auxiliary_column_names + + else: + all_bands = [data_band] + all_column_names = [data_column_name] + + # Band indexes in the array are band number minus one + all_indexes = [b - 1 for b in all_bands] + # We do 2D subsampling on the data band only, regardless of valid masks on other bands if self.is_loaded: if self.count == 1: @@ -3575,31 +3617,24 @@ def to_pointcloud( # Take a subsample within the valid values indices = subsample_array(array=ma_valid, subsample=subsample, random_state=random_state, return_indices=True) - # Extract the coordinates at subsampled pixels with valid data - # To extract data, we always use "upper left" which rasterio interprets as the exact coordinates of the raster - # Further below we redefine output coordinates based on point interpretation - x_coords, y_coords = (np.array(a) for a in self.ij2xy(indices[0], indices[1], offset="ul")) - # If the Raster is loaded, pick from the data while ignoring the mask if self.is_loaded: if self.count == 1: pixel_data = self.data[indices[0], indices[1]] else: - if store_auxiliary_bands: - pixel_data = self.data[:, indices[0], indices[1]] - else: - pixel_data = self.data[data_band - 1, indices[0], indices[1]] + # TODO: Combining both indexes at once could reduce memory usage? + pixel_data = self.data[all_indexes, :][:, indices[0], indices[1]] # Otherwise use rasterio.sample to load only requested pixels else: - # Sample all bands if storing auxiliary, otherwise the single one - if store_auxiliary_bands: - bands_to_extract = None - else: - bands_to_extract = data_band + # Extract the coordinates at subsampled pixels with valid data + # To extract data, we always use "upper left" which rasterio interprets as the exact raster coordinates + # Further below we redefine output coordinates based on point interpretation + x_coords, y_coords = (np.array(a) for a in self.ij2xy(indices[0], indices[1], offset="ul")) with rio.open(self.filename) as raster: - pixel_data = np.array(list(raster.sample(zip(x_coords, y_coords), indexes=bands_to_extract))).T + # Rasterio uses indexes (starts at 1) + pixel_data = np.array(list(raster.sample(zip(x_coords, y_coords), indexes=all_bands))).T # At this point there should not be any nodata anymore, so we can transform everything to normal array if np.ma.isMaskedArray(pixel_data): @@ -3611,13 +3646,6 @@ def to_pointcloud( # Merge the coordinates and pixel data into a point cloud. points_arr = np.vstack((x_coords_2.reshape(1, -1), y_coords_2.reshape(1, -1), pixel_data)).T - # Define column names - if store_auxiliary_bands: - all_column_names = [f"b{i}" for i in range(1, self.count + 1)] - all_column_names[data_band - 1] = data_column_name - else: - all_column_names = [data_column_name] - if not as_array: points = Vector( gpd.GeoDataFrame( diff --git a/geoutils/vector.py b/geoutils/vector.py index 2ae022d9..dc7cfbb3 100644 --- a/geoutils/vector.py +++ b/geoutils/vector.py @@ -206,12 +206,7 @@ def plot( # Create axes, or get current ones by default (like in matplotlib) if ax is None: - # If no figure exists, get a new axis - if len(plt.get_fignums()) == 0: - ax0 = plt.gca() - # Otherwise, get first axis - else: - ax0 = plt.gcf().axes[0] + ax0 = plt.gca() elif isinstance(ax, str) and ax.lower() == "new": _, ax0 = plt.subplots() elif isinstance(ax, matplotlib.axes.Axes): diff --git a/tests/test_raster.py b/tests/test_raster.py index 016d5f14..041cddf6 100644 --- a/tests/test_raster.py +++ b/tests/test_raster.py @@ -3007,7 +3007,6 @@ def test_to_pointcloud(self) -> None: assert np.array_equal(points.ds["b1"].values, points_arr[:, 2]) # Validate that 25 points were sampled (equating to img1.height * img1.width) with x, y, and band0 values. - assert isinstance(points_arr, np.ndarray) assert points_arr.shape == (25, 3) assert points.ds.shape == (25, 2) # One less column here due to geometry storing X and Y # Check that X, Y and Z arrays are equal to raster array input independently of value order @@ -3029,8 +3028,8 @@ def test_to_pointcloud(self) -> None: img3d = gu.Raster.from_array(img_3d_arr, transform=rio.transform.from_origin(0, 5, 1, 1), crs=4326) # Sample the whole raster (fraction==1) - points = img3d.to_pointcloud(store_auxiliary_bands=True) - points_arr = img3d.to_pointcloud(as_array=True, store_auxiliary_bands=True) + points = img3d.to_pointcloud(auxiliary_data_bands=[2, 3]) + points_arr = img3d.to_pointcloud(as_array=True, auxiliary_data_bands=[2, 3]) # Check equality between both output types assert np.array_equal(points.ds.geometry.x.values, points_arr[:, 0]) @@ -3047,7 +3046,7 @@ def test_to_pointcloud(self) -> None: assert np.array_equal(np.sort(np.asarray(points_arr[:, 4])), np.sort(img_3d_arr[2, :, :].ravel())) # With a subsample - points_arr = img3d.to_pointcloud(as_array=True, subsample=10, store_auxiliary_bands=True) + points_arr = img3d.to_pointcloud(as_array=True, subsample=10, auxiliary_data_bands=[2, 3]) assert points_arr.shape == (10, 5) # Check the values are still good @@ -3090,17 +3089,36 @@ def test_to_pointcloud(self) -> None: assert not img2.is_loaded # Storing auxiliary bands - points_arr = img2.to_pointcloud(subsample=10, as_array=True, store_auxiliary_bands=True) - points = img2.to_pointcloud(subsample=10, store_auxiliary_bands=True) + points_arr = img2.to_pointcloud(subsample=10, as_array=True, auxiliary_data_bands=[2, 3]) + points = img2.to_pointcloud(subsample=10, auxiliary_data_bands=[2, 3]) assert points_arr.shape == (10, 5) assert points.ds.shape == (10, 4) # One less column here due to geometry storing X and Y assert not img2.is_loaded assert np.array_equal(points.ds.columns, ["b1", "b2", "b3", "geometry"]) # Try setting the column name of a specific band while storing all - points = img2.to_pointcloud(subsample=10, data_column_name="yes", data_band=2, store_auxiliary_bands=True) - assert np.array_equal(points.ds.columns, ["b1", "yes", "b3", "geometry"]) - + points = img2.to_pointcloud(subsample=10, data_column_name="yes", data_band=2, auxiliary_data_bands=[1, 3]) + assert np.array_equal(points.ds.columns, ["yes", "b1", "b3", "geometry"]) + + # 5/ Error raising + with pytest.raises(ValueError, match="Data column name must be a string.*"): + img1.to_pointcloud(data_column_name=1) # type: ignore + with pytest.raises(ValueError, match=re.escape("Data band number must be an integer between 1 and the total number of bands (3).")): + img2.to_pointcloud(data_band=4) + with pytest.raises(ValueError, match="Passing auxiliary column names requires passing auxiliary data band numbers as well."): + img2.to_pointcloud(auxiliary_column_names=["a"]) + with pytest.raises(ValueError, match="Auxiliary data band number must be an iterable containing only integers."): + img2.to_pointcloud(auxiliary_data_bands=[1, 2.5]) # type: ignore + img2.to_pointcloud(auxiliary_data_bands="lol") # type: ignore + with pytest.raises(ValueError, match=re.escape("Auxiliary data band numbers must be between 1 and the total number of bands (3).")): + img2.to_pointcloud(auxiliary_data_bands=[0]) + img2.to_pointcloud(auxiliary_data_bands=[4]) + with pytest.raises(ValueError, match=re.escape("Main data band 1 should not be listed in auxiliary data bands [1, 2].")): + img2.to_pointcloud(auxiliary_data_bands=[1, 2]) + with pytest.raises(ValueError, match="Auxiliary column names must be an iterable containing only strings."): + img2.to_pointcloud(auxiliary_data_bands=[2, 3], auxiliary_column_names=["lol", 1]) + with pytest.raises(ValueError, match="Length of auxiliary column name and data band numbers should be the same*"): + img2.to_pointcloud(auxiliary_data_bands=[2, 3], auxiliary_column_names=["lol", "lol2", "lol3"]) class TestMask: # Paths to example data From 5f81f8d8e1a8e12062b551918f5b8d7d6df92c03 Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Tue, 19 Mar 2024 13:28:33 -0800 Subject: [PATCH 13/13] Linting --- geoutils/raster/raster.py | 29 +++++++++++++++++++++-------- tests/test_raster.py | 27 +++++++++++++++++++++------ 2 files changed, 42 insertions(+), 14 deletions(-) diff --git a/geoutils/raster/raster.py b/geoutils/raster/raster.py index 371d9a30..7b75dec3 100644 --- a/geoutils/raster/raster.py +++ b/geoutils/raster/raster.py @@ -10,7 +10,7 @@ from collections import abc from contextlib import ExitStack from math import floor -from typing import IO, Any, Callable, TypeVar, overload, Iterable +from typing import IO, Any, Callable, Iterable, TypeVar, overload import affine import geopandas as gpd @@ -3558,7 +3558,9 @@ def to_pointcloud( if not isinstance(data_column_name, str): raise ValueError("Data column name must be a string.") if not (isinstance(data_band, int) and data_band >= 1 and data_band <= self.count): - raise ValueError(f"Data band number must be an integer between 1 and the total number of bands ({self.count}).") + raise ValueError( + f"Data band number must be an integer between 1 and the total number of bands ({self.count})." + ) # Rename data column if a different band is selected but the name is still default if data_band != 1 and data_column_name == "b1": @@ -3568,20 +3570,31 @@ def to_pointcloud( if auxiliary_column_names is not None and auxiliary_data_bands is None: raise ValueError("Passing auxiliary column names requires passing auxiliary data band numbers as well.") if auxiliary_data_bands is not None: - if not (isinstance(auxiliary_data_bands, Iterable) and all(isinstance(b, int) for b in auxiliary_data_bands)): + if not ( + isinstance(auxiliary_data_bands, Iterable) and all(isinstance(b, int) for b in auxiliary_data_bands) + ): raise ValueError("Auxiliary data band number must be an iterable containing only integers.") if any((1 > b or self.count < b) for b in auxiliary_data_bands): - raise ValueError(f"Auxiliary data band numbers must be between 1 and the total number of bands ({self.count}).") + raise ValueError( + f"Auxiliary data band numbers must be between 1 and the total number of bands ({self.count})." + ) if data_band in auxiliary_data_bands: - raise ValueError(f"Main data band {data_band} should not be listed in auxiliary data bands {auxiliary_data_bands}.") + raise ValueError( + f"Main data band {data_band} should not be listed in auxiliary data bands {auxiliary_data_bands}." + ) # Ensure auxiliary column name is defined if auxiliary data bands is not None if auxiliary_column_names is not None: - if not (isinstance(auxiliary_column_names, Iterable) and all(isinstance(b, str) for b in auxiliary_column_names)): + if not ( + isinstance(auxiliary_column_names, Iterable) + and all(isinstance(b, str) for b in auxiliary_column_names) + ): raise ValueError("Auxiliary column names must be an iterable containing only strings.") if not len(auxiliary_column_names) == len(auxiliary_data_bands): - raise ValueError(f"Length of auxiliary column name and data band numbers should be the same, " - f"found {len(auxiliary_column_names)} and {len(auxiliary_data_bands)} respectively.") + raise ValueError( + f"Length of auxiliary column name and data band numbers should be the same, " + f"found {len(auxiliary_column_names)} and {len(auxiliary_data_bands)} respectively." + ) else: auxiliary_column_names = [f"b{i}" for i in auxiliary_data_bands] diff --git a/tests/test_raster.py b/tests/test_raster.py index 041cddf6..9ab575ed 100644 --- a/tests/test_raster.py +++ b/tests/test_raster.py @@ -3103,23 +3103,38 @@ def test_to_pointcloud(self) -> None: # 5/ Error raising with pytest.raises(ValueError, match="Data column name must be a string.*"): img1.to_pointcloud(data_column_name=1) # type: ignore - with pytest.raises(ValueError, match=re.escape("Data band number must be an integer between 1 and the total number of bands (3).")): + with pytest.raises( + ValueError, + match=re.escape("Data band number must be an integer between 1 and the total number of bands (3)."), + ): img2.to_pointcloud(data_band=4) - with pytest.raises(ValueError, match="Passing auxiliary column names requires passing auxiliary data band numbers as well."): + with pytest.raises( + ValueError, match="Passing auxiliary column names requires passing auxiliary data band numbers as well." + ): img2.to_pointcloud(auxiliary_column_names=["a"]) - with pytest.raises(ValueError, match="Auxiliary data band number must be an iterable containing only integers."): + with pytest.raises( + ValueError, match="Auxiliary data band number must be an iterable containing only integers." + ): img2.to_pointcloud(auxiliary_data_bands=[1, 2.5]) # type: ignore img2.to_pointcloud(auxiliary_data_bands="lol") # type: ignore - with pytest.raises(ValueError, match=re.escape("Auxiliary data band numbers must be between 1 and the total number of bands (3).")): + with pytest.raises( + ValueError, + match=re.escape("Auxiliary data band numbers must be between 1 and the total number of bands (3)."), + ): img2.to_pointcloud(auxiliary_data_bands=[0]) img2.to_pointcloud(auxiliary_data_bands=[4]) - with pytest.raises(ValueError, match=re.escape("Main data band 1 should not be listed in auxiliary data bands [1, 2].")): + with pytest.raises( + ValueError, match=re.escape("Main data band 1 should not be listed in auxiliary data bands [1, 2].") + ): img2.to_pointcloud(auxiliary_data_bands=[1, 2]) with pytest.raises(ValueError, match="Auxiliary column names must be an iterable containing only strings."): img2.to_pointcloud(auxiliary_data_bands=[2, 3], auxiliary_column_names=["lol", 1]) - with pytest.raises(ValueError, match="Length of auxiliary column name and data band numbers should be the same*"): + with pytest.raises( + ValueError, match="Length of auxiliary column name and data band numbers should be the same*" + ): img2.to_pointcloud(auxiliary_data_bands=[2, 3], auxiliary_column_names=["lol", "lol2", "lol3"]) + class TestMask: # Paths to example data landsat_b4_path = examples.get_path("everest_landsat_b4")