From de74df2b4db7532c413207337d7a3dedea9d3aeb Mon Sep 17 00:00:00 2001 From: Michael Johns Date: Tue, 23 Jan 2024 14:55:15 -0500 Subject: [PATCH] Fixed a broken python test from refactor. Live tested rasterio udf examples (last 2) and updated code and comments. Added additional json_spec examples to rst_mapalgebra. --- docs/source/api/raster-functions.rst | 6 ++ docs/source/api/rasterio-udfs.rst | 83 +++++++++++++++------------- python/test/test_vector_functions.py | 4 +- 3 files changed, 52 insertions(+), 41 deletions(-) diff --git a/docs/source/api/raster-functions.rst b/docs/source/api/raster-functions.rst index c982e929a..87af5c46e 100644 --- a/docs/source/api/raster-functions.rst +++ b/docs/source/api/raster-functions.rst @@ -908,6 +908,12 @@ rst_mapalgebra arrays (such as +, -, *, and /) along with logical operators (such as >, <, =). For this distributed implementation, all rasters must have the same dimensions and no projection checking is performed. + Here are examples of the json_spec': (1) shows default indexing, (2) shows reusing an index, + and (3) shows band indexing. + (1) '{"calc": "A+B/C"}' + (2) '{"calc": "A+B/C", "A_index": 0, "B_index": 1, "C_index": 1}' + (3) '{"calc": "A+B/C", "A_index": 0, "B_index": 1, "C_index": 2, "A_band": 1, "B_band": 1, "C_band": 1}' + :param tile: A column containing the raster tile. :type tile: Column (RasterTileType) :param json_spec: A column containing the map algebra operation specification. diff --git a/docs/source/api/rasterio-udfs.rst b/docs/source/api/rasterio-udfs.rst index 80561a89a..339223c58 100644 --- a/docs/source/api/rasterio-udfs.rst +++ b/docs/source/api/rasterio-udfs.rst @@ -236,7 +236,11 @@ Firstly we will create a spark DataFrame from a directory of raster files. Next we will define a function that will write a given raster file to disk. A "gotcha" to keep in mind is that you do not want to have a file context manager open when you go to write out its context as the context manager will not yet -have been flushed. +have been flushed. Another "gotcha" might be that the raster dataset does not have CRS included; if this arises, we +recommend adjusting the function to specify the CRS and set it on the dst variable, more at +`rasterio.crs `_. We would also point out that notional +"file_id" param can be constructed as a repeatable name from other field(s) in your dataframe / table or be random, +depending on your needs. .. code-block:: python @@ -253,31 +257,30 @@ have been flushed. # - [1] populate the initial profile # # profile is needed in order to georeference the image - profile = None - with MemoryFile(BytesIO(raster)) as memfile: - with memfile.open() as dataset: - profile = dataset.profile - - # - [2] get the correct extension - extensions_map = rasterio.drivers.raster_driver_extensions() - driver_map = {v: k for k, v in extensions_map.items()} - extension = driver_map[driver] #e.g. GTiff - file_name = f"{file_id}.{extension}" - - # - [3] write local raster - # - this is showing a single band [1] - # being written with tempfile.TemporaryDirectory() as tmp_dir: - tmp_path = f"{tmp_dir}/{file_name}" + profile = None + data_arr = None + with MemoryFile(BytesIO(raster)) as memfile: + with memfile.open() as dataset: + profile = dataset.profile + data_arr = dataset.read() # here you can update profile using .update method # example https://rasterio.readthedocs.io/en/latest/topics/writing.html + # - [2] get the correct extension + extensions_map = rasterio.drivers.raster_driver_extensions() + driver_map = {v: k for k, v in extensions_map.items()} + extension = driver_map[driver] #e.g. GTiff + file_name = f"{file_id}.{extension}" + # - [3] write local raster + # - this is showing a single band [1] + # being written + tmp_path = f"{tmp_dir}/{file_name}" with rasterio.open( - tmp_path, - "w", - **profile + tmp_path, + "w", + **profile ) as dst: - dst.write(raster,1) # <- adjust as needed - + dst.write(data_arr) # <- adjust as needed # - [4] copy to fuse path Path(fuse_dir).mkdir(parents=True, exist_ok=True) fuse_path = f"{fuse_dir}/{file_name}" @@ -294,25 +297,27 @@ Finally we will apply the function to the DataFrame. "tile.raster", lit("GTiff").alias("driver"), "uuid", - lit("dbfs:/path/to/output/dir").alias("fuse_dir") + lit("/dbfs/path/to/output/dir").alias("fuse_dir") ) ).display() +----------------------------------------------+ | write_raster(raster, driver, uuid, fuse_dir) | +----------------------------------------------+ - | dbfs:/path/to/output/dir/1234.tif | - | dbfs:/path/to/output/dir/4545.tif | - | dbfs:/path/to/output/dir/3215.tif | + | /dbfs/path/to/output/dir/1234.tif | + | /dbfs/path/to/output/dir/4545.tif | + | /dbfs/path/to/output/dir/3215.tif | | ... | +----------------------------------------------+ Sometimes you don't need to be quite as fancy. Consider when you simply want to specify to write out raster contents, -assuming you specify the extension in the file_id. This is just writing binary column to file, nothing further. +assuming you specify the extension in the file_name. This is just writing binary column to file, nothing further. Again, +we use a notional "uuid" column as part of "file_name" param, which would have the same considerations as mentioned +above. .. code-block:: python @udf("string") - def write_contents(raster, file_name, fuse_dir): + def write_binary(raster_bin, file_name, fuse_dir): from pathlib import Path import os import shutil @@ -326,7 +331,7 @@ assuming you specify the extension in the file_id. This is just writing binary c # - write within the tmp_dir context # - flush the writer before copy tmp_file = open(tmp_path, "wb") - tmp_file.write(raster) # <- write entire binary content + tmp_file.write(raster_bin) # <- write entire binary content tmp_file.close() # - copy local to fuse shutil.copyfile(tmp_path, fuse_path) @@ -337,17 +342,17 @@ Finally we will apply the function to the DataFrame. .. code-block:: python df.select( - write_contents( + write_binary( "tile.raster", - F.concat("uuid", F.lit(".tif").alias("file_name"), - lit("dbfs:/path/to/output/dir").alias("fuse_dir") + F.concat("uuid", F.lit(".tif")).alias("file_name"), + F.lit("/dbfs/path/to/output/dir").alias("fuse_dir") ) ).display() - +----------------------------------------+ - | write_tif(raster, file_name, fuse_dir) | - +----------------------------------------+ - | dbfs:/path/to/output/dir/1234.tif | - | dbfs:/path/to/output/dir/4545.tif | - | dbfs:/path/to/output/dir/3215.tif | - | ... | - +----------------------------------------+ + +-------------------------------------------+ + | write_binary(raster, file_name, fuse_dir) | + +-------------------------------------------+ + | /dbfs/path/to/output/dir/1234.tif | + | /dbfs/path/to/output/dir/4545.tif | + | /dbfs/path/to/output/dir/3215.tif | + | ... | + +-------------------------------------------+ diff --git a/python/test/test_vector_functions.py b/python/test/test_vector_functions.py index 4db8acd1f..3a127327a 100644 --- a/python/test/test_vector_functions.py +++ b/python/test/test_vector_functions.py @@ -169,10 +169,10 @@ def test_aggregation_functions(self): .join(right_df, col("left_index.index_id") == col("right_index.index_id")) .groupBy("left_id", "right_id") .agg( - api.st_intersects_aggregate( + api.st_intersects_agg( col("left_index"), col("right_index") ).alias("agg_intersects"), - api.st_intersection_aggregate( + api.st_intersection_agg( col("left_index"), col("right_index") ).alias("agg_intersection"), first("left_geom").alias("left_geom"),