Skip to content

Commit

Permalink
Add GeoDataFrame support to Pipeline (#173)
Browse files Browse the repository at this point in the history
* Added GeoDataFrame support to pipeline.py

Added basic GeoPandas GeoDataFrame support. If GeoPandas is installed, users can read an array from an executed pipeline as a GeoDataFrame, with optional arguments selecting XY vs. XYZ points and the CRS. DataFrames passed to the Pipeline constructor will drop the "geometry" column if present.

* Update test_pipeline.py

Added test for GeoDataFrames

* add geopandas to environment reqs

---------

Co-authored-by: Howard Butler <[email protected]>
  • Loading branch information
jf-geo and hobu authored Oct 4, 2024
1 parent b83d78d commit f858350
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 3 deletions.
2 changes: 1 addition & 1 deletion .github/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ dependencies:
- pdal
- pytest
- meshio
- pandas
- geopandas
22 changes: 20 additions & 2 deletions src/pdal/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@
except ModuleNotFoundError: # pragma: no cover
DataFrame = None

try:
from geopandas import GeoDataFrame, points_from_xy
except ModuleNotFoundError: # pragma: no cover
GeoDataFrame = points_from_xy = None

from . import drivers, libpdalpython

LogLevelToPDAL = {
Expand Down Expand Up @@ -45,7 +50,7 @@ def __init__(

# Convert our data frames to Numpy Structured Arrays
if dataframes:
arrays = [df.to_records() for df in dataframes]
arrays = [df.to_records() if not "geometry" in df.columns else df.drop(columns=["geometry"]).to_records() for df in dataframes]

super().__init__()
self._stages: List[Stage] = []
Expand Down Expand Up @@ -124,13 +129,26 @@ def get_meshio(self, idx: int) -> Optional[Mesh]:
[("triangle", np.stack((mesh["A"], mesh["B"], mesh["C"]), 1))],
)


def get_dataframe(self, idx: int) -> Optional[DataFrame]:
    """Return the executed pipeline's result array at *idx* as a pandas DataFrame.

    Raises RuntimeError if pandas could not be imported.
    """
    if DataFrame is not None:
        return DataFrame(self.arrays[idx])
    raise RuntimeError("Pandas support requires Pandas to be installed")

def get_geodataframe(
    self, idx: int, xyz: bool = False, crs: Any = None
) -> Optional[GeoDataFrame]:
    """Return the executed pipeline's result array at *idx* as a GeoDataFrame.

    Parameters
    ----------
    idx : int
        Index into ``self.arrays`` of the array to convert.
    xyz : bool, default False
        When True, build 3D (XYZ) point geometries; otherwise 2D (XY).
    crs : Any, default None
        Coordinate reference system passed through to the GeoDataFrame
        constructor (e.g. ``"EPSG:4326"``).

    Raises
    ------
    RuntimeError
        If GeoPandas could not be imported.
    """
    if GeoDataFrame is None:
        raise RuntimeError("GeoPandas support requires GeoPandas to be installed")
    df = DataFrame(self.arrays[idx])
    # Pick the dimension columns used for the point geometries.
    coords = [df["X"], df["Y"], df["Z"]] if xyz else [df["X"], df["Y"]]
    # NOTE: the previous implementation re-bound df/coords/geometry to None
    # before returning; locals are released on return anyway, so that dead
    # code has been removed.
    return GeoDataFrame(df, geometry=points_from_xy(*coords), crs=crs)

def _get_json(self) -> str:
return self.toJSON()

Expand Down
59 changes: 59 additions & 0 deletions test/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -541,6 +541,65 @@ def test_load(self):
assert data["Intensity"].sum() == 57684


class TestGeoDataFrame:
    """Exercise GeoDataFrame output from, and input to, a Pipeline."""

    @pytest.mark.skipif(
        not pdal.pipeline.GeoDataFrame,
        reason="geopandas is not available",
    )
    def test_fetch(self):
        # Execute a simple reader pipeline and pull its single array.
        pipeline = pdal.Reader(os.path.join(DATADIRECTORY, "autzen-utm.las")).pipeline()
        pipeline.execute()
        array = pipeline.arrays[0]
        n_points = array.shape[0]
        n_dims = len(array.dtype)

        plain = pipeline.get_geodataframe(0)
        with_z = pipeline.get_geodataframe(0, xyz=True)
        with_crs = pipeline.get_geodataframe(0, crs="EPSG:4326")

        assert len(plain) == n_points
        # One extra column beyond the point dimensions: the geometry column.
        assert len(plain.columns) == n_dims + 1
        assert isinstance(plain, pdal.pipeline.GeoDataFrame)
        assert plain.geometry.is_valid.all()
        assert not plain.geometry.is_empty.any()
        assert plain.crs is None
        # Default geometries are 2D; xyz=True carries Z through.
        assert plain.geometry.z.isna().all()
        assert not with_z.geometry.z.isna().any()
        assert with_crs.crs.srs == "EPSG:4326"

    @pytest.mark.skipif(
        not pdal.pipeline.GeoDataFrame,
        reason="geopandas is not available",
    )
    def test_load(self):
        # Build a GeoDataFrame from an executed reader pipeline's array.
        pipeline = pdal.Reader(os.path.join(DATADIRECTORY, "autzen-utm.las")).pipeline()
        pipeline.execute()
        points = pipeline.arrays[0]
        gdf = pdal.pipeline.GeoDataFrame(
            points,
            geometry=pdal.pipeline.points_from_xy(
                points["X"], points["Y"], points["Z"]
            ),
        )
        filter_intensity = """{
            "pipeline":[
                {
                    "type":"filters.range",
                    "limits":"Intensity[100:300)"
                }
            ]
        }"""
        # Feed three copies of the GeoDataFrame through the range filter.
        filtered = pdal.Pipeline(filter_intensity, dataframes=[gdf, gdf, gdf])
        filtered.execute()
        outputs = filtered.arrays
        assert len(outputs) == 3

        # Each copy must be filtered identically: same point count and the
        # same post-filter Intensity sum.
        for out in outputs:
            assert len(out) == 387
            assert out["Intensity"].sum() == 57684


class TestPipelineIterator:
@pytest.mark.parametrize("filename", ["sort.json", "sort.py"])
def test_non_streamable(self, filename):
Expand Down

0 comments on commit f858350

Please sign in to comment.