From 92303f76e949529f528d1ea373ad30323154ff42 Mon Sep 17 00:00:00 2001 From: Josh Cunningham Date: Thu, 22 Aug 2024 13:38:28 -0500 Subject: [PATCH 1/4] remove geopandas and fiona --- .github/workflows/ubuntu-latest.yml | 2 +- readme.md | 3 +- requirements.txt | 1 - src/troute-network/pyproject.toml | 2 - src/troute-network/tests/make_test_network.py | 10 +- .../troute/HYFeaturesNetwork.py | 98 ++++++++++++------- test/LowerColorado_TX_v4/run_BMI_Coastal.py | 4 +- test/LowerColorado_TX_v4/run_with_BMI.py | 2 - 8 files changed, 74 insertions(+), 48 deletions(-) diff --git a/.github/workflows/ubuntu-latest.yml b/.github/workflows/ubuntu-latest.yml index 8746a6637..c38f46157 100644 --- a/.github/workflows/ubuntu-latest.yml +++ b/.github/workflows/ubuntu-latest.yml @@ -40,7 +40,7 @@ jobs: sudo apt-get install libstdc++-10-dev libgfortran-10-dev glibc-source openmpi-bin openmpi-common libopenmpi-dev libopenmpi3 libgtk-3-bin libgtk-3-common libgtk-3-dev -y sudo apt-get install netcdf-bin libnetcdf-dev libnetcdff-dev libnetcdf-c++4 libnetcdf-c++4-dev -y python -m pip install --upgrade pip - pip3 install wheel dask pyproj fiona bmipy opencv-contrib-python-headless + pip3 install wheel dask pyproj bmipy opencv-contrib-python-headless if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Install t-route diff --git a/readme.md b/readme.md index a0bc3fe45..779b4d84c 100644 --- a/readme.md +++ b/readme.md @@ -66,7 +66,6 @@ joblib toolz Cython pyyaml -geopandas pyarrow deprecated ``` @@ -84,7 +83,7 @@ To get a sense of the operation of the routing scheme, follow this sequence of c ```shell # install required python modules -pip3 install numpy pandas xarray netcdf4 joblib toolz pyyaml Cython>3,!=3.0.4 geopandas pyarrow deprecated wheel +pip3 install numpy pandas xarray netcdf4 joblib toolz pyyaml Cython>3,!=3.0.4 pyarrow deprecated wheel # clone t-toute git clone --progress --single-branch --branch master http://github.com/NOAA-OWP/t-route.git diff --git a/requirements.txt b/requirements.txt index 6963bc407..266163860 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,3 @@ toolz joblib deprecated pyarrow<12.0 -geopandas diff --git a/src/troute-network/pyproject.toml b/src/troute-network/pyproject.toml index 77ad761c0..724c7ae9c 100644 --- a/src/troute-network/pyproject.toml +++ b/src/troute-network/pyproject.toml @@ -7,8 +7,6 @@ name = "troute.network" version = "0.0.0" dependencies = [ "deprecated", - "geopandas", - "fiona", "joblib", "netcdf4", "numpy", diff --git a/src/troute-network/tests/make_test_network.py b/src/troute-network/tests/make_test_network.py index 4a73a1309..fbe05a0e4 100644 --- a/src/troute-network/tests/make_test_network.py +++ b/src/troute-network/tests/make_test_network.py @@ -1,5 +1,5 @@ -import geopandas as gpd import pandas as pd +import sqlite3 from pathlib import Path import sys @@ -45,9 +45,11 @@ def make_network_from_segment(flowpaths, edges, attributes, depth, segment): sub_edges.drop('geometry', axis=1).to_json("flowpath_edge_list.json", orient='records', indent=2) def make_network_from_geopkg(file_path, depth, segment=None): - flowpaths = gpd.read_file(file_path, layer="flowpaths") - attributes = gpd.read_file(file_path, layer="flowpath_attributes") - edges = gpd.read_file(file_path, layer="flowpath_edge_list") + with sqlite3.connect(file_path) as conn: + flowpaths = pd.read_sql_query("SELECT * FROM flowpaths", file_path) + attributes = pd.read_sql_query("SELECT * FROM flowpath_attributes", file_path) + edges = pd.read_sql_query("SELECT * FROM flowpath_edge_list", file_path) + if segment is None: segment = flowpaths[flowpaths['toid'].str.startswith('tnex')].iloc[0]['id'] make_network_from_segment(flowpaths, edges, attributes, depth, segment) diff --git a/src/troute-network/troute/HYFeaturesNetwork.py b/src/troute-network/troute/HYFeaturesNetwork.py index fb8a19225..f1dd72cc2 100644 --- a/src/troute-network/troute/HYFeaturesNetwork.py +++ b/src/troute-network/troute/HYFeaturesNetwork.py @@ -1,7 +1,6 @@ from .AbstractNetwork import AbstractNetwork import pandas as pd import numpy as np -import geopandas as gpd import time import json from pathlib import Path @@ -13,7 +12,7 @@ from datetime import datetime from pprint import pformat import os -import fiona +import sqlite3 import troute.nhd_io as nhd_io #FIXME from troute.nhd_network import reverse_dict, extract_connections, reverse_network, reachable from .rfc_lake_gage_crosswalk import get_rfc_lake_gage_crosswalk, get_great_lakes_climatology @@ -31,12 +30,17 @@ def find_layer_name(layers, pattern): return None def read_geopkg(file_path, compute_parameters, waterbody_parameters, cpu_pool): - # Retrieve available layers from the GeoPackage - available_layers = fiona.listlayers(file_path) + # Retrieve available layers from the GeoPackage + with sqlite3.connect(file_path) as conn: + # gpkg_contents, gpkg_ogr_contents, and sqlite_sequence all contain the layer names + result = conn.execute("SELECT table_name FROM gpkg_contents;").fetchall() + # fetchall returns a list tuples + available_layers = [r[0] for r in result] # patterns for the layers we want to find layer_patterns = { - 'flowpaths': r'flow[-_]?paths?|flow[-_]?lines?', + # without $ flowpaths would match also flowpath_attributes + 'flowpaths': r'flow[-_]?paths?$|flow[-_]?lines?$', 'flowpath_attributes': r'flow[-_]?path[-_]?attributes?|flow[-_]?line[-_]?attributes?', 'lakes': r'lakes?', 'nexus': r'nexus?', @@ -67,13 +71,16 @@ def read_geopkg(file_path, compute_parameters, waterbody_parameters, cpu_pool): # Function that read a layer from the geopackage def read_layer(layer_name): - if layer_name: - try: - return gpd.read_file(file_path, layer=layer_name) - except Exception as e: - print(f"Error reading {layer_name}: {e}") - return pd.DataFrame() - return pd.DataFrame() + if not layer_name: + return pd.DataFrame() + try: + with sqlite3.connect(file_path) as conn: + # user sql injection prevented find_layer_name function + return pd.read_sql_query(f"SELECT * FROM {layer_name}", conn) + except Exception as e: + print(f"Error reading layer {layer_name} from {file_path}: {e}") + return pd.DataFrame() + # Retrieve geopackage information using matched layer names if cpu_pool > 1: @@ -83,26 +90,36 @@ def read_layer(layer_name): table_dict = {layers_to_read[i]: gpkg_list[i] for i in range(len(layers_to_read))} else: table_dict = {layer: read_layer(matched_layers[layer]) for layer in layers_to_read} - - # Handle different key column names between flowpaths and flowpath_attributes - flowpaths_df = table_dict.get('flowpaths', pd.DataFrame()) - flowpath_attributes_df = table_dict.get('flowpath_attributes', pd.DataFrame()) - - # Check if 'link' column exists and rename it to 'id' - if 'link' in flowpath_attributes_df.columns: - flowpath_attributes_df.rename(columns={'link': 'id'}, inplace=True) - - # Merge flowpaths and flowpath_attributes - flowpaths = pd.merge( - flowpaths_df, - flowpath_attributes_df, - on='id', - how='inner' - ) lakes = table_dict.get('lakes', pd.DataFrame()) network = table_dict.get('network', pd.DataFrame()) nexus = table_dict.get('nexus', pd.DataFrame()) + + # Handle different key column names between flowpaths and flowpath_attributes + flowpaths_df = table_dict.get('flowpaths', None) + flowpath_attributes_df = table_dict.get('flowpath_attributes', None) + + if flowpath_attributes_df is not None: + # Check if 'link' column exists and rename it to 'id' + if 'link' in flowpath_attributes_df.columns: + flowpath_attributes_df.rename(columns={'link': 'id'}, inplace=True) + + if flowpaths_df is not None and flowpath_attributes_df is not None: + # Merge flowpaths and flowpath_attributes + flowpaths = pd.merge( + flowpaths_df, + flowpath_attributes_df, + on='id', + how='inner' + ) + elif flowpaths_df is not None: + flowpaths = flowpaths_df + elif flowpath_attributes_df is not None: + flowpaths = flowpath_attributes_df + else: + raise ValueError("No flowpaths or flowpath_attributes found in the geopackage") + + return flowpaths, lakes, network, nexus @@ -126,8 +143,18 @@ def read_json(file_path, edge_list): return df_main def read_geojson(file_path): - flowpaths = gpd.read_file(file_path) - return flowpaths + with open(file_path) as f: + data = json.load(f) + data = data['features'] + df = pd.json_normalize(data,max_level=1) + df.columns = df.columns.str.replace('properties.','') + df = df.drop(columns=['type']) + # Geometry seems to be unused or dropped, in case it is needed: + # geometry type e.g. MULTIPOLYGON, is stored in geometry.type + # and the coordinates are stored in geometry.coordinates + # crs stored in data['crs'] e.g. + # data['crs'] = { "type": "name", "properties": { "name": "urn:ogc:def:crs:EPSG::5070" } } + return df def numeric_id(flowpath): id = flowpath['key'].split('-')[-1] @@ -138,6 +165,7 @@ def numeric_id(flowpath): def read_ngen_waterbody_df(parm_file, lake_index_field="wb-id", lake_id_mask=None): """ + FIXME FUNCTION NEVER CALLED Reads .gpkg or lake.json file and prepares a dataframe, filtered to the relevant reservoirs, to provide the parameters for level-pool reservoir computation. @@ -145,8 +173,10 @@ def read_ngen_waterbody_df(parm_file, lake_index_field="wb-id", lake_id_mask=Non def node_key_func(x): return int( x.split('-')[-1] ) if Path(parm_file).suffix=='.gpkg': - df = gpd.read_file(parm_file, layer='lakes') - + # This can be made more efficient by reading only the necessary columns + # but I don't have a reference for what columns remain after dropping + with sqlite3.connect(parm_file) as conn: + df = pd.read_sql_query('SELECT * from lakes', conn) df = ( df.drop(['id','toid','hl_id','hl_reference','hl_uri','geometry'], axis=1) .rename(columns={'hl_link': 'lake_id'}) @@ -164,6 +194,7 @@ def node_key_func(x): def read_ngen_waterbody_type_df(parm_file, lake_index_field="wb-id", lake_id_mask=None): """ + FIXME FUNCTION NEVER CALLED """ #FIXME: this function is likely not correct. Unclear how we will get # reservoir type from the gpkg files. Information should be in 'crosswalk' @@ -173,7 +204,8 @@ def node_key_func(x): return int( x.split('-')[-1] ) if Path(parm_file).suffix=='.gpkg': - df = gpd.read_file(parm_file, layer="crosswalk").set_index('id') + with sqlite3.connect(parm_file) as conn: + df = pd.read_sql_query('SELECT * FROM crosswalk', conn) elif Path(parm_file).suffix=='.json': df = pd.read_json(parm_file, orient="index") diff --git a/test/LowerColorado_TX_v4/run_BMI_Coastal.py b/test/LowerColorado_TX_v4/run_BMI_Coastal.py index 724108cd0..345e89526 100644 --- a/test/LowerColorado_TX_v4/run_BMI_Coastal.py +++ b/test/LowerColorado_TX_v4/run_BMI_Coastal.py @@ -3,8 +3,6 @@ import os import numpy as np import pandas as pd -import geopandas as gpd -import pickle from datetime import datetime, timedelta import time @@ -20,7 +18,7 @@ #import troute_model from troute.HYFeaturesNetwork import HYFeaturesNetwork -from troute.AbstractNetwork import * +from troute.AbstractNetwork import read_coastal_output import bmi_df2array as df2a diff --git a/test/LowerColorado_TX_v4/run_with_BMI.py b/test/LowerColorado_TX_v4/run_with_BMI.py index bf8702620..d19d9524d 100644 --- a/test/LowerColorado_TX_v4/run_with_BMI.py +++ b/test/LowerColorado_TX_v4/run_with_BMI.py @@ -3,8 +3,6 @@ import os import numpy as np import pandas as pd -import geopandas as gpd -import pickle from datetime import datetime, timedelta import time From 3d5f18ef22478639b2d7ec870786e47cd7a1e92d Mon Sep 17 00:00:00 2001 From: Josh Cunningham Date: Fri, 23 Aug 2024 21:41:33 +0000 Subject: [PATCH 2/4] Get lat,lon,crs, when possible from gpkg --- .gitignore | 1 + src/troute-network/troute/AbstractNetwork.py | 8 +----- .../troute/HYFeaturesNetwork.py | 26 ++++++++++++++++--- 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index d5a0a0d03..9db41e308 100644 --- a/.gitignore +++ b/.gitignore @@ -53,6 +53,7 @@ __pycache__/ .env .python-version .ipynb_checkpoints/ +build/ # pyenv # ######### diff --git a/src/troute-network/troute/AbstractNetwork.py b/src/troute-network/troute/AbstractNetwork.py index 830be578e..9d547377f 100644 --- a/src/troute-network/troute/AbstractNetwork.py +++ b/src/troute-network/troute/AbstractNetwork.py @@ -926,13 +926,7 @@ def filter_diffusive_nexus_pts(self,): diff_tw_ids = ['nex-' + str(s) for s in diff_tw_ids] nexus_latlon = nexus_latlon[nexus_latlon['id'].isin(diff_tw_ids)] nexus_latlon['id'] = nexus_latlon['id'].str.split('-',expand=True).loc[:,1].astype(float).astype(int) - lat_lon_crs = nexus_latlon[['id','geometry']] - lat_lon_crs = lat_lon_crs.to_crs(crs=4326) - lat_lon_crs['lon'] = lat_lon_crs.geometry.x - lat_lon_crs['lat'] = lat_lon_crs.geometry.y - lat_lon_crs['crs'] = str(lat_lon_crs.crs) - lat_lon_crs = lat_lon_crs[['lon','lat','crs']] - self._nexus_latlon = nexus_latlon[['id']].join(lat_lon_crs) + self._nexus_latlon = nexus_latlon[['id','lon','lat', 'crs']] def get_timesteps_from_nex(nexus_files): # Return a list of output files diff --git a/src/troute-network/troute/HYFeaturesNetwork.py b/src/troute-network/troute/HYFeaturesNetwork.py index f1dd72cc2..6fd9c98e0 100644 --- a/src/troute-network/troute/HYFeaturesNetwork.py +++ b/src/troute-network/troute/HYFeaturesNetwork.py @@ -28,7 +28,7 @@ def find_layer_name(layers, pattern): if re.search(pattern, layer, re.IGNORECASE): return layer return None - + def read_geopkg(file_path, compute_parameters, waterbody_parameters, cpu_pool): # Retrieve available layers from the GeoPackage with sqlite3.connect(file_path) as conn: @@ -75,8 +75,23 @@ def read_layer(layer_name): return pd.DataFrame() try: with sqlite3.connect(file_path) as conn: - # user sql injection prevented find_layer_name function - return pd.read_sql_query(f"SELECT * FROM {layer_name}", conn) + has_spatial_metadata = False + geometry_columns = conn.execute(f"SELECT c.column_name,g.definition FROM gpkg_geometry_columns AS c JOIN gpkg_spatial_ref_sys AS g ON c.srs_id = g.srs_id WHERE c.table_name = '{layer_name}'").fetchall() + if len(geometry_columns) > 0: + has_spatial_metadata = True + geom_column = geometry_columns[0][0] + crs = geometry_columns[0][1] + if has_spatial_metadata: + sql_query = f"""SELECT d.*, + (r.minx + r.maxx) / 2.0 AS lon, + (r.miny + r.maxy) / 2.0 AS lat + FROM {layer_name} AS d + JOIN rtree_{layer_name}_{geom_column} AS r ON d.fid = r.id""" + df = pd.read_sql_query(sql_query, conn) + df['crs'] = crs + return df + else: + return pd.read_sql_query(f"SELECT * FROM {layer_name}", conn) except Exception as e: print(f"Error reading layer {layer_name} from {file_path}: {e}") return pd.DataFrame() @@ -119,7 +134,10 @@ def read_layer(layer_name): else: raise ValueError("No flowpaths or flowpath_attributes found in the geopackage") - + # if lat or lon or crs is missing from nexus, call a function to add them + if not nexus.empty: + if 'lat' not in nexus.columns or 'lon' not in nexus.columns or 'crs' not in nexus.columns: + nexus = add_lat_lon_crs(file_path,nexus) return flowpaths, lakes, network, nexus From d325c36307bb5ad7afddbf555853d06c188d2e17 Mon Sep 17 00:00:00 2001 From: Josh Cunningham Date: Fri, 23 Aug 2024 21:55:35 +0000 Subject: [PATCH 3/4] improves sql formatting and removes dead code --- src/troute-network/troute/HYFeaturesNetwork.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/troute-network/troute/HYFeaturesNetwork.py b/src/troute-network/troute/HYFeaturesNetwork.py index 6fd9c98e0..fbb6d1a71 100644 --- a/src/troute-network/troute/HYFeaturesNetwork.py +++ b/src/troute-network/troute/HYFeaturesNetwork.py @@ -69,19 +69,27 @@ def read_geopkg(file_path, compute_parameters, waterbody_parameters, cpu_pool): if hybrid_parameters.get('run_hybrid_routing', False) and 'nexus' not in layers_to_read: layers_to_read.append('nexus') - # Function that read a layer from the geopackage + # Function that reads a layer from the geopackage def read_layer(layer_name): if not layer_name: return pd.DataFrame() try: with sqlite3.connect(file_path) as conn: has_spatial_metadata = False - geometry_columns = conn.execute(f"SELECT c.column_name,g.definition FROM gpkg_geometry_columns AS c JOIN gpkg_spatial_ref_sys AS g ON c.srs_id = g.srs_id WHERE c.table_name = '{layer_name}'").fetchall() + # try and get the name of the geometry column and it's crs + geometry_columns = conn.execute(f""" + SELECT c.column_name,g.definition + FROM gpkg_geometry_columns AS c + JOIN gpkg_spatial_ref_sys AS g ON c.srs_id = g.srs_id + WHERE c.table_name = '{layer_name}'""").fetchall() + if len(geometry_columns) > 0: has_spatial_metadata = True geom_column = geometry_columns[0][0] crs = geometry_columns[0][1] + if has_spatial_metadata: + # select everything from the layer, + the midpoint of it's bounding box sql_query = f"""SELECT d.*, (r.minx + r.maxx) / 2.0 AS lon, (r.miny + r.maxy) / 2.0 AS lat @@ -134,11 +142,6 @@ def read_layer(layer_name): else: raise ValueError("No flowpaths or flowpath_attributes found in the geopackage") - # if lat or lon or crs is missing from nexus, call a function to add them - if not nexus.empty: - if 'lat' not in nexus.columns or 'lon' not in nexus.columns or 'crs' not in nexus.columns: - nexus = add_lat_lon_crs(file_path,nexus) - return flowpaths, lakes, network, nexus def read_json(file_path, edge_list): From 5e9065dae411e770ccfba32e73591bf132c0b7ec Mon Sep 17 00:00:00 2001 From: Josh Cunningham Date: Tue, 27 Aug 2024 11:33:57 -0500 Subject: [PATCH 4/4] grammar, whitespace, comment improvement --- src/troute-network/troute/HYFeaturesNetwork.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/troute-network/troute/HYFeaturesNetwork.py b/src/troute-network/troute/HYFeaturesNetwork.py index fbb6d1a71..3a59aa47d 100644 --- a/src/troute-network/troute/HYFeaturesNetwork.py +++ b/src/troute-network/troute/HYFeaturesNetwork.py @@ -28,7 +28,7 @@ def find_layer_name(layers, pattern): if re.search(pattern, layer, re.IGNORECASE): return layer return None - + def read_geopkg(file_path, compute_parameters, waterbody_parameters, cpu_pool): # Retrieve available layers from the GeoPackage with sqlite3.connect(file_path) as conn: @@ -76,7 +76,7 @@ def read_layer(layer_name): try: with sqlite3.connect(file_path) as conn: has_spatial_metadata = False - # try and get the name of the geometry column and it's crs + # try and get the name of the geometry column and its crs geometry_columns = conn.execute(f""" SELECT c.column_name,g.definition FROM gpkg_geometry_columns AS c @@ -89,7 +89,9 @@ def read_layer(layer_name): crs = geometry_columns[0][1] if has_spatial_metadata: - # select everything from the layer, + the midpoint of it's bounding box + # select everything from the layer, + the midpoint of its bounding box + # decoding the geometry blob can be done as it's just gpkg header + WKB + # the rtree table calculation is much faster sql_query = f"""SELECT d.*, (r.minx + r.maxx) / 2.0 AS lon, (r.miny + r.maxy) / 2.0 AS lat @@ -170,8 +172,7 @@ def read_geojson(file_path): df = pd.json_normalize(data,max_level=1) df.columns = df.columns.str.replace('properties.','') df = df.drop(columns=['type']) - # Geometry seems to be unused or dropped, in case it is needed: - # geometry type e.g. MULTIPOLYGON, is stored in geometry.type + # Geometry type e.g. MULTIPOLYGON, is stored in geometry.type # and the coordinates are stored in geometry.coordinates # crs stored in data['crs'] e.g. # data['crs'] = { "type": "name", "properties": { "name": "urn:ogc:def:crs:EPSG::5070" } }