From dd55940bd9d48b7408995f3d94b05d74502b2a78 Mon Sep 17 00:00:00 2001
From: Simon Norris
Date: Fri, 16 Aug 2024 10:36:51 -0700
Subject: [PATCH] cache as parquet only

---
 download.py | 32 +++++---------------------------
 process.sh  | 13 ++++++++-----
 2 files changed, 13 insertions(+), 32 deletions(-)

diff --git a/download.py b/download.py
index b058af5..fabeb72 100644
--- a/download.py
+++ b/download.py
@@ -258,13 +258,6 @@ def download_source(source):
 
 @click.command()
 @click.argument("sources_file", type=click.Path(exists=True), default="sources.json")
-@click.option(
-    "--out_format",
-    "-of",
-    default="GPKG",
-    type=click.Choice(["GPKG", "OpenFileGDB", "Parquet"], case_sensitive=False),
-    help="Output file format",
-)
 @click.option(
     "--source_alias",
     "-s",
@@ -277,9 +270,9 @@ def download_source(source):
 @click.option(
     "--out_path",
     "-o",
-    type=click.Path(exists=True),
+    type=click.Path(),
     default=".",
-    help="Output path to cache data (local folder or object storage)",
+    help="Output path to write data (local or s3://)",
 )
 @verbose_opt
 @quiet_opt
@@ -301,27 +294,12 @@ def download(sources_file, out_format, source_alias, dry_run, out_path, verbose,
     if not dry_run:
         for source in sources:
             df = download_source(source)
-
-            # determine file extension from format
-            if out_format == "OpenFileGDB":
-                extension = "gdb"
-            else:
-                extension = out_format.lower()
             layer = (
                 "hr_" + str(source["index"]).zfill(2) + "_" + source["alias"].lower()
             )
-
-            out_file = os.path.join(out_path, layer + "." + extension)
-
-            # one file per layer, overwrite existing file
-            # (rather than switching between write and append mode)
-            if out_format in ["OpenFileGDB", "GPKG"]:
-                df.to_file(out_file, driver=out_format, layer=layer)
-
-            # parquet is one file per layer as default
-            elif out_format == "Parquet":
-                df.to_parquet(out_file)
-
+            # parquet is one file per layer and direct write to s3 is supported
+            out_file = os.path.join(out_path, layer + ".parquet")
+            df.to_parquet(out_file)
 
             LOG.info(f"{source['alias']} written to {out_file}")
 
diff --git a/process.sh b/process.sh
index eb9ad78..08f77cd 100755
--- a/process.sh
+++ b/process.sh
@@ -1,13 +1,16 @@
 #!/bin/bash
 set -euxo pipefail
 
+PSQL="psql $DATABASE_URL -v ON_ERROR_STOP=1"
+
 # load 250k grid
 bcdata bc2pg WHSE_BASEMAPPING.NTS_250K_GRID
 
-# load
+# load source data
+
 
 # create output table
-psql $DATABASE_URL -c "DROP TABLE IF EXISTS designations;
+$PSQL -c "DROP TABLE IF EXISTS designations;
   CREATE TABLE designations (
     designations_id serial primary key,
     index integer,
@@ -27,11 +30,11 @@
 );"
 
 # run overlay
-psql $DATABASE_URL -tXA \
+$PSQL -tXA \
   -c "SELECT DISTINCT map_tile
       FROM whse_basemapping.nts_250k_grid
       ORDER BY map_tile" \
-  | parallel --tag psql $DATABASE_URL -f sql/overlay.sql -v tile={1}
+  | parallel --tag $PSQL -f sql/overlay.sql -v tile={1}
 
 # dump result to file
 ogr2ogr \
@@ -71,7 +74,7 @@ ogr2ogr \
 zip -r harvest_restrictions.gdb.zip harvest_restrictions.gdb
 
 # summarize results
-psql $DATABASE_URL -f sql/summarize.sql --csv > harvest_restrictions_summary.csv
+$PSQL -f sql/summarize.sql --csv > harvest_restrictions_summary.csv
 
 # post to s3
 aws s3 cp harvest_restrictions.gdb.zip s3://$OBJECTSTORE_BUCKET/dss_projects_2024/harvest_restrictions/harvest_restrictions.gdb.zip