From dd55940bd9d48b7408995f3d94b05d74502b2a78 Mon Sep 17 00:00:00 2001
From: Simon Norris
Date: Fri, 16 Aug 2024 10:36:51 -0700
Subject: [PATCH] cache as parquet only

---
 download.py | 32 +++++---------------------------
 process.sh  | 13 ++++++++-----
 2 files changed, 13 insertions(+), 32 deletions(-)

diff --git a/download.py b/download.py
index b058af5..fabeb72 100644
--- a/download.py
+++ b/download.py
@@ -258,13 +258,6 @@ def download_source(source):
 
 @click.command()
 @click.argument("sources_file", type=click.Path(exists=True), default="sources.json")
-@click.option(
-    "--out_format",
-    "-of",
-    default="GPKG",
-    type=click.Choice(["GPKG", "OpenFileGDB", "Parquet"], case_sensitive=False),
-    help="Output file format",
-)
 @click.option(
     "--source_alias",
     "-s",
@@ -277,9 +270,9 @@ def download_source(source):
 @click.option(
     "--out_path",
     "-o",
-    type=click.Path(exists=True),
+    type=click.Path(),
     default=".",
-    help="Output path to cache data (local folder or object storage)",
+    help="Output path to write data (local or s3://)",
 )
 @verbose_opt
 @quiet_opt
@@ -301,27 +294,12 @@ def download(sources_file, out_format, source_alias, dry_run, out_path, verbose,
     if not dry_run:
         for source in sources:
             df = download_source(source)
-
-            # determine file extension from format
-            if out_format == "OpenFileGDB":
-                extension = "gdb"
-            else:
-                extension = out_format.lower()
             layer = (
                 "hr_" + str(source["index"]).zfill(2) + "_" + source["alias"].lower()
             )
-
-            out_file = os.path.join(out_path, layer + "." + extension)
-
-            # one file per layer, overwrite existing file
-            # (rather than switching between write and append mode)
-            if out_format in ["OpenFileGDB", "GPKG"]:
-                df.to_file(out_file, driver=out_format, layer=layer)
-
-            # parquet is one file per layer as default
-            elif out_format == "Parquet":
-                df.to_parquet(out_file)
-
+            # parquet is one file per layer and direct write to s3 is supported
+            out_file = os.path.join(out_path, layer + ".parquet")
+            df.to_parquet(out_file)
 
             LOG.info(f"{source['alias']} written to {out_file}")
 
diff --git a/process.sh b/process.sh
index eb9ad78..08f77cd 100755
--- a/process.sh
+++ b/process.sh
@@ -1,13 +1,16 @@
 #!/bin/bash
 set -euxo pipefail
 
+PSQL="psql $DATABASE_URL -v ON_ERROR_STOP=1"
+
 # load 250k grid
 bcdata bc2pg WHSE_BASEMAPPING.NTS_250K_GRID
 
-# load
+# load source data
+
 
 # create output table
-psql $DATABASE_URL -c "DROP TABLE IF EXISTS designations;
+$PSQL -c "DROP TABLE IF EXISTS designations;
   CREATE TABLE designations (
     designations_id serial primary key,
     index integer,
@@ -27,11 +30,11 @@
 );"
 
 # run overlay
-psql $DATABASE_URL -tXA \
+$PSQL -tXA \
   -c "SELECT DISTINCT map_tile
       FROM whse_basemapping.nts_250k_grid
       ORDER BY map_tile" \
-  | parallel --tag psql $DATABASE_URL -f sql/overlay.sql -v tile={1}
+  | parallel --tag $PSQL -f sql/overlay.sql -v tile={1}
 
 # dump result to file
 ogr2ogr \
@@ -71,7 +74,7 @@ ogr2ogr \
 zip -r harvest_restrictions.gdb.zip harvest_restrictions.gdb
 
 # summarize results
-psql $DATABASE_URL -f sql/summarize.sql --csv > harvest_restrictions_summary.csv
+$PSQL -f sql/summarize.sql --csv > harvest_restrictions_summary.csv
 
 # post to s3
 aws s3 cp harvest_restrictions.gdb.zip s3://$OBJECTSTORE_BUCKET/dss_projects_2024/harvest_restrictions/harvest_restrictions.gdb.zip