Commit dd55940

cache as parquet only

smnorris committed Aug 16, 2024
1 parent 2b3e245 commit dd55940

Showing 2 changed files with 13 additions and 32 deletions.
download.py (5 additions, 27 deletions)

@@ -258,13 +258,6 @@ def download_source(source):

 @click.command()
 @click.argument("sources_file", type=click.Path(exists=True), default="sources.json")
-@click.option(
-    "--out_format",
-    "-of",
-    default="GPKG",
-    type=click.Choice(["GPKG", "OpenFileGDB", "Parquet"], case_sensitive=False),
-    help="Output file format",
-)
 @click.option(
     "--source_alias",
     "-s",
@@ -277,9 +270,9 @@ def download_source(source):
 @click.option(
     "--out_path",
     "-o",
-    type=click.Path(exists=True),
+    type=click.Path(),
     default=".",
-    help="Output path to cache data (local folder or object storage)",
+    help="Output path to write data (local or s3://)",
 )
 @verbose_opt
 @quiet_opt
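
(Dropping exists=True here is presumably what allows an s3:// out_path: click.Path(exists=True) validates against the local filesystem and would reject a remote URL.)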
@@ -301,27 +294,12 @@ def download(sources_file, out_format, source_alias, dry_run, out_path, verbose,
     if not dry_run:
         for source in sources:
             df = download_source(source)
-
-            # determine file extension from format
-            if out_format == "OpenFileGDB":
-                extension = "gdb"
-            else:
-                extension = out_format.lower()
-
             layer = (
                 "hr_" + str(source["index"]).zfill(2) + "_" + source["alias"].lower()
             )
-
-            out_file = os.path.join(out_path, layer + "." + extension)
-
-            # one file per layer, overwrite existing file
-            # (rather than switching between write and append mode)
-            if out_format in ["OpenFileGDB", "GPKG"]:
-                df.to_file(out_file, driver=out_format, layer=layer)
-
-            # parquet is one file per layer as default
-            elif out_format == "Parquet":
-                df.to_parquet(out_file)
+            # parquet is one file per layer and direct write to s3 is supported
+            out_file = os.path.join(out_path, layer + ".parquet")
+            df.to_parquet(out_file)
 
             LOG.info(f"{source['alias']} written to {out_file}")
 
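For reference, a minimal sketch of consuming the new cache, assuming pyarrow and (for s3:// paths) the s3fs package are installed; the bucket and layer name below are hypothetical, following the hr_<index>_<alias>.parquet naming used above:

import geopandas

# hypothetical path; real layers follow the hr_<index>_<alias>.parquet pattern
df = geopandas.read_parquet(
    "s3://my-bucket/harvest_restrictions/hr_01_example.parquet"
)
print(df.crs, len(df))  # GeoParquet round-trips the CRS with the geometries
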
process.sh (8 additions, 5 deletions)

@@ -1,13 +1,16 @@
 #!/bin/bash
 set -euxo pipefail
 
+PSQL="psql $DATABASE_URL -v ON_ERROR_STOP=1"
+
 # load 250k grid
 bcdata bc2pg WHSE_BASEMAPPING.NTS_250K_GRID
 
-# load
+# load source data
+
 
 # create output table
-psql $DATABASE_URL -c "DROP TABLE IF EXISTS designations;
+$PSQL -c "DROP TABLE IF EXISTS designations;
 CREATE TABLE designations (
   designations_id serial primary key,
   index integer,
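
Worth noting: without -v ON_ERROR_STOP=1, psql keeps going after an error in a -f script and still exits 0, so set -e would never trip; baking the flag into $PSQL makes every invocation below (including the ones launched by parallel) fail fast.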
@@ -27,11 +30,11 @@ psql $DATABASE_URL -c "DROP TABLE IF EXISTS designations;
);"

# run overlay
psql $DATABASE_URL -tXA \
$PSQL -tXA \
-c "SELECT DISTINCT map_tile
FROM whse_basemapping.nts_250k_grid
ORDER BY map_tile" \
| parallel --tag psql $DATABASE_URL -f sql/overlay.sql -v tile={1}
| parallel --tag $PSQL -f sql/overlay.sql -v tile={1}

# dump result to file
ogr2ogr \
@@ -71,7 +74,7 @@ ogr2ogr \
 zip -r harvest_restrictions.gdb.zip harvest_restrictions.gdb
 
 # summarize results
-psql $DATABASE_URL -f sql/summarize.sql --csv > harvest_restrictions_summary.csv
+$PSQL -f sql/summarize.sql --csv > harvest_restrictions_summary.csv
 
 # post to s3
 aws s3 cp harvest_restrictions.gdb.zip s3://$OBJECTSTORE_BUCKET/dss_projects_2024/harvest_restrictions/harvest_restrictions.gdb.zip