From 8d0e82228baf38a79134dcadea18dcb3fdfddab5 Mon Sep 17 00:00:00 2001
From: Dazhong Xia
Date: Wed, 12 Jun 2024 14:23:14 -0400
Subject: [PATCH 1/5] Add nginx

---
 devtools/datasette/fly/nginx.conf | 14 ++++++
 devtools/datasette/fly/run.sh     |  4 +-
 devtools/datasette/publish.py     | 74 ++++++++++++++++++++-----------
 3 files changed, 65 insertions(+), 27 deletions(-)
 create mode 100644 devtools/datasette/fly/nginx.conf

diff --git a/devtools/datasette/fly/nginx.conf b/devtools/datasette/fly/nginx.conf
new file mode 100644
index 0000000000..a525b78b21
--- /dev/null
+++ b/devtools/datasette/fly/nginx.conf
@@ -0,0 +1,14 @@
+daemon off;
+
+events {
+    worker_connections 1024;
+}
+http {
+    server {
+        listen 8080;
+        location / {
+            proxy_pass http://127.0.0.1:8081/;
+            proxy_set_header Host $host;
+        }
+    }
+}
\ No newline at end of file
diff --git a/devtools/datasette/fly/run.sh b/devtools/datasette/fly/run.sh
index 426ae11bd2..d041c05655 100755
--- a/devtools/datasette/fly/run.sh
+++ b/devtools/datasette/fly/run.sh
@@ -8,4 +8,6 @@ ls
 mv all_dbs.tar.zst /data
 zstd -f -d /data/all_dbs.tar.zst -o /data/all_dbs.tar
 tar -xf /data/all_dbs.tar --directory /data
-datasette serve --host 0.0.0.0 ${DATABASES} --cors --inspect-file inspect-data.json --metadata metadata.yml --setting sql_time_limit_ms 5000 --port $PORT
+cp nginx.conf /usr/share/nginx/nginx.conf
+datasette serve --host 0.0.0.0 ${DATABASES} --cors --inspect-file inspect-data.json --metadata metadata.yml --setting sql_time_limit_ms 5000 --port $DATASETTE_PORT &
+nginx -c nginx.conf # -g 'daemon off;'
diff --git a/devtools/datasette/publish.py b/devtools/datasette/publish.py
index ef6805506a..416d75f441 100644
--- a/devtools/datasette/publish.py
+++ b/devtools/datasette/publish.py
@@ -35,18 +35,22 @@
 DOCKERFILE_TEMPLATE = """
 FROM python:3.11.0-slim-bullseye
 
-COPY . /app
-WORKDIR /app
-
-RUN apt-get update
-RUN apt-get install -y zstd
-ENV DATASETTE_SECRET '{datasette_secret}'
 ENV DATABASES '{databases}'
+ENV DATASETTE_PORT 8081
+ENV NGINX_PORT 8080
+
+RUN apt-get update
+RUN apt-get install -y zstd nginx
 
 RUN pip install -U datasette datasette-cluster-map datasette-vega datasette-block-robots
-ENV PORT 8080
-EXPOSE 8080
+
+COPY . /app
+WORKDIR /app
+RUN mkdir /data
+
+EXPOSE ${{NGINX_PORT}}
+ENV DATASETTE_SECRET '{datasette_secret}'
 
 CMD ["./run.sh"]
 """
 
@@ -138,6 +142,7 @@ def deploy_datasette(
 
         python publish.py --fly -- --build-only
     """
+    logging.info(f"Deploying to {deploy.upper()}...")
     pudl_output = PudlPaths().pudl_output
     pudl_output = PudlPaths().pudl_output
 
@@ -150,10 +155,35 @@ def deploy_datasette(
     )
     databases = list(only_databases if only_databases else all_databases)
 
-    # Make sure we have the expected metadata for databases
-    # headed to deployment.
-    if deploy != "metadata":
-        check_tables_have_metadata(metadata_yml, databases)
+    fly_dir = Path(__file__).parent.absolute() / "fly"
+    docker_path = fly_dir / "Dockerfile"
+    inspect_path = fly_dir / "inspect-data.json"
+    metadata_path = fly_dir / "metadata.yml"
+
+    logging.info(f"Inspecting DBs for datasette: {databases}...")
+    inspect_output = inspect_data(databases, pudl_output)
+    with inspect_path.open("w") as f:
+        f.write(json.dumps(inspect_output))
+
+    logging.info(f"Writing Datasette metadata to: {metadata_path}")
+    with metadata_path.open("w") as f:
+        f.write(metadata_yml)
+
+    if deploy == "metadata":
+        logging.info("Only writing metadata. Aborting now.")
+
+    check_tables_have_metadata(metadata_yml, databases)
+
+    logging.info("Writing Dockerfile...")
+    with docker_path.open("w") as f:
+        f.write(make_dockerfile(databases))
+
+    logging.info(f"Compressing {databases} and putting into docker context...")
+    check_call(
+        ["tar", "-a", "-czvf", fly_dir / "all_dbs.tar.zst"] + databases,  # noqa: S603
+        cwd=pudl_output,
+    )
+
     if deploy in {"production", "staging"}:
         fly_dir = Path(__file__).parent.absolute() / "fly"
         logging.info(f"Deploying {deploy} to fly.io...")
@@ -189,21 +219,13 @@ def deploy_datasette(
 
     elif deploy == "local":
         logging.info("Running Datasette locally...")
-        metadata_path = pudl_output / "metadata.yml"
-        logging.info(f"Writing Datasette metadata to: {metadata_path}")
-        with metadata_path.open("w") as f:
-            f.write(metadata_yml)
-
-        check_call(  # noqa: S603
-            ["/usr/bin/env", "datasette", "serve", "-m", "metadata.yml"] + databases,
-            cwd=pudl_output,
+        check_call(
+            ["/usr/bin/env", "docker", "build", "-t", "pudl_datasette:local", "."],  # noqa: S603
+            cwd=fly_dir,
+        )
+        check_call(
+            ["/usr/bin/env", "docker", "run", "-p", "8080:8080", "pudl_datasette:local"]  # noqa: S603
         )
-
-    elif deploy == "metadata":
-        metadata_path = Path.cwd() / "metadata.yml"
-        logging.info(f"Writing Datasette metadata to: {metadata_path}")
-        with metadata_path.open("w") as f:
-            f.write(metadata_yml)
     else:
         logging.error(f"Unrecognized deployment destination: {deploy=}")
 

From 497e30fc69081a5f6a2a895d7e84ccbfab10b373 Mon Sep 17 00:00:00 2001
From: Dazhong Xia
Date: Wed, 12 Jun 2024 16:23:54 -0400
Subject: [PATCH 2/5] Get real IPs in fly.io logs

* use nginx real IP module
* get nginx logs to show in stdout/err and suppress datasette/gunicorn logs

---
 devtools/datasette/fly/50-mod-http-realip.conf | 1 +
 devtools/datasette/fly/nginx.conf              | 5 ++++-
 devtools/datasette/fly/run.sh                  | 3 +--
 devtools/datasette/publish.py                  | 7 ++++++-
 4 files changed, 12 insertions(+), 4 deletions(-)
 create mode 100644 devtools/datasette/fly/50-mod-http-realip.conf

diff --git a/devtools/datasette/fly/50-mod-http-realip.conf b/devtools/datasette/fly/50-mod-http-realip.conf
new file mode 100644
index 0000000000..02eb895838
--- /dev/null
+++ b/devtools/datasette/fly/50-mod-http-realip.conf
@@ -0,0 +1 @@
+load_module modules/ngx_http_realip_module.so;
diff --git a/devtools/datasette/fly/nginx.conf b/devtools/datasette/fly/nginx.conf
index a525b78b21..ca2c983a1a 100644
--- a/devtools/datasette/fly/nginx.conf
+++ b/devtools/datasette/fly/nginx.conf
@@ -9,6 +9,9 @@ http {
         location / {
             proxy_pass http://127.0.0.1:8081/;
             proxy_set_header Host $host;
+            set_real_ip_from 0.0.0.0/0;
+            real_ip_header X-Forwarded-For;
+            real_ip_recursive on;
         }
     }
-}
\ No newline at end of file
+}
diff --git a/devtools/datasette/fly/run.sh b/devtools/datasette/fly/run.sh
index d041c05655..bce7e1e361 100755
--- a/devtools/datasette/fly/run.sh
+++ b/devtools/datasette/fly/run.sh
@@ -8,6 +8,5 @@ ls
 mv all_dbs.tar.zst /data
 zstd -f -d /data/all_dbs.tar.zst -o /data/all_dbs.tar
 tar -xf /data/all_dbs.tar --directory /data
-cp nginx.conf /usr/share/nginx/nginx.conf
-datasette serve --host 0.0.0.0 ${DATABASES} --cors --inspect-file inspect-data.json --metadata metadata.yml --setting sql_time_limit_ms 5000 --port $DATASETTE_PORT &
+datasette serve --host 0.0.0.0 ${DATABASES} --cors --inspect-file inspect-data.json --metadata metadata.yml --setting sql_time_limit_ms 5000 --port $DATASETTE_PORT > /dev/null &
 nginx -c nginx.conf # -g 'daemon off;'
diff --git a/devtools/datasette/publish.py b/devtools/datasette/publish.py
index 416d75f441..db5d761a15 100644
--- a/devtools/datasette/publish.py
+++ b/devtools/datasette/publish.py
@@ -44,10 +44,15 @@ RUN apt-get install -y zstd nginx
 
 RUN pip install -U datasette datasette-cluster-map datasette-vega datasette-block-robots
 
+COPY nginx.conf /usr/share/nginx/nginx.conf
+COPY 50-mod-http-realip.conf /etc/nginx/modules-enabled/
+
+RUN mkdir /data \
+    && ln -sf /dev/stdout /var/log/nginx/access.log \
+    && ln -sf /dev/stderr /var/log/nginx/error.log
 COPY . /app
 WORKDIR /app
-RUN mkdir /data
 
 EXPOSE ${{NGINX_PORT}}
 ENV DATASETTE_SECRET '{datasette_secret}'
 

From 6239ce6f06712cea9b8b1b71f3793aedcc1d58d7 Mon Sep 17 00:00:00 2001
From: Dazhong Xia
Date: Wed, 12 Jun 2024 16:45:23 -0400
Subject: [PATCH 3/5] Update docs / clean up

---
 devtools/datasette/fly/run.sh |  2 +-
 devtools/datasette/publish.py | 22 ++++++++++++----------
 src/pudl/metadata/classes.py  |  2 +-
 3 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/devtools/datasette/fly/run.sh b/devtools/datasette/fly/run.sh
index bce7e1e361..0c201518a3 100755
--- a/devtools/datasette/fly/run.sh
+++ b/devtools/datasette/fly/run.sh
@@ -9,4 +9,4 @@ mv all_dbs.tar.zst /data
 zstd -f -d /data/all_dbs.tar.zst -o /data/all_dbs.tar
 tar -xf /data/all_dbs.tar --directory /data
 datasette serve --host 0.0.0.0 ${DATABASES} --cors --inspect-file inspect-data.json --metadata metadata.yml --setting sql_time_limit_ms 5000 --port $DATASETTE_PORT > /dev/null &
-nginx -c nginx.conf # -g 'daemon off;'
+nginx -c nginx.conf
diff --git a/devtools/datasette/publish.py b/devtools/datasette/publish.py
index db5d761a15..392f41b1e2 100644
--- a/devtools/datasette/publish.py
+++ b/devtools/datasette/publish.py
@@ -44,9 +44,11 @@ RUN apt-get install -y zstd nginx
 
 RUN pip install -U datasette datasette-cluster-map datasette-vega datasette-block-robots
 
+# set up nginx + enable real IP module
 COPY nginx.conf /usr/share/nginx/nginx.conf
 COPY 50-mod-http-realip.conf /etc/nginx/modules-enabled/
 
+# the two symlinks allow nginx logs to get written out to stdout/stderr
 RUN mkdir /data \
     && ln -sf /dev/stdout /var/log/nginx/access.log \
     && ln -sf /dev/stderr /var/log/nginx/error.log
@@ -115,7 +117,8 @@ def inspect_data(datasets: list[str], pudl_output: Path) -> str:
     "-l",
     "deploy",
     flag_value="local",
-    help="Deploy Datasette locally for testing or debugging purposes.",
Note that" + "you have to stop the docker instance manually to terminate this server.", ) @click.option( "--metadata", @@ -150,9 +153,6 @@ def deploy_datasette( logging.info(f"Deploying to {deploy.upper()}...") pudl_output = PudlPaths().pudl_output - pudl_output = PudlPaths().pudl_output - metadata_yml = DatasetteMetadata.from_data_source_ids(pudl_output).to_yaml() - all_databases = ( ["pudl.sqlite"] + sorted(str(p.name) for p in pudl_output.glob("ferc*.sqlite")) @@ -164,20 +164,20 @@ def deploy_datasette( docker_path = fly_dir / "Dockerfile" inspect_path = fly_dir / "inspect-data.json" metadata_path = fly_dir / "metadata.yml" - - logging.info(f"Inspecting DBs for datasette: {databases}...") - inspect_output = inspect_data(databases, pudl_output) - with inspect_path.open("w") as f: - f.write(json.dumps(inspect_output)) + metadata_yml = DatasetteMetadata.from_data_source_ids(pudl_output).to_yaml() logging.info(f"Writing Datasette metadata to: {metadata_path}") with metadata_path.open("w") as f: f.write(metadata_yml) + check_tables_have_metadata(metadata_yml, databases) if deploy == "metadata": logging.info("Only writing metadata. Aborting now.") - check_tables_have_metadata(metadata_yml, databases) + logging.info(f"Inspecting DBs for datasette: {databases}...") + inspect_output = inspect_data(databases, pudl_output) + with inspect_path.open("w") as f: + f.write(json.dumps(inspect_output)) logging.info("Writing Dockerfile...") with docker_path.open("w") as f: @@ -189,6 +189,8 @@ def deploy_datasette( cwd=pudl_output, ) + # OK, now we have a Dockerfile + the right context. Time to run the dang + # container somehwere. if deploy in {"production", "staging"}: fly_dir = Path(__file__).parent.absolute() / "fly" logging.info(f"Deploying {deploy} to fly.io...") diff --git a/src/pudl/metadata/classes.py b/src/pudl/metadata/classes.py index 1b0a532a58..4ef2d4b0fd 100644 --- a/src/pudl/metadata/classes.py +++ b/src/pudl/metadata/classes.py @@ -2218,7 +2218,7 @@ def from_data_source_ids( xbrl_resources=xbrl_resources, ) - def to_yaml(self) -> None: + def to_yaml(self) -> str: """Output database, table, and column metadata to YAML file.""" template = _get_jinja_environment().get_template("datasette-metadata.yml.jinja") From 317bea48a8816ca71db6ec73e2ff3e249aa1b8ae Mon Sep 17 00:00:00 2001 From: Dazhong Xia Date: Mon, 17 Jun 2024 17:43:16 -0400 Subject: [PATCH 4/5] Actually abort! --- devtools/datasette/publish.py | 1 + 1 file changed, 1 insertion(+) diff --git a/devtools/datasette/publish.py b/devtools/datasette/publish.py index 392f41b1e2..5b0bb915ab 100644 --- a/devtools/datasette/publish.py +++ b/devtools/datasette/publish.py @@ -173,6 +173,7 @@ def deploy_datasette( if deploy == "metadata": logging.info("Only writing metadata. Aborting now.") + return 0 logging.info(f"Inspecting DBs for datasette: {databases}...") inspect_output = inspect_data(databases, pudl_output) From 56216e596fa1036803c17500dcd7440e939c72ff Mon Sep 17 00:00:00 2001 From: Dazhong Xia Date: Tue, 2 Jul 2024 12:29:25 -0400 Subject: [PATCH 5/5] Move noqa comments up. 
 devtools/datasette/publish.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/devtools/datasette/publish.py b/devtools/datasette/publish.py
index 5b0bb915ab..01071549ee 100644
--- a/devtools/datasette/publish.py
+++ b/devtools/datasette/publish.py
@@ -185,8 +185,8 @@ def deploy_datasette(
         f.write(make_dockerfile(databases))
 
     logging.info(f"Compressing {databases} and putting into docker context...")
-    check_call(
-        ["tar", "-a", "-czvf", fly_dir / "all_dbs.tar.zst"] + databases,  # noqa: S603
+    check_call(  # noqa: S603
+        ["tar", "-a", "-czvf", fly_dir / "all_dbs.tar.zst"] + databases,
         cwd=pudl_output,
     )
 
@@ -227,12 +227,12 @@ def deploy_datasette(
 
     elif deploy == "local":
         logging.info("Running Datasette locally...")
-        check_call(
-            ["/usr/bin/env", "docker", "build", "-t", "pudl_datasette:local", "."],  # noqa: S603
+        check_call(  # noqa: S603
+            ["/usr/bin/env", "docker", "build", "-t", "pudl_datasette:local", "."],
             cwd=fly_dir,
         )
-        check_call(
-            ["/usr/bin/env", "docker", "run", "-p", "8080:8080", "pudl_datasette:local"]  # noqa: S603
+        check_call(  # noqa: S603
+            ["/usr/bin/env", "docker", "run", "-p", "8080:8080", "pudl_datasette:local"]
         )
     else: