Log datasette access IPs (#3669)
* Add nginx

* Get real IPs in fly.io logs

* use nginx real IP module
* get nginx logs to show in stdout/err and suppress datasette/gunicorn logs

* Update docs / clean up

* Actually abort!

* Move noqa comments up.
jdangerx authored Jul 2, 2024
1 parent 7ed1295 commit a605d67
Showing 5 changed files with 80 additions and 31 deletions.
1 change: 1 addition & 0 deletions devtools/datasette/fly/50-mod-http-realip.conf
@@ -0,0 +1 @@
load_module modules/ngx_http_realip_module.so;
17 changes: 17 additions & 0 deletions devtools/datasette/fly/nginx.conf
@@ -0,0 +1,17 @@
daemon off;

events {
    worker_connections 1024;
}
http {
    server {
        listen 8080;
        location / {
            proxy_pass http://127.0.0.1:8081/;
            proxy_set_header Host $host;
            set_real_ip_from 0.0.0.0/0;
            real_ip_header X-Forwarded-For;
            real_ip_recursive on;
        }
    }
}
3 changes: 2 additions & 1 deletion devtools/datasette/fly/run.sh
@@ -8,4 +8,5 @@ ls
 mv all_dbs.tar.zst /data
 zstd -f -d /data/all_dbs.tar.zst -o /data/all_dbs.tar
 tar -xf /data/all_dbs.tar --directory /data
-datasette serve --host 0.0.0.0 ${DATABASES} --cors --inspect-file inspect-data.json --metadata metadata.yml --setting sql_time_limit_ms 5000 --port $PORT
+datasette serve --host 0.0.0.0 ${DATABASES} --cors --inspect-file inspect-data.json --metadata metadata.yml --setting sql_time_limit_ms 5000 --port $DATASETTE_PORT > /dev/null &
+nginx -c nginx.conf
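
Taken together with the nginx config above, this is the new process layout: daemon off; keeps nginx in the foreground as the container's main process, while Datasette is backgrounded on port 8081 with its stdout discarded, so the only output fly.io captures is nginx's access and error logs (symlinked to stdout/stderr in the Dockerfile below). Because set_real_ip_from 0.0.0.0/0 trusts X-Forwarded-For from any peer, nginx rewrites the logged client address from whatever that header carries, which is acceptable here since only Fly's edge proxy can reach the instance. A hypothetical smoke test against a local copy of this container (assumes port 8080 is published; 203.0.113.7 is a documentation-range stand-in for a client IP):

    # Hypothetical check: send a request with a spoofed X-Forwarded-For and
    # confirm it is served; the nginx access log on the container's stdout
    # should then record 203.0.113.7 as the client address.
    import urllib.request

    req = urllib.request.Request(
        "http://localhost:8080/",
        headers={"X-Forwarded-For": "203.0.113.7"},
    )
    with urllib.request.urlopen(req) as resp:
        print(resp.status)  # expect 200 once Datasette is up behind nginx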
88 changes: 59 additions & 29 deletions devtools/datasette/publish.py
@@ -35,18 +35,29 @@

 DOCKERFILE_TEMPLATE = """
 FROM python:3.11.0-slim-bullseye
-COPY . /app
-WORKDIR /app
-RUN apt-get update
-RUN apt-get install -y zstd
-ENV DATASETTE_SECRET '{datasette_secret}'
 ENV DATABASES '{databases}'
+ENV DATASETTE_PORT 8081
+ENV NGINX_PORT 8080
+RUN apt-get update
+RUN apt-get install -y zstd nginx
 RUN pip install -U datasette datasette-cluster-map datasette-vega datasette-block-robots
-ENV PORT 8080
-EXPOSE 8080
+# set up nginx + enable real IP module
+COPY nginx.conf /usr/share/nginx/nginx.conf
+COPY 50-mod-http-realip.conf /etc/nginx/modules-enabled/
+# the two symlinks allow nginx logs to get written out to stdout/stderr
+RUN mkdir /data \
+    && ln -sf /dev/stdout /var/log/nginx/access.log \
+    && ln -sf /dev/stderr /var/log/nginx/error.log
+COPY . /app
+WORKDIR /app
+EXPOSE ${{NGINX_PORT}}
+ENV DATASETTE_SECRET '{datasette_secret}'
 CMD ["./run.sh"]
 """

@@ -106,7 +117,8 @@ def inspect_data(datasets: list[str], pudl_output: Path) -> str:
     "-l",
     "deploy",
     flag_value="local",
-    help="Deploy Datasette locally for testing or debugging purposes.",
+    help="Deploy Datasette locally for testing or debugging purposes. Note that "
+    "you have to stop the docker instance manually to terminate this server.",
 )
 @click.option(
     "--metadata",
@@ -138,22 +150,48 @@ def deploy_datasette(
         python publish.py --fly -- --build-only
     """
     logging.info(f"Deploying to {deploy.upper()}...")
-    pudl_output = PudlPaths().pudl_output

+    pudl_output = PudlPaths().pudl_output
+    metadata_yml = DatasetteMetadata.from_data_source_ids(pudl_output).to_yaml()

     all_databases = (
         ["pudl.sqlite"]
         + sorted(str(p.name) for p in pudl_output.glob("ferc*.sqlite"))
         + ["censusdp1tract.sqlite"]
     )
     databases = list(only_databases if only_databases else all_databases)

-    # Make sure we have the expected metadata for databases
-    # headed to deployment.
-    if deploy != "metadata":
-        check_tables_have_metadata(metadata_yml, databases)
-    metadata_yml = DatasetteMetadata.from_data_source_ids(pudl_output).to_yaml()
+    fly_dir = Path(__file__).parent.absolute() / "fly"
+    docker_path = fly_dir / "Dockerfile"
+    inspect_path = fly_dir / "inspect-data.json"
+    metadata_path = fly_dir / "metadata.yml"

+    logging.info(f"Writing Datasette metadata to: {metadata_path}")
+    with metadata_path.open("w") as f:
+        f.write(metadata_yml)
+    check_tables_have_metadata(metadata_yml, databases)

+    if deploy == "metadata":
+        logging.info("Only writing metadata. Aborting now.")
+        return 0

+    logging.info(f"Inspecting DBs for datasette: {databases}...")
+    inspect_output = inspect_data(databases, pudl_output)
+    with inspect_path.open("w") as f:
+        f.write(json.dumps(inspect_output))

+    logging.info("Writing Dockerfile...")
+    with docker_path.open("w") as f:
+        f.write(make_dockerfile(databases))

+    logging.info(f"Compressing {databases} and putting into docker context...")
+    check_call(  # noqa: S603
+        ["tar", "-a", "-czvf", fly_dir / "all_dbs.tar.zst"] + databases,
+        cwd=pudl_output,
+    )

+    # OK, now we have a Dockerfile + the right context. Time to run the dang
+    # container somewhere.
     if deploy in {"production", "staging"}:
-        fly_dir = Path(__file__).parent.absolute() / "fly"
         logging.info(f"Deploying {deploy} to fly.io...")
@@ -189,21 +227,13 @@ def deploy_datasette(

     elif deploy == "local":
         logging.info("Running Datasette locally...")
-        metadata_path = pudl_output / "metadata.yml"
-        logging.info(f"Writing Datasette metadata to: {metadata_path}")
-        with metadata_path.open("w") as f:
-            f.write(metadata_yml)

         check_call(  # noqa: S603
-            ["/usr/bin/env", "datasette", "serve", "-m", "metadata.yml"] + databases,
-            cwd=pudl_output,
+            ["/usr/bin/env", "docker", "build", "-t", "pudl_datasette:local", "."],
+            cwd=fly_dir,
         )
+        check_call(  # noqa: S603
+            ["/usr/bin/env", "docker", "run", "-p", "8080:8080", "pudl_datasette:local"]
+        )

-    elif deploy == "metadata":
-        metadata_path = Path.cwd() / "metadata.yml"
-        logging.info(f"Writing Datasette metadata to: {metadata_path}")
-        with metadata_path.open("w") as f:
-            f.write(metadata_yml)

     else:
         logging.error(f"Unrecognized deployment destination: {deploy=}")
2 changes: 1 addition & 1 deletion src/pudl/metadata/classes.py
@@ -2218,7 +2218,7 @@ def from_data_source_ids(
             xbrl_resources=xbrl_resources,
         )

-    def to_yaml(self) -> None:
+    def to_yaml(self) -> str:
         """Output database, table, and column metadata to YAML file."""
         template = _get_jinja_environment().get_template("datasette-metadata.yml.jinja")
