From 40e4a7a6652d0d214904ffc47322a8a79ba64910 Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Mon, 3 Mar 2025 20:13:37 -0800 Subject: [PATCH] runner.aws_batch: Support overlay volumes (e.g. --augur) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The latest image, nextstrain/base:build-20250304T041009Z, provides a mechanism in the entrypoint to support bundling of overlays in the workdir ZIP archive by way of upwards-traversing archive member paths.¹ For example, an Augur overlay is bundled into the workdir ZIP archive with member paths starting with ../augur/ and ends up overwriting files in the image's /nextstrain/augur/ since the AWS Batch workdir is always /nextstrain/build/. Extending overlay support to AWS Batch has been very low priority and something I thought was unlikely to ever happen. However, in the course of working on AWS Batch support for `nextstrain run`, it turned out that the easiest, most straightforward, and most minimal change was to bundle the pathogen source directory with the working analysis directory in the workdir ZIP archive, i.e. as a "pathogen" overlay. This naturally led to supporting overlays more generally, which I've done here. One caveat compared to overlays in runtimes with the concept of volume mounts (Docker, Singularity) is that any files in the image that do not exist in the overlaid files will remain present since nothing removes them. This is potentially problematic and will be annoying if someone runs into it, but most of the time it should be a non-issue. It is also solvable if we care to exert the effort and extra code to do so. I don't right now. 
¹ --- CHANGES.md | 10 ++++ nextstrain/cli/command/build.py | 29 ++++++++++-- nextstrain/cli/runner/aws_batch/__init__.py | 12 ++++- nextstrain/cli/runner/aws_batch/s3.py | 52 ++++++++++++++++----- nextstrain/cli/runner/docker.py | 4 ++ nextstrain/cli/util.py | 8 ++-- 6 files changed, 94 insertions(+), 21 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index e1e17277..2cf50758 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -30,6 +30,16 @@ development source code and as such may not be routinely kept up to date. between argument/option descriptions. ([#419][]) +* AWS Batch builds now support development overlays such as [`--augur`][] and + [`--auspice`][]. To use this functionality, you'll need at least + `nextstrain/base:build-20250304T041009Z` or newer of the Nextstrain Docker + runtime image. Compatibility of the runtime image is checked automatically + when overlays are used with AWS Batch. + ([#419][]) + +[`--augur`]: https://docs.nextstrain.org/projects/cli/en/__NEXT__/commands/build/#cmdoption-nextstrain-build-augur +[`--auspice`]: https://docs.nextstrain.org/projects/cli/en/__NEXT__/commands/build/#cmdoption-nextstrain-build-auspice + ## Bug fixes * Fixed a rare but possible error case in `nextstrain view` and `nextstrain diff --git a/nextstrain/cli/command/build.py b/nextstrain/cli/command/build.py index f19c46e3..83d5d231 100644 --- a/nextstrain/cli/command/build.py +++ b/nextstrain/cli/command/build.py @@ -24,8 +24,8 @@ from ..argparse import add_extended_help_flags, AppendOverwriteDefault, SKIP_AUTO_DEFAULT_IN_HELP from ..debug import debug from ..errors import UsageError, UserError -from ..runner import docker, singularity -from ..util import byte_quantity, runner_name, warn +from ..runner import docker, singularity, aws_batch +from ..util import byte_quantity, runner_name, split_image_name, warn from ..volume import NamedVolume @@ -306,10 +306,31 @@ def assert_overlay_volumes_support(opts): """ overlay_volumes = opts.volumes - if overlay_volumes and 
opts.__runner__ not in {docker, singularity}: + if not overlay_volumes: + return + + if opts.__runner__ not in {docker, singularity, aws_batch}: raise UserError(f""" The {runner_name(opts.__runner__)} runtime does not support overlays (e.g. of {overlay_volumes[0].name}). - Use the Docker or Singularity runtimes (via --docker or --singularity) if overlays are necessary. + Use the Docker, Singularity, or AWS Batch runtimes (via --docker, + --singularity, or --aws-batch) if overlays are necessary. + """) + + if opts.__runner__ is aws_batch and not docker.image_supports(docker.IMAGE_FEATURE.aws_batch_overlays, opts.image): + raise UserError(f""" + The Nextstrain runtime image version in use + + {opts.image} + + is too old to support overlays (e.g. of {overlay_volumes[0].name}) with AWS Batch. + + If overlays are necessary, please update the runtime image to at + least version + + {split_image_name(opts.image)[0]}:{docker.IMAGE_FEATURE.aws_batch_overlays.value} + + using `nextstrain update docker`. Alternatively, use the Docker or + Singularity runtime (via --docker or --singularity) instead. """) diff --git a/nextstrain/cli/runner/aws_batch/__init__.py b/nextstrain/cli/runner/aws_batch/__init__.py index 9d8219ef..9629743b 100644 --- a/nextstrain/cli/runner/aws_batch/__init__.py +++ b/nextstrain/cli/runner/aws_batch/__init__.py @@ -164,8 +164,15 @@ def register_arguments(parser) -> None: def run(opts, argv, working_volume = None, extra_env: Env = {}, cpus: int = None, memory: int = None) -> int: + docker.assert_volumes_exist(opts.volumes) + + # "build" is a special-cased volume for AWS Batch, as /nextstrain/build is + # the fixed initial working directory and what we'll populate by extracting + # a ZIP file. build_volume = next((v for v in opts.volumes if v and v.name == "build"), None) + opts.volumes = [v for v in opts.volumes if v is not build_volume] + # Unlike other runners, the AWS Batch runner currently *requires* a working # dir in most usages. 
This is ok as we only provide the AWS Batch runner # for commands which also require a working dir (e.g. build), whereas other @@ -213,8 +220,11 @@ def run(opts, argv, working_volume = None, extra_env: Env = {}, cpus: int = None # Upload workdir to S3 so it can be fetched at the start of the Batch job. print_stage("Uploading %s to S3" % local_workdir) + for volume in opts.volumes: + print(" and %s as %s" % (volume.src.resolve(strict = True), volume.name)) + bucket = s3.bucket(opts.s3_bucket) - remote_workdir = s3.upload_workdir(local_workdir, bucket, run_id, opts.exclude_from_upload) + remote_workdir = s3.upload_workdir(local_workdir, bucket, run_id, opts.exclude_from_upload, opts.volumes) print("uploaded:", s3.object_url(remote_workdir)) diff --git a/nextstrain/cli/runner/aws_batch/s3.py b/nextstrain/cli/runner/aws_batch/s3.py index 663576e4..6f7c54b4 100644 --- a/nextstrain/cli/runner/aws_batch/s3.py +++ b/nextstrain/cli/runner/aws_batch/s3.py @@ -5,21 +5,24 @@ import binascii import boto3 import fsspec +import os.path from botocore.config import Config from botocore.exceptions import ClientError from calendar import timegm from os import utime -from pathlib import Path +from pathlib import Path, PurePath from time import struct_time -from typing import Callable, Generator, Iterable, List, Optional, Any +from typing import Callable, Generator, Iterable, List, Optional, Any, Union from urllib.parse import urlparse from zipfile import ZipFile, ZipInfo from ... 
import env +from ...debug import DEBUGGING from ...types import Env, S3Bucket, S3Object from ...util import glob_matcher +from ...volume import NamedVolume -PathMatcher = Callable[[Path], bool] +PathMatcher = Callable[[Union[Path, PurePath]], bool] def object_url(object: S3Object) -> str: @@ -38,10 +41,10 @@ def object_from_url(s3url: str) -> S3Object: return bucket(url.netloc).Object(key) -def upload_workdir(workdir: Path, bucket: S3Bucket, run_id: str, patterns: List[str] = None) -> S3Object: +def upload_workdir(workdir: Path, bucket: S3Bucket, run_id: str, patterns: List[str] = None, volumes: List[NamedVolume] = []) -> S3Object: """ - Upload a ZIP archive of the local *workdir* to the remote S3 *bucket* for - the given *run_id*. + Upload a ZIP archive of the local *workdir* (and optional *volumes*) to the + remote S3 *bucket* for the given *run_id*. An optional list of *patterns* (shell-style advanced globs) can be passed to selectively exclude part of the local *workdir* from being uploaded. @@ -80,8 +83,23 @@ def upload_workdir(workdir: Path, bucket: S3Bucket, run_id: str, patterns: List[ with fsspec.open(object_url(remote_workdir), "wb", auto_mkdir = False) as remote_file: with ZipFile(remote_file, "w") as zipfile: for path in walk(workdir, excluded): - print("zipping:", path) - zipfile.write(str(path), str(path.relative_to(workdir))) + dst = path.relative_to(workdir) + print(f"zipping: {path}" + (f" (as {dst})" if DEBUGGING else "")) + zipfile.write(str(path), dst) + + for volume in volumes: + # XXX TODO: Use the "walk_up" argument to Path.relative_to() + # once we require Python 3.12. 
+ # -trs, 10 Feb 2025 + try: + prefix = PurePath(volume.name).relative_to("build") + except ValueError: + prefix = PurePath("..", volume.name) + + for path in walk(volume.src, always_excluded): + dst = prefix / path.relative_to(volume.src) + print(f"zipping: {path}" + (f" (as {dst})" if DEBUGGING else "")) + zipfile.write(str(path), dst) return remote_workdir @@ -138,9 +156,19 @@ def download_workdir(remote_workdir: S3Object, workdir: Path, patterns: List[str # …and extract its contents to the workdir. with ZipFile(remote_file) as zipfile: - for member in zipfile.infolist(): - path = Path(member.filename) - + # Completely ignore archive members with unsafe paths (absolute or + # upwards-traversing) instead of relying on zipfile.extract()'s + # default of munging them to be "safe". Munging seems more + # confusing than skipping, and skipping is essential in the case of + # additional volumes being uploaded in the workdir initially. + safe_members = [ + (filename, member) + for filename, member + in ((PurePath(m.filename), m) for m in zipfile.infolist()) + if not filename.is_absolute() + and os.path.pardir not in filename.parts ] + + for path, member in safe_members: # Inclusions negate exclusions but aren't an exhaustive # list of what is included. if selected(path) and (included(path) or not excluded(path)): @@ -179,7 +207,7 @@ def path_matcher(patterns: Iterable[str]) -> PathMatcher: Generate a function which matches a Path object against the list of glob *patterns*. """ - def matches(path: Path) -> bool: + def matches(path: Union[Path, PurePath]) -> bool: return any(map(path.match, patterns)) return matches diff --git a/nextstrain/cli/runner/docker.py b/nextstrain/cli/runner/docker.py index aecd30e4..08a851fe 100644 --- a/nextstrain/cli/runner/docker.py +++ b/nextstrain/cli/runner/docker.py @@ -112,6 +112,10 @@ class IMAGE_FEATURE(Enum): # /nextstrain/env.d support first present. envd = "build-20230613T204512Z" + # AWS Batch: support for volume overlays (i.e. 
../ in archive members and + # file overwriting) in ZIP extraction. + aws_batch_overlays = "build-20250304T041009Z" + def register_arguments(parser) -> None: # Docker development options diff --git a/nextstrain/cli/util.py b/nextstrain/cli/util.py index f7b7769a..3161dd02 100644 --- a/nextstrain/cli/util.py +++ b/nextstrain/cli/util.py @@ -9,7 +9,7 @@ from importlib.metadata import distribution as distribution_info, PackageNotFoundError from typing import Any, Callable, Iterable, Literal, Mapping, List, Optional, Sequence, Tuple, Union, overload from packaging.version import parse as parse_version -from pathlib import Path +from pathlib import Path, PurePath from shlex import quote as shquote from shutil import which from textwrap import dedent, indent @@ -553,7 +553,7 @@ def split_image_name(name: str, implicit_latest: bool = True) -> Tuple[str, Opti return (repository, tag) -def glob_matcher(patterns: Sequence[str], *, root: Path = None) -> Callable[[Union[str, Path]], bool]: +def glob_matcher(patterns: Sequence[str], *, root: Path = None) -> Callable[[Union[str, Path, PurePath]], bool]: """ Generate a function which matches a string or path-like object against the list of Bash-like glob *patterns*. @@ -563,13 +563,13 @@ def glob_matcher(patterns: Sequence[str], *, root: Path = None) -> Callable[[Uni See :func:`glob_match` for supported pattern features. """ - def matcher(path: Union[str, Path]) -> bool: + def matcher(path: Union[str, Path, PurePath]) -> bool: return glob_match(path, patterns, root = root) return matcher -def glob_match(path: Union[str, Path], patterns: Union[str, Sequence[str]], *, root: Path = None) -> bool: +def glob_match(path: Union[str, Path, PurePath], patterns: Union[str, Sequence[str]], *, root: Path = None) -> bool: """ Test if *path* matches any of the glob *patterns*.