diff --git a/CHANGES.md b/CHANGES.md
index e1e17277..2cf50758 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -30,6 +30,16 @@ development source code and as such may not be routinely kept up to date.
   between argument/option descriptions.
   ([#419][])
 
+* AWS Batch builds now support development overlays such as [`--augur`][] and
+  [`--auspice`][].  To use this functionality, you'll need
+  `nextstrain/base:build-20250304T041009Z` or newer of the Nextstrain Docker
+  runtime image.  Compatibility of the runtime image is checked automatically
+  when overlays are used with AWS Batch.
+  ([#419][])
+
+[`--augur`]: https://docs.nextstrain.org/projects/cli/en/__NEXT__/commands/build/#cmdoption-nextstrain-build-augur
+[`--auspice`]: https://docs.nextstrain.org/projects/cli/en/__NEXT__/commands/build/#cmdoption-nextstrain-build-auspice
+
 ## Bug fixes
 
 * Fixed a rare but possible error case in `nextstrain view` and `nextstrain
diff --git a/nextstrain/cli/command/build.py b/nextstrain/cli/command/build.py
index f19c46e3..83d5d231 100644
--- a/nextstrain/cli/command/build.py
+++ b/nextstrain/cli/command/build.py
@@ -24,8 +24,8 @@
 from ..argparse import add_extended_help_flags, AppendOverwriteDefault, SKIP_AUTO_DEFAULT_IN_HELP
 from ..debug import debug
 from ..errors import UsageError, UserError
-from ..runner import docker, singularity
-from ..util import byte_quantity, runner_name, warn
+from ..runner import docker, singularity, aws_batch
+from ..util import byte_quantity, runner_name, split_image_name, warn
 from ..volume import NamedVolume
 
 
@@ -306,10 +306,31 @@ def assert_overlay_volumes_support(opts):
     """
     overlay_volumes = opts.volumes
 
-    if overlay_volumes and opts.__runner__ not in {docker, singularity}:
+    if not overlay_volumes:
+        return
+
+    if opts.__runner__ not in {docker, singularity, aws_batch}:
         raise UserError(f"""
             The {runner_name(opts.__runner__)} runtime does not support overlays (e.g. of {overlay_volumes[0].name}).
 
-            Use the Docker or Singularity runtimes (via --docker or --singularity) if overlays are necessary.
+            Use the Docker, Singularity, or AWS Batch runtimes (via --docker,
+            --singularity, or --aws-batch) if overlays are necessary.
+            """)
+
+    if opts.__runner__ is aws_batch and not docker.image_supports(docker.IMAGE_FEATURE.aws_batch_overlays, opts.image):
+        raise UserError(f"""
+            The Nextstrain runtime image version in use
+
+                {opts.image}
+
+            is too old to support overlays (e.g. of {overlay_volumes[0].name}) with AWS Batch.
+
+            If overlays are necessary, please update the runtime image to at
+            least version
+
+                {split_image_name(opts.image)[0]}:{docker.IMAGE_FEATURE.aws_batch_overlays.value}
+
+            using `nextstrain update docker`.  Alternatively, use the Docker or
+            Singularity runtime (via --docker or --singularity) instead.
             """)
diff --git a/nextstrain/cli/runner/aws_batch/__init__.py b/nextstrain/cli/runner/aws_batch/__init__.py
index 9d8219ef..9629743b 100644
--- a/nextstrain/cli/runner/aws_batch/__init__.py
+++ b/nextstrain/cli/runner/aws_batch/__init__.py
@@ -164,8 +164,15 @@ def register_arguments(parser) -> None:
 
 
 def run(opts, argv, working_volume = None, extra_env: Env = {}, cpus: int = None, memory: int = None) -> int:
+    docker.assert_volumes_exist(opts.volumes)
+
+    # "build" is a special-cased volume for AWS Batch, as /nextstrain/build is
+    # the fixed initial working directory and what we'll populate by extracting
+    # a ZIP file.
+    build_volume = next((v for v in opts.volumes if v and v.name == "build"), None)
+    opts.volumes = [v for v in opts.volumes if v is not build_volume]
 
     # Unlike other runners, the AWS Batch runner currently *requires* a working
     # dir in most usages.  This is ok as we only provide the AWS Batch runner
     # for commands which also require a working dir (e.g. build), whereas other
@@ -213,8 +220,11 @@
     # Upload workdir to S3 so it can be fetched at the start of the Batch job.
     print_stage("Uploading %s to S3" % local_workdir)
 
+    for volume in opts.volumes:
+        print("  and %s as %s" % (volume.src.resolve(strict = True), volume.name))
+
     bucket = s3.bucket(opts.s3_bucket)
-    remote_workdir = s3.upload_workdir(local_workdir, bucket, run_id, opts.exclude_from_upload)
+    remote_workdir = s3.upload_workdir(local_workdir, bucket, run_id, opts.exclude_from_upload, opts.volumes)
 
     print("uploaded:", s3.object_url(remote_workdir))
diff --git a/nextstrain/cli/runner/aws_batch/s3.py b/nextstrain/cli/runner/aws_batch/s3.py
index 663576e4..6f7c54b4 100644
--- a/nextstrain/cli/runner/aws_batch/s3.py
+++ b/nextstrain/cli/runner/aws_batch/s3.py
@@ -5,21 +5,24 @@
 import binascii
 import boto3
 import fsspec
+import os.path
 from botocore.config import Config
 from botocore.exceptions import ClientError
 from calendar import timegm
 from os import utime
-from pathlib import Path
+from pathlib import Path, PurePath
 from time import struct_time
-from typing import Callable, Generator, Iterable, List, Optional, Any
+from typing import Callable, Generator, Iterable, List, Optional, Any, Union
 from urllib.parse import urlparse
 from zipfile import ZipFile, ZipInfo
 
 from ... import env
+from ...debug import DEBUGGING
 from ...types import Env, S3Bucket, S3Object
 from ...util import glob_matcher
+from ...volume import NamedVolume
 
-PathMatcher = Callable[[Path], bool]
+PathMatcher = Callable[[Union[Path, PurePath]], bool]
 
 
 def object_url(object: S3Object) -> str:
@@ -38,10 +41,10 @@
     return bucket(url.netloc).Object(key)
 
 
-def upload_workdir(workdir: Path, bucket: S3Bucket, run_id: str, patterns: List[str] = None) -> S3Object:
+def upload_workdir(workdir: Path, bucket: S3Bucket, run_id: str, patterns: List[str] = None, volumes: List[NamedVolume] = []) -> S3Object:
     """
-    Upload a ZIP archive of the local *workdir* to the remote S3 *bucket* for
-    the given *run_id*.
+    Upload a ZIP archive of the local *workdir* (and optional *volumes*) to the
+    remote S3 *bucket* for the given *run_id*.
 
     An optional list of *patterns* (shell-style advanced globs) can be passed
     to selectively exclude part of the local *workdir* from being uploaded.
@@ -80,8 +83,23 @@
     with fsspec.open(object_url(remote_workdir), "wb", auto_mkdir = False) as remote_file:
         with ZipFile(remote_file, "w") as zipfile:
             for path in walk(workdir, excluded):
-                print("zipping:", path)
-                zipfile.write(str(path), str(path.relative_to(workdir)))
+                dst = path.relative_to(workdir)
+                print(f"zipping: {path}" + (f" (as {dst})" if DEBUGGING else ""))
+                zipfile.write(str(path), dst)
+
+            for volume in volumes:
+                # XXX TODO: Use the "walk_up" argument to Path.relative_to()
+                # once we require Python 3.12.
+                # -trs, 10 Feb 2025
+                try:
+                    prefix = PurePath(volume.name).relative_to("build")
+                except ValueError:
+                    prefix = PurePath("..", volume.name)
+
+                for path in walk(volume.src, always_excluded):
+                    dst = prefix / path.relative_to(volume.src)
+                    print(f"zipping: {path}" + (f" (as {dst})" if DEBUGGING else ""))
+                    zipfile.write(str(path), dst)
 
     return remote_workdir
@@ -138,9 +156,19 @@
     # …and extract its contents to the workdir.
     with ZipFile(remote_file) as zipfile:
-        for member in zipfile.infolist():
-            path = Path(member.filename)
-
+        # Completely ignore archive members with unsafe paths (absolute or
+        # upwards-traversing) instead of relying on zipfile.extract()'s
+        # default of munging them to be "safe".  Munging seems more
+        # confusing than skipping, and skipping is essential in the case of
+        # additional volumes being uploaded in the workdir initially.
+        safe_members = [
+            (filename, member)
+                for filename, member
+                 in ((PurePath(m.filename), m) for m in zipfile.infolist())
+                 if not filename.is_absolute()
+                and os.path.pardir not in filename.parts ]
+
+        for path, member in safe_members:
             # Inclusions negate exclusions but aren't an exhaustive
             # list of what is included.
             if selected(path) and (included(path) or not excluded(path)):
@@ -179,7 +207,7 @@ def path_matcher(patterns: Iterable[str]) -> PathMatcher:
     Generate a function which matches a Path object against the list of glob
     *patterns*.
     """
-    def matches(path: Path) -> bool:
+    def matches(path: Union[Path, PurePath]) -> bool:
         return any(map(path.match, patterns))
 
     return matches
diff --git a/nextstrain/cli/runner/docker.py b/nextstrain/cli/runner/docker.py
index aecd30e4..08a851fe 100644
--- a/nextstrain/cli/runner/docker.py
+++ b/nextstrain/cli/runner/docker.py
@@ -112,6 +112,10 @@ class IMAGE_FEATURE(Enum):
     # /nextstrain/env.d support first present.
     envd = "build-20230613T204512Z"
 
+    # AWS Batch: support for volume overlays (i.e. ../ in archive members and
+    # file overwriting) in ZIP extraction.
+    aws_batch_overlays = "build-20250304T041009Z"
+
 
 def register_arguments(parser) -> None:
     # Docker development options
diff --git a/nextstrain/cli/util.py b/nextstrain/cli/util.py
index f7b7769a..3161dd02 100644
--- a/nextstrain/cli/util.py
+++ b/nextstrain/cli/util.py
@@ -9,7 +9,7 @@
 from importlib.metadata import distribution as distribution_info, PackageNotFoundError
 from typing import Any, Callable, Iterable, Literal, Mapping, List, Optional, Sequence, Tuple, Union, overload
 from packaging.version import parse as parse_version
-from pathlib import Path
+from pathlib import Path, PurePath
 from shlex import quote as shquote
 from shutil import which
 from textwrap import dedent, indent
@@ -553,7 +553,7 @@
     return (repository, tag)
 
 
-def glob_matcher(patterns: Sequence[str], *, root: Path = None) -> Callable[[Union[str, Path]], bool]:
+def glob_matcher(patterns: Sequence[str], *, root: Path = None) -> Callable[[Union[str, Path, PurePath]], bool]:
     """
     Generate a function which matches a string or path-like object against the
     list of Bash-like glob *patterns*.
@@ -563,13 +563,13 @@ def glob_matcher(patterns: Sequence[str], *, root: Path = None) -> Callable[[Uni
     See :func:`glob_match` for supported pattern features.
""" - def matcher(path: Union[str, Path]) -> bool: + def matcher(path: Union[str, Path, PurePath]) -> bool: return glob_match(path, patterns, root = root) return matcher -def glob_match(path: Union[str, Path], patterns: Union[str, Sequence[str]], *, root: Path = None) -> bool: +def glob_match(path: Union[str, Path, PurePath], patterns: Union[str, Sequence[str]], *, root: Path = None) -> bool: """ Test if *path* matches any of the glob *patterns*.