Skip to content

Commit

Permalink
runner.aws_batch: Support overlay volumes (e.g. --augur)
Browse files Browse the repository at this point in the history
The latest image, nextstrain/base:build-20250304T041009Z, provides a
mechanism in the entrypoint to support bundling of overlays in the
workdir ZIP archive by way of upwards-traversing archive member paths.¹
For example, an Augur overlay is bundled into the workdir ZIP archive
with member paths starting with ../augur/ and ends up overwriting files
in the image's /nextstrain/augur/ since the AWS Batch workdir is always
/nextstrain/build/.

Extending overlay support to AWS Batch has been very low priority and
something I thought was unlikely to ever happen.  However, in the course
of working on AWS Batch support for `nextstrain run`, it turned out that the
easiest, most straightforward, and most minimal change was to bundle the
pathogen source directory with the working analysis directory in the
workdir ZIP archive, i.e. as a "pathogen" overlay.  This naturally led
to supporting overlays more generally, which I've done here.

One caveat compared to overlays in runtimes with the concept of volume
mounts (Docker, Singularity) is that any files in the image that do not
exist in the overlaid files will remain present since nothing removes
them.  This is potentially problematic and will be annoying if run into
but most of the time should be a non-issue.  It is also solvable if we
care to expend the effort and write the extra code to do so.  I don't right now.

¹ <nextstrain/docker-base#241>
  • Loading branch information
tsibley committed Mar 5, 2025
1 parent 7dae612 commit 40e4a7a
Show file tree
Hide file tree
Showing 6 changed files with 94 additions and 21 deletions.
10 changes: 10 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,16 @@ development source code and as such may not be routinely kept up to date.
between argument/option descriptions.
([#419][])

* AWS Batch builds now support development overlays such as [`--augur`][] and
[`--auspice`][]. To use this functionality, you'll need version
`nextstrain/base:build-20250304T041009Z` or newer of the Nextstrain Docker
runtime image. Compatibility of the runtime image is checked automatically
when overlays are used with AWS Batch.
([#419][])

[`--augur`]: https://docs.nextstrain.org/projects/cli/en/__NEXT__/commands/build/#cmdoption-nextstrain-build-augur
[`--auspice`]: https://docs.nextstrain.org/projects/cli/en/__NEXT__/commands/build/#cmdoption-nextstrain-build-auspice

## Bug fixes

* Fixed a rare but possible error case in `nextstrain view` and `nextstrain
Expand Down
29 changes: 25 additions & 4 deletions nextstrain/cli/command/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@
from ..argparse import add_extended_help_flags, AppendOverwriteDefault, SKIP_AUTO_DEFAULT_IN_HELP
from ..debug import debug
from ..errors import UsageError, UserError
from ..runner import docker, singularity
from ..util import byte_quantity, runner_name, warn
from ..runner import docker, singularity, aws_batch
from ..util import byte_quantity, runner_name, split_image_name, warn
from ..volume import NamedVolume


Expand Down Expand Up @@ -306,10 +306,31 @@ def assert_overlay_volumes_support(opts):
"""
overlay_volumes = opts.volumes

if overlay_volumes and opts.__runner__ not in {docker, singularity}:
if not overlay_volumes:
return

if opts.__runner__ not in {docker, singularity, aws_batch}:
raise UserError(f"""
The {runner_name(opts.__runner__)} runtime does not support overlays (e.g. of {overlay_volumes[0].name}).
Use the Docker or Singularity runtimes (via --docker or --singularity) if overlays are necessary.
Use the Docker, Singularity, or AWS Batch runtimes (via --docker,
--singularity, or --aws-batch) if overlays are necessary.
""")

if opts.__runner__ is aws_batch and not docker.image_supports(docker.IMAGE_FEATURE.aws_batch_overlays, opts.image):
raise UserError(f"""
The Nextstrain runtime image version in use
{opts.image}
is too old to support overlays (e.g. of {overlay_volumes[0].name}) with AWS Batch.
If overlays are necessary, please update the runtime image to at
least version
{split_image_name(opts.image)[0]}:{docker.IMAGE_FEATURE.aws_batch_overlays.value}
using `nextstrain update docker`. Alternatively, use the Docker or
Singularity runtime (via --docker or --singularity) instead.
""")


Expand Down
12 changes: 11 additions & 1 deletion nextstrain/cli/runner/aws_batch/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,8 +164,15 @@ def register_arguments(parser) -> None:


def run(opts, argv, working_volume = None, extra_env: Env = {}, cpus: int = None, memory: int = None) -> int:
docker.assert_volumes_exist(opts.volumes)

# "build" is a special-cased volume for AWS Batch, as /nextstrain/build is
# the fixed initial working directory and what we'll populate by extracting
# a ZIP file.
build_volume = next((v for v in opts.volumes if v and v.name == "build"), None)

opts.volumes = [v for v in opts.volumes if v is not build_volume]

# Unlike other runners, the AWS Batch runner currently *requires* a working
# dir in most usages. This is ok as we only provide the AWS Batch runner
# for commands which also require a working dir (e.g. build), whereas other
Expand Down Expand Up @@ -213,8 +220,11 @@ def run(opts, argv, working_volume = None, extra_env: Env = {}, cpus: int = None
# Upload workdir to S3 so it can be fetched at the start of the Batch job.
print_stage("Uploading %s to S3" % local_workdir)

for volume in opts.volumes:
print(" and %s as %s" % (volume.src.resolve(strict = True), volume.name))

bucket = s3.bucket(opts.s3_bucket)
remote_workdir = s3.upload_workdir(local_workdir, bucket, run_id, opts.exclude_from_upload)
remote_workdir = s3.upload_workdir(local_workdir, bucket, run_id, opts.exclude_from_upload, opts.volumes)

print("uploaded:", s3.object_url(remote_workdir))

Expand Down
52 changes: 40 additions & 12 deletions nextstrain/cli/runner/aws_batch/s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,24 @@
import binascii
import boto3
import fsspec
import os.path
from botocore.config import Config
from botocore.exceptions import ClientError
from calendar import timegm
from os import utime
from pathlib import Path
from pathlib import Path, PurePath
from time import struct_time
from typing import Callable, Generator, Iterable, List, Optional, Any
from typing import Callable, Generator, Iterable, List, Optional, Any, Union
from urllib.parse import urlparse
from zipfile import ZipFile, ZipInfo
from ... import env
from ...debug import DEBUGGING
from ...types import Env, S3Bucket, S3Object
from ...util import glob_matcher
from ...volume import NamedVolume


PathMatcher = Callable[[Path], bool]
PathMatcher = Callable[[Union[Path, PurePath]], bool]


def object_url(object: S3Object) -> str:
Expand All @@ -38,10 +41,10 @@ def object_from_url(s3url: str) -> S3Object:
return bucket(url.netloc).Object(key)


def upload_workdir(workdir: Path, bucket: S3Bucket, run_id: str, patterns: List[str] = None) -> S3Object:
def upload_workdir(workdir: Path, bucket: S3Bucket, run_id: str, patterns: List[str] = None, volumes: List[NamedVolume] = []) -> S3Object:
"""
Upload a ZIP archive of the local *workdir* to the remote S3 *bucket* for
the given *run_id*.
Upload a ZIP archive of the local *workdir* (and optional *volumes*) to the
remote S3 *bucket* for the given *run_id*.
An optional list of *patterns* (shell-style advanced globs) can be passed
to selectively exclude part of the local *workdir* from being uploaded.
Expand Down Expand Up @@ -80,8 +83,23 @@ def upload_workdir(workdir: Path, bucket: S3Bucket, run_id: str, patterns: List[
with fsspec.open(object_url(remote_workdir), "wb", auto_mkdir = False) as remote_file:
with ZipFile(remote_file, "w") as zipfile:
for path in walk(workdir, excluded):
print("zipping:", path)
zipfile.write(str(path), str(path.relative_to(workdir)))
dst = path.relative_to(workdir)
print(f"zipping: {path}" + (f" (as {dst})" if DEBUGGING else ""))
zipfile.write(str(path), dst)

for volume in volumes:
# XXX TODO: Use the "walk_up" argument to Path.relative_to()
# once we require Python 3.12.
# -trs, 10 Feb 2025
try:
prefix = PurePath(volume.name).relative_to("build")
except ValueError:
prefix = PurePath("..", volume.name)

for path in walk(volume.src, always_excluded):
dst = prefix / path.relative_to(volume.src)
print(f"zipping: {path}" + (f" (as {dst})" if DEBUGGING else ""))
zipfile.write(str(path), dst)

return remote_workdir

Expand Down Expand Up @@ -138,9 +156,19 @@ def download_workdir(remote_workdir: S3Object, workdir: Path, patterns: List[str

# …and extract its contents to the workdir.
with ZipFile(remote_file) as zipfile:
for member in zipfile.infolist():
path = Path(member.filename)

# Completely ignore archive members with unsafe paths (absolute or
# upwards-traversing) instead of relying on zipfile.extract()'s
# default of munging them to be "safe". Munging seems more
# confusing than skipping, and skipping is essential in the case of
# additional volumes being uploaded in the workdir initially.
safe_members = [
(filename, member)
for filename, member
in ((PurePath(m.filename), m) for m in zipfile.infolist())
if not filename.is_absolute()
and os.path.pardir not in filename.parts ]

for path, member in safe_members:
# Inclusions negate exclusions but aren't an exhaustive
# list of what is included.
if selected(path) and (included(path) or not excluded(path)):
Expand Down Expand Up @@ -179,7 +207,7 @@ def path_matcher(patterns: Iterable[str]) -> PathMatcher:
Generate a function which matches a Path object against the list of glob
*patterns*.
"""
def matches(path: Path) -> bool:
def matches(path: Union[Path, PurePath]) -> bool:
return any(map(path.match, patterns))

return matches
Expand Down
4 changes: 4 additions & 0 deletions nextstrain/cli/runner/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,10 @@ class IMAGE_FEATURE(Enum):
# /nextstrain/env.d support first present.
envd = "build-20230613T204512Z"

# AWS Batch: support for volume overlays (i.e. ../ in archive members and
# file overwriting) in ZIP extraction.
aws_batch_overlays = "build-20250304T041009Z"


def register_arguments(parser) -> None:
# Docker development options
Expand Down
8 changes: 4 additions & 4 deletions nextstrain/cli/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from importlib.metadata import distribution as distribution_info, PackageNotFoundError
from typing import Any, Callable, Iterable, Literal, Mapping, List, Optional, Sequence, Tuple, Union, overload
from packaging.version import parse as parse_version
from pathlib import Path
from pathlib import Path, PurePath
from shlex import quote as shquote
from shutil import which
from textwrap import dedent, indent
Expand Down Expand Up @@ -553,7 +553,7 @@ def split_image_name(name: str, implicit_latest: bool = True) -> Tuple[str, Opti
return (repository, tag)


def glob_matcher(patterns: Sequence[str], *, root: Path = None) -> Callable[[Union[str, Path]], bool]:
def glob_matcher(patterns: Sequence[str], *, root: Path = None) -> Callable[[Union[str, Path, PurePath]], bool]:
"""
Generate a function which matches a string or path-like object against the
list of Bash-like glob *patterns*.
Expand All @@ -563,13 +563,13 @@ def glob_matcher(patterns: Sequence[str], *, root: Path = None) -> Callable[[Uni
See :func:`glob_match` for supported pattern features.
"""
def matcher(path: Union[str, Path]) -> bool:
def matcher(path: Union[str, Path, PurePath]) -> bool:
return glob_match(path, patterns, root = root)

return matcher


def glob_match(path: Union[str, Path], patterns: Union[str, Sequence[str]], *, root: Path = None) -> bool:
def glob_match(path: Union[str, Path, PurePath], patterns: Union[str, Sequence[str]], *, root: Path = None) -> bool:
"""
Test if *path* matches any of the glob *patterns*.
Expand Down

0 comments on commit 40e4a7a

Please sign in to comment.