add option for additional ebs storage and retry strategy to aws batch (
ndharasz authored Jul 2, 2024
1 parent 26b82f1 commit 7f64cb4
Showing 5 changed files with 80 additions and 11 deletions.
23 changes: 17 additions & 6 deletions numerai/cli/node/config.py
@@ -1,4 +1,5 @@
"""Config command for Numerai CLI"""

import json
import os
import click
@@ -94,6 +95,12 @@
    help="Forces your webhook to register with Numerai. "
    "Use in conjunction with options that prevent webhook auto-registering.",
)
@click.option(
    "--volume",
    "-v",
    type=int,
    help="Specify additional block storage in GB. Currently only supported in AWS.",
)
@click.pass_context
def config(
    ctx,
@@ -107,6 +114,7 @@ def config(
    cron,
    timeout_minutes,
    register_webhook,
    volume,
):
    """
    Uses Terraform to create a full Numerai Compute cluster in your desired provider.
@@ -262,12 +270,12 @@ def config(
        provider_registry_conf = create_gcp_registry(provider, verbose=verbose)
        node_conf.update(provider_registry_conf)
        registry_parts = node_conf["registry_id"].split("/")
        node_conf[
            "artifact_registry_login_url"
        ] = f"https://{registry_parts[3]}-docker.pkg.dev/"
        node_conf[
            "docker_repo"
        ] = f"{registry_parts[3]}-docker.pkg.dev/{registry_parts[1]}/numerai-container-registry/{node}:latest"
        node_conf["artifact_registry_login_url"] = (
            f"https://{registry_parts[3]}-docker.pkg.dev/"
        )
        node_conf["docker_repo"] = (
            f"{registry_parts[3]}-docker.pkg.dev/{registry_parts[1]}/numerai-container-registry/{node}:latest"
        )
        docker.login(node_conf, verbose)
        try:
            docker.manifest_inspect(node_conf["docker_repo"], verbose)
@@ -276,6 +284,9 @@
            docker.tag("hello-world:linux", node_conf["docker_repo"], verbose)
            docker.push(node_conf["docker_repo"], verbose)
        nodes_config[node] = node_conf
    elif provider == "aws":
        if volume is not None:
            node_conf["volume"] = volume

    store_config(NODES_PATH, nodes_config)

6 changes: 6 additions & 0 deletions numerai/terraform/aws/-inputs.tf
@@ -27,3 +27,9 @@ variable "gateway_stage_path" {
  type    = string
  default = "v1"
}

variable "volume_size" {
  description = "Size of the EC2 volumes in GB"
  type        = number
  default     = 0
}
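The CLI is expected to supply this value (the plumbing from node_conf["volume"] to this Terraform variable is not part of the hunks shown here), and the default of 0 leaves the launch template unchanged. For manual testing outside the CLI, the root variable can be overridden like any other Terraform input; the 100 GB figure below is purely illustrative and not part of this commit:

# Illustrative terraform.tfvars entry (sketch only): any value greater than 0
# makes the launch template in cluster.tf attach a larger gp3 volume.
volume_size = 100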
1 change: 1 addition & 0 deletions numerai/terraform/aws/-main.tf
@@ -23,4 +23,5 @@ module "aws" {
  nodes               = local.aws_nodes
  node_container_port = var.node_container_port
  gateway_stage_path  = var.gateway_stage_path
  volume_size         = var.volume_size
}
6 changes: 6 additions & 0 deletions numerai/terraform/aws/aws/-inputs.tf
@@ -26,3 +26,9 @@ variable "gateway_stage_path" {
  type    = string
  default = "v1"
}

variable "volume_size" {
  description = "Size of the EC2 volumes in GB"
  type        = number
  nullable    = true
}
55 changes: 50 additions & 5 deletions numerai/terraform/aws/aws/cluster.tf
@@ -84,12 +84,41 @@ resource "aws_iam_role_policy_attachment" "aws_batch_service_role" {
  policy_arn = "arn:aws:iam::aws:policy/service-role/AWSBatchServiceRole"
}

data "aws_ami" "ecs_al2" {
  most_recent = true
  filter {
    name   = "name"
    values = ["amzn2-ami-ecs-hvm-*-x86_64-ebs"]
  }
}

resource "aws_launch_template" "node" {
image_id = data.aws_ami.ecs_al2.id
dynamic "block_device_mappings" {
for_each = var.volume_size > 0 ? {size: var.volume_size} : {}
content {
device_name = "/dev/xvda"

ebs {
encrypted = true
volume_size = each.size
volume_type = "gp3"
}
}
}
}
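For reference, when volume_size is greater than zero the dynamic block above emits exactly one block_device_mappings block; rendered statically with an illustrative 100 GB value (a sketch, not part of the commit) it is roughly:

block_device_mappings {
  device_name = "/dev/xvda"

  ebs {
    encrypted   = true
    volume_size = 100 # illustrative; comes from var.volume_size
    volume_type = "gp3"
  }
}

With volume_size left at its default of 0, the for_each map is empty, no block_device_mappings block is generated, and the AMI's default root volume is used.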

resource "aws_batch_compute_environment" "node" {
compute_environment_name = local.node_prefix
compute_environment_name_prefix = "${local.node_prefix}-"

compute_resources {
instance_role = aws_iam_instance_profile.batch_ecs_instance_role.arn

launch_template {
launch_template_id = aws_launch_template.node.id
version = "$Latest"
}

max_vcpus = 64

security_group_ids = [
@@ -106,6 +135,10 @@ resource "aws_batch_compute_environment" "node" {
  service_role = aws_iam_role.aws_batch_service_role.arn
  type         = "MANAGED"
  depends_on   = [aws_iam_role_policy_attachment.aws_batch_service_role]

  lifecycle {
    create_before_destroy = true
  }
}


@@ -129,9 +162,10 @@ resource "aws_batch_job_queue" "node" {
  state    = "ENABLED"
  priority = 1

  compute_environments = [
    aws_batch_compute_environment.node.arn
  ]
  compute_environment_order {
    order               = 1
    compute_environment = aws_batch_compute_environment.node.arn
  }
}

resource "aws_batch_job_definition" "node" {
@@ -145,7 +179,18 @@

  retry_strategy {
    attempts = 2

    evaluate_on_exit {
      on_reason = "CannotInspectContainerError:*"
      action    = "RETRY"
    }
    evaluate_on_exit {
      on_reason = "CannotPullContainerError:*"
      action    = "RETRY"
    }
    evaluate_on_exit {
      action    = "RETRY"
      on_reason = "CannotStartContainerError:*"
    }
    evaluate_on_exit {
      action    = "RETRY"
      on_reason = "Task failed to start"
