From 05706947d36e43ac3e9d1380ced49844030e8281 Mon Sep 17 00:00:00 2001 From: Noah Harasz Date: Fri, 12 Jul 2024 17:14:23 -0700 Subject: [PATCH] v1.1.1 - fix aws volumes and queue limits (#90) - fix aws volume handling by adding a separate function to handle setting up volumes - fix potential issue with queue limitation --- numerai/cli/__init__.py | 1 + numerai/cli/destroy_all.py | 1 + numerai/cli/misc.py | 57 ++++++++++++++--- numerai/cli/node/__init__.py | 1 - numerai/cli/node/config.py | 92 ++++------------------------ numerai/cli/node/destroy.py | 3 +- numerai/cli/setup.py | 30 +++++---- numerai/cli/util/terraform.py | 78 +++++++++++++++++++++++ numerai/terraform/aws/-inputs.tf | 6 -- numerai/terraform/aws/-main.tf | 1 - numerai/terraform/aws/aws/-inputs.tf | 6 -- numerai/terraform/aws/aws/-locals.tf | 1 + numerai/terraform/aws/aws/cluster.tf | 33 +++++----- numerai/terraform/aws/aws/webhook.tf | 2 +- setup.py | 2 +- 15 files changed, 176 insertions(+), 138 deletions(-) create mode 100644 numerai/cli/util/terraform.py diff --git a/numerai/cli/__init__.py b/numerai/cli/__init__.py index 6e25bd2..e44ddcf 100644 --- a/numerai/cli/__init__.py +++ b/numerai/cli/__init__.py @@ -35,5 +35,6 @@ def main(): numerai.add_command(upgrade.upgrade) numerai.add_command(misc.copy_example) numerai.add_command(misc.list_constants) + numerai.add_command(misc.add_volume_aws) numerai.add_command(destroy_all.destroy_all) numerai() diff --git a/numerai/cli/destroy_all.py b/numerai/cli/destroy_all.py index 07b0fd8..8834dbc 100644 --- a/numerai/cli/destroy_all.py +++ b/numerai/cli/destroy_all.py @@ -1,4 +1,5 @@ """Destroy command for Numerai CLI""" + import click from numerapi import base_api diff --git a/numerai/cli/misc.py b/numerai/cli/misc.py index 9e86878..3957355 100644 --- a/numerai/cli/misc.py +++ b/numerai/cli/misc.py @@ -1,9 +1,8 @@ -import json - -import click - from numerai.cli.constants import * from numerai.cli.util import files +from numerai.cli.util.terraform import apply_terraform + +import click @click.command() @@ -12,13 +11,15 @@ "-e", type=click.Choice(EXAMPLES), default=DEFAULT_EXAMPLE, - help=f"Specify the example to copy, defaults to {DEFAULT_EXAMPLE}. " f"Options are {EXAMPLES}.", + help=f"Specify the example to copy, defaults to {DEFAULT_EXAMPLE}. " + f"Options are {EXAMPLES}.", ) @click.option( "--dest", "-d", type=str, - help=f"Destination folder to which example code is written. " f"Defaults to the name of the example.", + help=f"Destination folder to which example code is written. " + f"Defaults to the name of the example.", ) @click.option("--verbose", "-v", is_flag=True) def copy_example(example, dest, verbose): @@ -49,14 +50,50 @@ def list_constants(): click.secho( f" {size} -> cpus: {preset[0] / 1024}, " f"mem: {preset[1] / 1024} GB {suffix}", - fg="green" - if size == DEFAULT_SIZE or size == DEFAULT_SIZE_GCP - else "yellow", + fg=( + "green" + if size == DEFAULT_SIZE or size == DEFAULT_SIZE_GCP + else "yellow" + ), ) - click.secho("Due to GCP Cloud Run size constraints, 'mem' sizes are not allowed when using GCP.") + click.secho( + "Due to GCP Cloud Run size constraints, 'mem' sizes are not allowed when using GCP." + ) click.secho( "For AWS, use one of these sizes, or specify your own CPU and Memory in cores and GB using --cpu and --memory options.\n" "See https://learn.microsoft.com/en-us/azure/container-apps/containers#configuration for Azure,\n" "or https://cloud.google.com/run/docs/configuring/services/memory-limits for GCP \n" "to learn more info about allowed size presets for those providers." ) + + +@click.command() +@click.option( + "--size", + "-s", + type=int, + required=True, + help="Specify the volume size in GB you'd like your AWS nodes to share.", +) +@click.option("--verbose", "-v", is_flag=True) +def add_volume_aws(size, verbose): + """ + Set the volume size for AWS nodes. This volume is shared by all nodes. + """ + click.secho("Setting volume size for AWS nodes...", fg="yellow") + # get nodes config object + nodes_config = files.load_or_init_nodes() + print(nodes_config) + # set volume size for all nodes to same size + for node in nodes_config: + nodes_config[node]["volume"] = size + files.store_config(NODES_PATH, nodes_config) + files.copy_file( + NODES_PATH, + f"{CONFIG_PATH}/{PROVIDER_AWS}/", + force=True, + verbose=verbose, + ) + click.secho(f"Applying terraform to add {size} GB volume...", fg="yellow") + apply_terraform(nodes_config, [PROVIDER_AWS], PROVIDER_AWS, verbose=verbose) + click.secho("Volume size updated successfully!", fg="green") diff --git a/numerai/cli/node/__init__.py b/numerai/cli/node/__init__.py index ce09d0d..1dce751 100644 --- a/numerai/cli/node/__init__.py +++ b/numerai/cli/node/__init__.py @@ -25,7 +25,6 @@ def tournaments_dict(): def get_models(tournament): napi = base_api.Api(*get_numerai_keys()) models = napi.get_models(tournament) - tournaments = napi.raw_query('query { tournaments { name tournament } }') name_prefix = tournaments_dict()[tournament] model_dict = {} for model_name, model_id in models.items(): diff --git a/numerai/cli/node/config.py b/numerai/cli/node/config.py index 6bf4df8..cbb4476 100644 --- a/numerai/cli/node/config.py +++ b/numerai/cli/node/config.py @@ -1,8 +1,7 @@ """Config command for Numerai CLI""" -import json import os -import click + from numerapi import base_api from numerai.cli.constants import ( DEFAULT_PROVIDER, @@ -19,7 +18,6 @@ CONFIG_PATH, PROVIDER_GCP, ) -from numerai.cli.util.docker import terraform, check_for_dockerfile from numerai.cli.util import docker from numerai.cli.util.files import ( load_or_init_nodes, @@ -27,7 +25,14 @@ copy_example, copy_file, ) -from numerai.cli.util.keys import get_provider_keys, get_numerai_keys, load_or_init_keys +from numerai.cli.util.keys import get_provider_keys, get_numerai_keys +from numerai.cli.util.terraform import ( + apply_terraform, + create_azure_registry, + create_gcp_registry, +) + +import click @click.command() @@ -95,12 +100,6 @@ help="Forces your webhook to register with Numerai. " "Use in conjunction with options that prevent webhook auto-registering.", ) -@click.option( - "--volume", - "-v", - type=int, - help="Specify additional block storage in GB. Currently only supported in AWS.", -) @click.pass_context def config( ctx, @@ -114,7 +113,6 @@ def config( cron, timeout_minutes, register_webhook, - volume, ): """ Uses Terraform to create a full Numerai Compute cluster in your desired provider. @@ -234,7 +232,7 @@ def config( click.secho(f'Current node config: "{node_conf}"...') # double check there is a dockerfile in the path we are about to configure - check_for_dockerfile(nodes_config[node]["path"]) + docker.check_for_dockerfile(nodes_config[node]["path"]) store_config(NODES_PATH, nodes_config) # Added after tf directory restructure: copy nodes.json to providers' tf directory @@ -284,43 +282,10 @@ def config( docker.tag("hello-world:linux", node_conf["docker_repo"], verbose) docker.push(node_conf["docker_repo"], verbose) nodes_config[node] = node_conf - elif provider == "aws": - if volume is not None: - node_conf["volume"] = volume store_config(NODES_PATH, nodes_config) - # Apply terraform for any affected provider - for affected_provider in affected_providers: - if affected_provider in PROVIDERS: - click.secho(f"Updating resources in {affected_provider}") - terraform( - "apply -auto-approve", - verbose, - affected_provider, - env_vars=load_or_init_keys(affected_provider), - inputs={"node_config_file": "nodes.json"}, - ) - else: - click.secho(f"provider {affected_provider} not supported", fg="red") - exit(1) - click.secho("cloud resources created successfully", fg="green") - - # terraform output for node config, same for aws and azure - click.echo(f"saving node configuration to {NODES_PATH}...") - - res = terraform(f"output -json {provider}_nodes", verbose, provider).decode("utf-8") - try: - nodes = json.loads(res) - except json.JSONDecodeError: - click.secho("failed to save node configuration, please retry.", fg="red") - return - for node_name, data in nodes.items(): - nodes_config[node_name].update(data) - - store_config(NODES_PATH, nodes_config) - if verbose: - click.secho(f"new config:\n{json.dumps(load_or_init_nodes(), indent=2)}") + apply_terraform(nodes_config, affected_providers, provider, verbose) webhook_url = nodes_config[node]["webhook_url"] napi = base_api.Api(*get_numerai_keys()) @@ -336,38 +301,3 @@ def config( "Prediction Node configured successfully. " "Next: deploy and test your node", fg="green", ) - - -def create_azure_registry(provider, provider_keys, verbose): - """Creates a registry for azure""" - terraform("init -upgrade", verbose, provider) - terraform( - 'apply -target="azurerm_container_registry.registry[0]" -target="azurerm_resource_group.acr_rg[0]" -auto-approve ', - verbose, - "azure", - env_vars=provider_keys, - inputs={"node_config_file": "nodes.json"}, - ) - res = terraform("output -json acr_repo_details", True, provider).decode("utf-8") - return json.loads(res) - - -def create_gcp_registry(provider, verbose): - """Creates a registry for GCP""" - terraform("init -upgrade", verbose, provider) - terraform( - 'apply -target="google_project_service.cloud_resource_manager" -auto-approve ', - verbose, - "gcp", - inputs={"node_config_file": "nodes.json"}, - ) - terraform( - 'apply -target="google_artifact_registry_repository.registry[0]" -auto-approve ', - verbose, - "gcp", - inputs={"node_config_file": "nodes.json"}, - ) - res = terraform("output -json artifact_registry_details", True, provider).decode( - "utf-8" - ) - return json.loads(res) diff --git a/numerai/cli/node/destroy.py b/numerai/cli/node/destroy.py index 0903bb6..a09a480 100644 --- a/numerai/cli/node/destroy.py +++ b/numerai/cli/node/destroy.py @@ -1,4 +1,5 @@ """Destroy command for Numerai CLI""" + import click from numerapi import base_api @@ -41,7 +42,7 @@ def destroy(ctx, preserve_node_config, verbose): fg="red", ) return - + if not preserve_node_config: click.secho("backing up nodes.json...") copy_file(NODES_PATH, f"{NODES_PATH}.backup", force=True, verbose=True) diff --git a/numerai/cli/setup.py b/numerai/cli/setup.py index fd72c38..c4addd5 100644 --- a/numerai/cli/setup.py +++ b/numerai/cli/setup.py @@ -1,4 +1,5 @@ """Setup command for Numerai CLI""" + import click import logging @@ -16,8 +17,9 @@ prompt=True, help=f"Initialize with this providers API keys.", ) +@click.option("--skip-key-setup", "-s", is_flag=True) @click.option("--verbose", "-v", is_flag=True) -def setup(provider, verbose): +def setup(provider, skip_key_setup, verbose): """ Initializes cli and provider API keys. """ @@ -38,20 +40,22 @@ def setup(provider, verbose): return # setup numerai keys - click.secho( - "Initializing numerai keys " "(press enter to keep value in brackets)...", - fg="yellow", - ) - maybe_create(KEYS_PATH, protected=True) - config_numerai_keys() + if not skip_key_setup: + click.secho( + "Initializing numerai keys " "(press enter to keep value in brackets)...", + fg="yellow", + ) + maybe_create(KEYS_PATH, protected=True) + config_numerai_keys() # setup provider keys - click.secho( - f"\nInitializing {provider} keys " - f"(press enter to keep value in brackets)...", - fg="yellow", - ) - config_provider_keys(provider) + if not skip_key_setup: + click.secho( + f"\nInitializing {provider} keys " + f"(press enter to keep value in brackets)...", + fg="yellow", + ) + config_provider_keys(provider) # copy tf files click.secho("copying terraform files...") diff --git a/numerai/cli/util/terraform.py b/numerai/cli/util/terraform.py new file mode 100644 index 0000000..0f23b4d --- /dev/null +++ b/numerai/cli/util/terraform.py @@ -0,0 +1,78 @@ +import json + +from numerai.cli.constants import PROVIDERS, NODES_PATH +from numerai.cli.util.docker import terraform +from numerai.cli.util import docker +from numerai.cli.util.files import load_or_init_nodes, store_config +from numerai.cli.util.keys import load_or_init_keys + +import click + + +def apply_terraform(nodes_config, affected_providers, provider, verbose): + # Apply terraform for any affected provider + for affected_provider in affected_providers: + if affected_provider in PROVIDERS: + click.secho(f"Updating resources in {affected_provider}") + terraform( + "apply -auto-approve", + verbose, + affected_provider, + env_vars=load_or_init_keys(affected_provider), + inputs={"node_config_file": "nodes.json"}, + ) + else: + click.secho(f"provider {affected_provider} not supported", fg="red") + exit(1) + click.secho("cloud resources created successfully", fg="green") + + # terraform output for node config, same for aws and azure + click.echo(f"saving node configuration to {NODES_PATH}...") + + res = terraform(f"output -json {provider}_nodes", verbose, provider).decode("utf-8") + try: + nodes = json.loads(res) + except json.JSONDecodeError: + click.secho("failed to save node configuration, please retry.", fg="red") + return + for node_name, data in nodes.items(): + nodes_config[node_name].update(data) + + store_config(NODES_PATH, nodes_config) + if verbose: + click.secho(f"new config:\n{json.dumps(load_or_init_nodes(), indent=2)}") + + +def create_azure_registry(provider, provider_keys, verbose): + """Creates a registry for azure""" + terraform("init -upgrade", verbose, provider) + terraform( + 'apply -target="azurerm_container_registry.registry[0]" -target="azurerm_resource_group.acr_rg[0]" -auto-approve ', + verbose, + "azure", + env_vars=provider_keys, + inputs={"node_config_file": "nodes.json"}, + ) + res = terraform("output -json acr_repo_details", True, provider).decode("utf-8") + return json.loads(res) + + +def create_gcp_registry(provider, verbose): + """Creates a registry for GCP""" + terraform("init -upgrade", verbose, provider) + terraform( + 'apply -target="google_project_service.cloud_resource_manager" -auto-approve ', + verbose, + "gcp", + inputs={"node_config_file": "nodes.json"}, + ) + terraform( + 'apply -target="google_artifact_registry_repository.registry[0]" -auto-approve ', + verbose, + "gcp", + inputs={"node_config_file": "nodes.json"}, + ) + res = terraform("output -json artifact_registry_details", True, provider).decode( + "utf-8" + ) + return json.loads(res) diff --git a/numerai/terraform/aws/-inputs.tf b/numerai/terraform/aws/-inputs.tf index 41f0918..52c4624 100644 --- a/numerai/terraform/aws/-inputs.tf +++ b/numerai/terraform/aws/-inputs.tf @@ -27,9 +27,3 @@ variable "gateway_stage_path" { type = string default = "v1" } - -variable "volume_size" { - description = "Size of the EC2 volumes in GB" - type = number - default = 0 -} \ No newline at end of file diff --git a/numerai/terraform/aws/-main.tf b/numerai/terraform/aws/-main.tf index ff49e8f..5e1652c 100644 --- a/numerai/terraform/aws/-main.tf +++ b/numerai/terraform/aws/-main.tf @@ -23,5 +23,4 @@ module "aws" { nodes = local.aws_nodes node_container_port = var.node_container_port gateway_stage_path = var.gateway_stage_path - volume_size = var.volume_size } diff --git a/numerai/terraform/aws/aws/-inputs.tf b/numerai/terraform/aws/aws/-inputs.tf index 653fe46..bbbbb2e 100644 --- a/numerai/terraform/aws/aws/-inputs.tf +++ b/numerai/terraform/aws/aws/-inputs.tf @@ -26,9 +26,3 @@ variable "gateway_stage_path" { type = string default = "v1" } - -variable "volume_size" { - description = "Size of the EC2 volumes in GB" - type = number - nullable = true -} \ No newline at end of file diff --git a/numerai/terraform/aws/aws/-locals.tf b/numerai/terraform/aws/aws/-locals.tf index 2e929b9..8e4c08c 100644 --- a/numerai/terraform/aws/aws/-locals.tf +++ b/numerai/terraform/aws/aws/-locals.tf @@ -1,3 +1,4 @@ locals { node_prefix = "numerai-submission" + max_node_volume_size = max([for node, config in var.nodes : config.volume]...) } diff --git a/numerai/terraform/aws/aws/cluster.tf b/numerai/terraform/aws/aws/cluster.tf index c0d3634..afafd5b 100644 --- a/numerai/terraform/aws/aws/cluster.tf +++ b/numerai/terraform/aws/aws/cluster.tf @@ -94,14 +94,15 @@ data "aws_ami" "ecs_al2" { resource "aws_launch_template" "node" { image_id = data.aws_ami.ecs_al2.id + update_default_version = true dynamic "block_device_mappings" { - for_each = var.volume_size > 0 ? {size: var.volume_size} : {} + for_each = local.max_node_volume_size > 0 ? {size: local.max_node_volume_size} : {} content { device_name = "/dev/xvda" ebs { encrypted = true - volume_size = each.size + volume_size = local.max_node_volume_size volume_type = "gp3" } } @@ -142,22 +143,8 @@ resource "aws_batch_compute_environment" "node" { } -############# -# Job Setup # -############# - -resource "aws_cloudwatch_log_group" "ec2" { - for_each = { for name, config in var.nodes : name => config } - - name = "/ec2/service/${each.key}" - retention_in_days = "14" -} - - resource "aws_batch_job_queue" "node" { - for_each = { for name, config in var.nodes : name => config } - - name = each.key + name = "${local.node_prefix}-queue" state = "ENABLED" priority = 1 @@ -168,6 +155,18 @@ resource "aws_batch_job_queue" "node" { } } + +############# +# Job Setup # +############# + +resource "aws_cloudwatch_log_group" "ec2" { + for_each = { for name, config in var.nodes : name => config } + + name = "/ec2/service/${each.key}" + retention_in_days = "14" +} + resource "aws_batch_job_definition" "node" { for_each = { for name, config in var.nodes : name => config } diff --git a/numerai/terraform/aws/aws/webhook.tf b/numerai/terraform/aws/aws/webhook.tf index 7033f7e..ebdc2c0 100644 --- a/numerai/terraform/aws/aws/webhook.tf +++ b/numerai/terraform/aws/aws/webhook.tf @@ -15,7 +15,7 @@ resource "aws_lambda_function" "submission" { environment { variables = { JOB_DEFINITION = aws_batch_job_definition.node[each.key].name - JOB_QUEUE = aws_batch_job_queue.node[each.key].name + JOB_QUEUE = aws_batch_job_queue.node.name } } } diff --git a/setup.py b/setup.py index edc3fdd..8e0fe37 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ setup( name="numerai-cli", - version="1.1.0", + version="1.1.1", description="A library for deploying Numer.ai Prediction Nodes.", url="https://github.com/numerai/numerai-cli", author="Numer.ai",