From 672827fbe2e4990b65e75936c59d774f0431009d Mon Sep 17 00:00:00 2001 From: cailinw Date: Thu, 14 Oct 2021 05:28:18 +0000 Subject: [PATCH 01/14] Add and tests for get/update clusters --- Banyan/src/Banyan.jl | 1 + Banyan/src/clusters.jl | 11 +++++++++++ Banyan/test/clusters.jl | 27 ++++++++++++++++++++++++--- 3 files changed, 36 insertions(+), 3 deletions(-) diff --git a/Banyan/src/Banyan.jl b/Banyan/src/Banyan.jl index 3d999128..63d3648a 100644 --- a/Banyan/src/Banyan.jl +++ b/Banyan/src/Banyan.jl @@ -22,6 +22,7 @@ export Cluster, get_clusters, get_running_clusters, get_cluster, + get_cluster_status, get_cluster_s3_bucket_name, assert_cluster_is_ready diff --git a/Banyan/src/clusters.jl b/Banyan/src/clusters.jl index 2ce96b3d..0eaffe58 100644 --- a/Banyan/src/clusters.jl +++ b/Banyan/src/clusters.jl @@ -92,6 +92,15 @@ function delete_cluster(name::String; kwargs...) ) end +function update_cluster(name::String; kwargs...) + configure(; kwargs...) + @debug "Updating cluster" + send_request_get_response( + :update_cluster, + Dict{String, Any}("cluster_name" => name) + ) +end + function assert_cluster_is_ready(name::String; kwargs...) @info "Setting cluster status to running" @@ -154,4 +163,6 @@ end get_cluster(name::String=get_cluster_name(), kwargs...) = get_clusters(; kwargs...)[name] +get_cluster_status(name::String=get_cluster_name(), kwargs...) = get_clusters(; kwargs...)[name].status + get_running_clusters(args...; kwargs...) = filter(entry -> entry[2].status == :running, get_clusters(args...; kwargs...)) \ No newline at end of file diff --git a/Banyan/test/clusters.jl b/Banyan/test/clusters.jl index e676f168..d0cb4865 100644 --- a/Banyan/test/clusters.jl +++ b/Banyan/test/clusters.jl @@ -1,8 +1,29 @@ -# # TODO: Migrate to using ReTest +@testset "Get clusters" begin + cluster_name = ENV["BANYAN_CLUSTER_NAME"] + + clusters = get_clusters() + get_cluster_s3_bucket_name(cluster_name) + running_clusters = get_running_clusters() + + @test haskey(clusters, cluster_name) + @test all(c -> c[2].status == :running, running_clusters) + +end + +@testset "Update clusters" begin + cluster_name = ENV["BANYAN_CLUSTER_NAME"] + + update_cluster(cluster_name) + cluster_status = get_cluster_status(cluster_name) + + @test cluster_status == :updating + + while get_cluster_status(cluster_name) == :updating + sleep(5) + end +end -# using AWSCore, AWSS3, HTTP -# include("../src/clusters.jl") # # Test `clusters.jl:load_json` From c8313f2eba2ee888c51caef711ce591c1e67943b Mon Sep 17 00:00:00 2001 From: cailinw Date: Thu, 14 Oct 2021 10:50:38 +0000 Subject: [PATCH 02/14] Add scaledown_time option and made ec2_key_pair optional for create_cluster --- Banyan/src/clusters.jl | 11 +++++++++-- Banyan/src/utils.jl | 6 ------ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/Banyan/src/clusters.jl b/Banyan/src/clusters.jl index 0eaffe58..a6b06aa0 100644 --- a/Banyan/src/clusters.jl +++ b/Banyan/src/clusters.jl @@ -6,6 +6,8 @@ function create_cluster(; iam_policy_arn::Union{String,Nothing} = nothing, s3_bucket_arn::Union{String,Nothing} = nothing, s3_bucket_name::Union{String,Nothing} = nothing, + scaledown_time = 25, + ec2_key_pair_name = nothing, vpc_id = nothing, subnet_id = nothing, kwargs..., @@ -33,7 +35,7 @@ function create_cluster(; # Construct arguments # Configure using parameters - c = configure(; require_ec2_key_pair_name = true, kwargs...) + c = configure(; kwargs...) if isnothing(s3_bucket_arn) && isnothing(s3_bucket_name) s3_bucket_arn = @@ -54,11 +56,16 @@ function create_cluster(; "cluster_name" => name, "instance_type" => instance_type, "num_nodes" => max_num_nodes, - "ec2_key_pair" => c["aws"]["ec2_key_pair_name"], "aws_region" => get_aws_config_region(), "s3_read_write_resource" => s3_bucket_arn, + "scaledown_time" => scaledown_time, "recreate" => false, ) + if !isnothing(ec2_key_pair_name) + cluster_config["ec2_key_pair"] = ec2_key_pair_name + else haskey(c["aws", "ec2_key_pair_name"]) + cluster_config["ec2_key_pair"] = c["aws"]["ec2_key_pair_name"] + end if !isnothing(iam_policy_arn) cluster_config["additional_policy"] = iam_policy_arn end diff --git a/Banyan/src/utils.jl b/Banyan/src/utils.jl index b368b779..45e5ad5c 100644 --- a/Banyan/src/utils.jl +++ b/Banyan/src/utils.jl @@ -150,8 +150,6 @@ function configure(; kwargs...) user_id = if_in_or(:user_id, kwargs) api_key = if_in_or(:api_key, kwargs) ec2_key_pair_name = if_in_or(:ec2_key_pair_name, kwargs) - require_ec2_key_pair_name = - if_in_or(:require_ec2_key_pair_name, kwargs, false) banyanconfig_path = if_in_or(:banyanconfig_path, kwargs) # Load config @@ -214,10 +212,6 @@ function configure(; kwargs...) banyan_config["aws"]["ec2_key_pair_name"] = ec2_key_pair_name is_modified = true end - if require_ec2_key_pair_name && - !(haskey(banyan_config["aws"], "ec2_key_pair_name")) - error("Name of an EC2 key pair required but not provided; visit here to create a key pair: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html#having-ec2-create-your-key-pair") - end # # aws.region # if !isnothing(region) && ( From a5fa7678d26bfdeead6b6a5e5a93241d784e33e6 Mon Sep 17 00:00:00 2001 From: cailinw Date: Thu, 14 Oct 2021 11:46:21 +0000 Subject: [PATCH 03/14] Add min_num_workers and initial_num_workers --- Banyan/src/clusters.jl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Banyan/src/clusters.jl b/Banyan/src/clusters.jl index a6b06aa0..afe507a9 100644 --- a/Banyan/src/clusters.jl +++ b/Banyan/src/clusters.jl @@ -3,6 +3,8 @@ function create_cluster(; name::Union{String,Nothing} = nothing, instance_type::Union{String,Nothing} = "m4.4xlarge", max_num_nodes::Union{Int,Nothing} = 8, + initial_num_workers::Union{Int,Nothing} = 0, + min_num_workers::Union{Int,Nothing} = 0, iam_policy_arn::Union{String,Nothing} = nothing, s3_bucket_arn::Union{String,Nothing} = nothing, s3_bucket_name::Union{String,Nothing} = nothing, @@ -56,6 +58,8 @@ function create_cluster(; "cluster_name" => name, "instance_type" => instance_type, "num_nodes" => max_num_nodes, + "initial_num_workers" => initial_num_workers, + "min_num_workers" => min_num_workers, "aws_region" => get_aws_config_region(), "s3_read_write_resource" => s3_bucket_arn, "scaledown_time" => scaledown_time, From a9497a11a95d58df60b216768565073621c9b49d Mon Sep 17 00:00:00 2001 From: cailinw Date: Thu, 14 Oct 2021 23:34:24 +0000 Subject: [PATCH 04/14] Change num_nodes to max_num_workers --- Banyan/src/clusters.jl | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/Banyan/src/clusters.jl b/Banyan/src/clusters.jl index afe507a9..c9836545 100644 --- a/Banyan/src/clusters.jl +++ b/Banyan/src/clusters.jl @@ -2,7 +2,7 @@ function create_cluster(; name::Union{String,Nothing} = nothing, instance_type::Union{String,Nothing} = "m4.4xlarge", - max_num_nodes::Union{Int,Nothing} = 8, + max_num_workers::Union{Int,Nothing} = 8, initial_num_workers::Union{Int,Nothing} = 0, min_num_workers::Union{Int,Nothing} = 0, iam_policy_arn::Union{String,Nothing} = nothing, @@ -57,7 +57,7 @@ function create_cluster(; cluster_config = Dict( "cluster_name" => name, "instance_type" => instance_type, - "num_nodes" => max_num_nodes, + "max_num_workers" => max_num_workers, "initial_num_workers" => initial_num_workers, "min_num_workers" => min_num_workers, "aws_region" => get_aws_config_region(), @@ -176,4 +176,14 @@ get_cluster(name::String=get_cluster_name(), kwargs...) = get_clusters(; kwargs. get_cluster_status(name::String=get_cluster_name(), kwargs...) = get_clusters(; kwargs...)[name].status -get_running_clusters(args...; kwargs...) = filter(entry -> entry[2].status == :running, get_clusters(args...; kwargs...)) \ No newline at end of file +get_running_clusters(args...; kwargs...) = filter(entry -> entry[2].status == :running, get_clusters(args...; kwargs...)) + +function wait_for_cluster(name::String=get_cluster_name(), kwargs...) + t = 5 + while get_cluster_status(name; kwargs...) != :running + sleep(t) + if t < 80 + t *= 2 + end + end +end \ No newline at end of file From ed3bafd2a6433642789846c8564cbf6376019b7a Mon Sep 17 00:00:00 2001 From: cailinw Date: Thu, 14 Oct 2021 23:46:23 +0000 Subject: [PATCH 05/14] Add tests for clusters --- Banyan/src/Banyan.jl | 3 ++- Banyan/test/clusters.jl | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/Banyan/src/Banyan.jl b/Banyan/src/Banyan.jl index 63d3648a..b087dbe7 100644 --- a/Banyan/src/Banyan.jl +++ b/Banyan/src/Banyan.jl @@ -24,7 +24,8 @@ export Cluster, get_cluster, get_cluster_status, get_cluster_s3_bucket_name, - assert_cluster_is_ready + assert_cluster_is_ready, + wait_for_cluster # Job management export Job, diff --git a/Banyan/test/clusters.jl b/Banyan/test/clusters.jl index d0cb4865..8193a10c 100644 --- a/Banyan/test/clusters.jl +++ b/Banyan/test/clusters.jl @@ -1,3 +1,5 @@ +using Random + @testset "Get clusters" begin cluster_name = ENV["BANYAN_CLUSTER_NAME"] @@ -23,6 +25,18 @@ end end end +@testset "Benchmark create_cluster with $instance_type instance type" for instance_type in [ + "t3.xlarge" #, "t3.2xlarge", "c5.2xlarge", "m4.4xlarge", "m4.10xlarge" +] + cluster_name = "cluster_$(Random.randstring(['a':'z'; '0':'9'], 12))" + @elapsed create_cluster( + name=cluster_name, + instance_type=instance_type + ) + + delete_cluster(cluster_name) +end + From 1473bcf789a4d3c3ae8a7e4bbdc3b93ec88a9e52 Mon Sep 17 00:00:00 2001 From: cailinw Date: Fri, 15 Oct 2021 15:46:49 +0000 Subject: [PATCH 06/14] Add nowait for create_cluster; Improve wait_for_cluster; Fix issues; Add timing test for create_cluster --- Banyan/src/Banyan.jl | 1 + Banyan/src/clusters.jl | 26 +++++++++++++++++++++++--- Banyan/test/clusters.jl | 25 ++++++++++++++++++------- 3 files changed, 42 insertions(+), 10 deletions(-) diff --git a/Banyan/src/Banyan.jl b/Banyan/src/Banyan.jl index b087dbe7..4ba1fae8 100644 --- a/Banyan/src/Banyan.jl +++ b/Banyan/src/Banyan.jl @@ -19,6 +19,7 @@ export Cluster, create_cluster, update_cluster, destroy_cluster, + delete_cluster, get_clusters, get_running_clusters, get_cluster, diff --git a/Banyan/src/clusters.jl b/Banyan/src/clusters.jl index c9836545..6aaabdb4 100644 --- a/Banyan/src/clusters.jl +++ b/Banyan/src/clusters.jl @@ -12,6 +12,7 @@ function create_cluster(; ec2_key_pair_name = nothing, vpc_id = nothing, subnet_id = nothing, + nowait=false, kwargs..., ) clusters = get_clusters(; kwargs...) @@ -67,7 +68,7 @@ function create_cluster(; ) if !isnothing(ec2_key_pair_name) cluster_config["ec2_key_pair"] = ec2_key_pair_name - else haskey(c["aws", "ec2_key_pair_name"]) + elseif haskey(c["aws"], "ec2_key_pair_name") cluster_config["ec2_key_pair"] = c["aws"]["ec2_key_pair_name"] end if !isnothing(iam_policy_arn) @@ -85,7 +86,11 @@ function create_cluster(; # Send request to create cluster send_request_get_response(:create_cluster, cluster_config) - return Cluster(name, :creating, 0, s3_bucket_arn) + if !nowait + wait_for_cluster(name) + end + + return Cluster(name, get_cluster_status(name), 0, s3_bucket_arn) end function destroy_cluster(name::String; kwargs...) @@ -180,10 +185,25 @@ get_running_clusters(args...; kwargs...) = filter(entry -> entry[2].status == :r function wait_for_cluster(name::String=get_cluster_name(), kwargs...) t = 5 - while get_cluster_status(name; kwargs...) != :running + cluster_status = get_cluster_status(name; kwargs...) + while (cluster_status == :creating || cluster_status == :updating) + if cluster_status == :creating + @info "Cluster $(name) is getting set up" + else + @info "Cluster $(name) is updating" + end sleep(t) if t < 80 t *= 2 end + cluster_status = get_cluster_status(name; kwargs...) + end + if cluster_status == :running + @info "Cluster $(name) is running and ready for jobs" + elseif cluster_status == :terminated + @info "Cluster $(name) no longer exists" + elseif cluster_status!= :creating && cluster_status != :updating + @info "Cluster $(name) set up has failed" + delete_cluster(name) end end \ No newline at end of file diff --git a/Banyan/test/clusters.jl b/Banyan/test/clusters.jl index 8193a10c..7ac08ff1 100644 --- a/Banyan/test/clusters.jl +++ b/Banyan/test/clusters.jl @@ -26,15 +26,26 @@ end end @testset "Benchmark create_cluster with $instance_type instance type" for instance_type in [ - "t3.xlarge" #, "t3.2xlarge", "c5.2xlarge", "m4.4xlarge", "m4.10xlarge" + "t3.xlarge", "t3.2xlarge", "c5.2xlarge", "m4.4xlarge", "m4.10xlarge" ] - cluster_name = "cluster_$(Random.randstring(['a':'z'; '0':'9'], 12))" - @elapsed create_cluster( - name=cluster_name, - instance_type=instance_type - ) - + cluster_name = "cluster-$(Random.randstring(['a':'z'; '0':'9'], 6))" + @show cluster_name + t = @elapsed begin + c = create_cluster( + name=cluster_name, + instance_type=instance_type, + max_num_workers=16 + ) + end delete_cluster(cluster_name) + + # Save results to file + open("create_cluster_times.txt", "a") do f + write(f, "$(instance_type)\t$(string(t))") + end + + # Verify that cluster was spun up + @test c.status == :running end From 7c2cde76402109a05b55eb51adb257260d1d5907 Mon Sep 17 00:00:00 2001 From: cailinw Date: Sat, 16 Oct 2021 07:01:11 +0000 Subject: [PATCH 07/14] Edit --- Banyan/src/clusters.jl | 7 +- Banyan/src/utils.jl | 4 +- Banyan/test/clusters.jl | 204 ++++++++++++++-------------------------- 3 files changed, 75 insertions(+), 140 deletions(-) diff --git a/Banyan/src/clusters.jl b/Banyan/src/clusters.jl index 6aaabdb4..4f150746 100644 --- a/Banyan/src/clusters.jl +++ b/Banyan/src/clusters.jl @@ -23,7 +23,7 @@ function create_cluster(; # Check if the configuration for this cluster name already exists # If it does, then recreate cluster if haskey(clusters, name) - if clusters[name][status] == "terminated" + if clusters[name].status == :terminated @warn "Cluster configuration with name $name already exists. Ignoring new configuration and re-creating cluster." send_request_get_response( :create_cluster, @@ -31,7 +31,7 @@ function create_cluster(; ) return else - error("Cluster with name $name already exists") + error("Cluster with name $name already exists and has status $(string(clusters[name].status))") end end @@ -41,8 +41,7 @@ function create_cluster(; c = configure(; kwargs...) if isnothing(s3_bucket_arn) && isnothing(s3_bucket_name) - s3_bucket_arn = - "arn:aws:s3:::banyan-cluster-data-" * name * "-" * bytes2hex(rand(UInt8, 4)) + s3_bucket_arn = "arn:aws:s3:::banyan-cluster-data-$name-$(string(bytes2hex(rand(UInt8, 4))))" s3_bucket_name = last(split(s3_bucket_arn, ":")) s3_create_bucket(get_aws_config(), s3_bucket_name) elseif isnothing(s3_bucket_arn) diff --git a/Banyan/src/utils.jl b/Banyan/src/utils.jl index 45e5ad5c..cb1eeaa3 100644 --- a/Banyan/src/utils.jl +++ b/Banyan/src/utils.jl @@ -352,11 +352,13 @@ function send_request_get_response(method, content::Dict) ) if resp.status == 403 throw(ErrorException("Please use a valid user ID and API key. Sign into the dashboard to retrieve these credentials.")) - elseif resp.status == 500 || resp.status == 504 + elseif resp.status == 504 # HTTP request timed out, for example if isa(data, Dict) && haskey(data, "message") data = data["message"] end + @info data + elseif resp.status == 500 || resp.status == 504 throw(ErrorException(data)) elseif resp.status == 502 throw(ErrorException("Sorry there has been an error. Please contact support")) diff --git a/Banyan/test/clusters.jl b/Banyan/test/clusters.jl index 7ac08ff1..8c9d9fc0 100644 --- a/Banyan/test/clusters.jl +++ b/Banyan/test/clusters.jl @@ -20,14 +20,81 @@ end @test cluster_status == :updating - while get_cluster_status(cluster_name) == :updating + while cluster_status == :updating sleep(5) + cluster_status = get_cluster_status(cluster_name) end + @test cluster_status == :running +end + +@testset "Create clusters" begin +end + +function bucket_exists(s3_bucket_name) + ispath(S3Path("s3://$(s3_bucket_name)", config=Banyan.get_aws_config())) +end + +@testset "Destroy and delete clusters with $s3_bucket S3 bucket" for s3_bucket in [ + "default", "user-provided" + ] + Random.seed!() + cluster_name = "cluster-$(Random.randstring(['a':'z'; '0':'9'], 6))" + @show cluster_name + + if s3_bucket == "default" + s3_bucket = nothing + elseif s3_bucket == "user-provided" + s3_bucket = Random.randstring(['a':'z'; '0':'9'], 6) + s3_create_bucket(Banyan.get_aws_config(), s3_bucket) + end + + # Create a cluster (at least initiate) and check that S3 bucket exists + c = create_cluster( + name=cluster_name, + instance_type="t3.large", + s3_bucket_name=s3_bucket, + nowait=true + ) + sleep(30) # Just to ensure that cluster creation has initiated + s3_bucket_name = get_cluster_s3_bucket_name(cluster_name) + s3_bucket_exists = bucket_exists(s3_bucket_name) + if !isnothing(s3_bucket) + @test s3_bucket == s3_bucket_name + end + @test s3_bucket_exists + + # Destroy cluster and check that S3 bucket still exists + destroy_cluster(cluster_name) + s3_bucket_exists = bucket_exists(s3_bucket_name) + @test s3_bucket_exists + sleep(30) # Just to ensure that cluster destruction is complete + + # Re-create cluster and check that S3 bucket exists and is same as before + while get_cluster_status(cluster_name) != :terminated + sleep(30) + end + c_r = create_cluster( + name=cluster_name, + nowait=true + ) + s3_bucket_name_r = get_cluster_s3_bucket_name(cluster_name) + s3_bucket_exists = bucket_exists(s3_bucket_name_r) + @test s3_bucket_exists + @test s3_bucket_name == s3_bucket_name_r + + # Delete cluster + delete_cluster(cluster_name) + s3_bucket_exists = bucket_exists(s3_bucket_name_r) + @test !s3_bucket_exists + + # Check that the cluster cannot be created again + @test_throws ErrorException create_cluster(name=cluster_name, nowait=true) end @testset "Benchmark create_cluster with $instance_type instance type" for instance_type in [ "t3.xlarge", "t3.2xlarge", "c5.2xlarge", "m4.4xlarge", "m4.10xlarge" ] + Random.seed!() cluster_name = "cluster-$(Random.randstring(['a':'z'; '0':'9'], 6))" @show cluster_name t = @elapsed begin @@ -41,142 +108,9 @@ end # Save results to file open("create_cluster_times.txt", "a") do f - write(f, "$(instance_type)\t$(string(t))") + write(f, "$(instance_type)\t$(string(t/60))\n") end # Verify that cluster was spun up @test c.status == :running end - - - - -# # Test `clusters.jl:load_json` -# function test_load_json() -# # Test failure if filename is not valid -# @test_throws ErrorException Banyan.load_json("res/Banyanfile.json") - -# # Test failure if local file does not exist -# @test_throws ErrorException Banyan.load_json("file://res/filedoesnotexist.json") -# # Test valid local file can be loaded -# banyanfile = Banyan.load_json("file://res/Banyanfile.json") -# @test typeof(banyanfile) <: Dict - -# # Test failure if s3 file does not exist -# # TODO: Add this -# # Test valid s3 file can be loaded -# # TODO: Add this - -# # Test failure if http(s) file does not exist -# @test_throws HTTP.ExceptionRequest.StatusError Banyan.load_json("https://raw.githubusercontent.com/banyan-team/banyan-julia/v0.1.0/Banyan/test/res/filedoesnotexist.json") -# # Test valid http(s) file can be loaded -# banyanfile = Banyan.load_json("https://raw.githubusercontent.com/banyan-team/banyan-julia/v0.1.0/Banyan/test/res/Banyanfile.json") -# @test typeof(banyanfile) <: Dict - -# end - - -# # Test `clusters.jl:load_file` -# function test_load_file() -# # Test failure if filename is not valid -# @test_throws ErrorException Banyan.load_file("res/code_dep.jl") - -# # Test failure if local file does not exist -# @test_throws ErrorException Banyan.load_file("file://res/filedoesnotexist.jl") -# # Test valid local json file can be loaded -# f = Banyan.load_file("file://res/Banyanfile.json") -# @test typeof(f) == String -# # Test valid local julia file can be loaded -# f = Banyan.load_file("file://res/code_dep.jl") -# @test typeof(f) == String - -# # Test failure if s3 file does not exist -# # TODO: Add this -# # Test valid s3 file can be loaded -# # TODO: Add this - -# # Test failure if http(s) file does not exist -# @test_throws HTTP.ExceptionRequest.StatusError Banyan.load_file("https://raw.githubusercontent.com/banyan-team/banyan-julia/v0.1.0/Banyan/test/res/filedoesnotexist.json") -# # Test valid http(s) file can be loaded -# f = Banyan.load_file("https://raw.githubusercontent.com/banyan-team/banyan-julia/v0.1.0/Banyan/test/res/Banyanfile.json") -# @test typeof(f) == String -# end - - -# # Test `clusters.jl:create_cluster` in cases where it should fail -# function test_create_cluster_failure_cases() - -# end - - -# # Test `clusters.jl:create_cluster` in cases where it should succeed -# function test_create_cluster_success_cases() -# end - - -# # Test `clusters.jl:destroy_cluster` -# function test_destroy_cluster() -# end - - -# # Test `clusters.jl:get_cluster` and `clusters.jl:get_clusters` -# function test_get_clusters() -# end - - -# # Test `clusters.jl:get_jobs_for_cluster` -# function test_get_jobs_for_cluster() -# end - - -# # Test `clusters.jl:assert_cluster_is_ready` -# function test_assert_cluster_is_ready() -# end - - -# # Test `clusters.jl:update_cluster` -# function test_update_cluster() -# end - - - - -# @testset "Test loading files" begin -# run("load json") do -# test_load_json() -# end -# run("load file") do -# test_load_file() -# end -# end - - - - -# @testset "Test creating clusters" begin -# run("create cluster") do -# test_create_cluster_failure_cases() -# test_create_cluster_success_cases() -# end -# end - - -# @testset "Test destroying clusters" begin -# run("destroy cluster") do -# test_destroy_cluster() -# end -# end - - -# @testset "Test managing clusters" begin -# run("get clusters info") do -# test_get_clusters() -# test_get_jobs_for_cluster() -# end -# run("set cluster status to ready") do -# test_assert_cluster_is_ready() -# end -# run("update cluster") do -# test_update_cluster() -# end -# end From ed69761368cb359bcd3e8969e7b7c97f197dc4c5 Mon Sep 17 00:00:00 2001 From: cailinw Date: Sat, 16 Oct 2021 17:28:58 +0000 Subject: [PATCH 08/14] Make create_job wait for cluster --- Banyan/src/jobs.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Banyan/src/jobs.jl b/Banyan/src/jobs.jl index 2e7bcfb5..aa7cabca 100644 --- a/Banyan/src/jobs.jl +++ b/Banyan/src/jobs.jl @@ -174,6 +174,8 @@ function create_job(; jobs[current_job_id] = Job(cluster_name, current_job_id, nworkers, sample_rate) jobs[current_job_id].current_status = "running" + wait_for_cluster(cluster_name) + @debug "Finished creating job $job_id" return job_id end From c047bfb9487cf62ea36e241400962194eb158fbc Mon Sep 17 00:00:00 2001 From: cailinw Date: Sat, 16 Oct 2021 23:16:43 +0000 Subject: [PATCH 09/14] Debug 2 buckets getting created --- Banyan/src/clusters.jl | 21 ++++++++++++++++++--- Banyan/test/clusters.jl | 17 ++++++++++++----- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/Banyan/src/clusters.jl b/Banyan/src/clusters.jl index 4f150746..5594153e 100644 --- a/Banyan/src/clusters.jl +++ b/Banyan/src/clusters.jl @@ -15,6 +15,10 @@ function create_cluster(; nowait=false, kwargs..., ) + + # Configure using parameters + c = configure(; kwargs...) + clusters = get_clusters(; kwargs...) if isnothing(name) name = "Cluster " * string(length(clusters) + 1) @@ -29,7 +33,11 @@ function create_cluster(; :create_cluster, Dict("cluster_name" => name, "recreate" => true), ) - return + if !nowait + wait_for_cluster(name) + end + println("About to return") + return get_cluster(name) else error("Cluster with name $name already exists and has status $(string(clusters[name].status))") end @@ -37,8 +45,9 @@ function create_cluster(; # Construct arguments - # Configure using parameters - c = configure(; kwargs...) + println("Following passed to create_cluster") + @show s3_bucket_arn + @show s3_bucket_name if isnothing(s3_bucket_arn) && isnothing(s3_bucket_name) s3_bucket_arn = "arn:aws:s3:::banyan-cluster-data-$name-$(string(bytes2hex(rand(UInt8, 4))))" @@ -53,6 +62,10 @@ function create_cluster(; error("Bucket $s3_bucket_name does not exist in connected AWS account") end + println("And after") + @show s3_bucket_arn + @show s3_bucket_name + # Construct cluster creation cluster_config = Dict( "cluster_name" => name, @@ -80,6 +93,8 @@ function create_cluster(; cluster_config["subnet_id"] = subnet_id end + println("s3_bucket_arn=$(s3_bucket_arn)") + @info "Creating cluster" # Send request to create cluster diff --git a/Banyan/test/clusters.jl b/Banyan/test/clusters.jl index 8c9d9fc0..fcf894ea 100644 --- a/Banyan/test/clusters.jl +++ b/Banyan/test/clusters.jl @@ -27,15 +27,15 @@ end @test cluster_status == :running end -@testset "Create clusters" begin -end - function bucket_exists(s3_bucket_name) ispath(S3Path("s3://$(s3_bucket_name)", config=Banyan.get_aws_config())) end +@testset "Create clusters" begin +end + @testset "Destroy and delete clusters with $s3_bucket S3 bucket" for s3_bucket in [ - "default", "user-provided" + "user-provided" # "default", ] Random.seed!() cluster_name = "cluster-$(Random.randstring(['a':'z'; '0':'9'], 6))" @@ -48,6 +48,8 @@ end s3_create_bucket(Banyan.get_aws_config(), s3_bucket) end + println("s3_bucket is ", s3_bucket) + # Create a cluster (at least initiate) and check that S3 bucket exists c = create_cluster( name=cluster_name, @@ -71,7 +73,7 @@ end # Re-create cluster and check that S3 bucket exists and is same as before while get_cluster_status(cluster_name) != :terminated - sleep(30) + sleep(15) end c_r = create_cluster( name=cluster_name, @@ -84,6 +86,11 @@ end # Delete cluster delete_cluster(cluster_name) + @show s3_bucket_name + @show s3_bucket_name_r + @show bucket_exists(s3_bucket_name) + @show bucket_exists(s3_bucket_name_r) + sleep(30) # Just to ensure that bucket has been deleted s3_bucket_exists = bucket_exists(s3_bucket_name_r) @test !s3_bucket_exists From 5cc17832ad13de6fbe9ce3d71447e038ae73e84c Mon Sep 17 00:00:00 2001 From: cailinw Date: Mon, 18 Oct 2021 01:59:58 +0000 Subject: [PATCH 10/14] Move s3 bucket creation to backend --- Banyan/src/clusters.jl | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/Banyan/src/clusters.jl b/Banyan/src/clusters.jl index 5594153e..6af534a1 100644 --- a/Banyan/src/clusters.jl +++ b/Banyan/src/clusters.jl @@ -36,7 +36,6 @@ function create_cluster(; if !nowait wait_for_cluster(name) end - println("About to return") return get_cluster(name) else error("Cluster with name $name already exists and has status $(string(clusters[name].status))") @@ -44,28 +43,17 @@ function create_cluster(; end # Construct arguments - - println("Following passed to create_cluster") - @show s3_bucket_arn - @show s3_bucket_name - - if isnothing(s3_bucket_arn) && isnothing(s3_bucket_name) - s3_bucket_arn = "arn:aws:s3:::banyan-cluster-data-$name-$(string(bytes2hex(rand(UInt8, 4))))" - s3_bucket_name = last(split(s3_bucket_arn, ":")) - s3_create_bucket(get_aws_config(), s3_bucket_name) - elseif isnothing(s3_bucket_arn) + if !isnothing(s3_bucket_name) s3_bucket_arn = "arn:aws:s3:::$s3_bucket_name*" - elseif isnothing(s3_bucket_name) + elseif !isnothing(s3_bucket_arn) s3_bucket_name = last(split(s3_bucket_arn, ":")) end - if !(s3_bucket_name in s3_list_buckets(get_aws_config())) + if isnothing(s3_bucket_arn) + s3_bucket_arn = "" + elseif !(s3_bucket_name in s3_list_buckets(get_aws_config())) error("Bucket $s3_bucket_name does not exist in connected AWS account") end - println("And after") - @show s3_bucket_arn - @show s3_bucket_name - # Construct cluster creation cluster_config = Dict( "cluster_name" => name, @@ -93,8 +81,6 @@ function create_cluster(; cluster_config["subnet_id"] = subnet_id end - println("s3_bucket_arn=$(s3_bucket_arn)") - @info "Creating cluster" # Send request to create cluster From f5ff5452c14b2491b22a8f2c5f3779f66c645f95 Mon Sep 17 00:00:00 2001 From: cailinw Date: Tue, 19 Oct 2021 00:28:51 +0000 Subject: [PATCH 11/14] Add general test for create_cluster --- Banyan/test/clusters.jl | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/Banyan/test/clusters.jl b/Banyan/test/clusters.jl index fcf894ea..aa36516b 100644 --- a/Banyan/test/clusters.jl +++ b/Banyan/test/clusters.jl @@ -32,10 +32,22 @@ function bucket_exists(s3_bucket_name) end @testset "Create clusters" begin + Random.seed!() + cluster_name = "cluster-$(Random.randstring(['a':'z'; '0':'9'], 6))" + create_cluster( + name=cluster_name, + instance_type="t3.xlarge", # 4 vCPUs + max_num_workers=32, + initial_num_workers=8, + min_num_workers=2, + scaledown_time=30, + ) + @test c.status == :running + delete_cluster(cluster_name) end @testset "Destroy and delete clusters with $s3_bucket S3 bucket" for s3_bucket in [ - "user-provided" # "default", + "default", "user-provided" ] Random.seed!() cluster_name = "cluster-$(Random.randstring(['a':'z'; '0':'9'], 6))" @@ -48,8 +60,6 @@ end s3_create_bucket(Banyan.get_aws_config(), s3_bucket) end - println("s3_bucket is ", s3_bucket) - # Create a cluster (at least initiate) and check that S3 bucket exists c = create_cluster( name=cluster_name, @@ -86,10 +96,6 @@ end # Delete cluster delete_cluster(cluster_name) - @show s3_bucket_name - @show s3_bucket_name_r - @show bucket_exists(s3_bucket_name) - @show bucket_exists(s3_bucket_name_r) sleep(30) # Just to ensure that bucket has been deleted s3_bucket_exists = bucket_exists(s3_bucket_name_r) @test !s3_bucket_exists @@ -99,7 +105,7 @@ end end @testset "Benchmark create_cluster with $instance_type instance type" for instance_type in [ - "t3.xlarge", "t3.2xlarge", "c5.2xlarge", "m4.4xlarge", "m4.10xlarge" + "t3.2xlarge" #, "t3.2xlarge", "c5.2xlarge", "m4.4xlarge", "m4.10xlarge" ] Random.seed!() cluster_name = "cluster-$(Random.randstring(['a':'z'; '0':'9'], 6))" @@ -108,7 +114,8 @@ end c = create_cluster( name=cluster_name, instance_type=instance_type, - max_num_workers=16 + max_num_workers=32, + initial_num_workers=1 ) end delete_cluster(cluster_name) From cf5d465b5265680b10d19e6d6ed0d4c570f782f5 Mon Sep 17 00:00:00 2001 From: cailinw Date: Tue, 19 Oct 2021 07:20:37 +0000 Subject: [PATCH 12/14] Print out reason for cluster failure --- Banyan/src/cluster.jl | 1 + Banyan/src/clusters.jl | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/Banyan/src/cluster.jl b/Banyan/src/cluster.jl index a4fee04d..12fd9dc9 100644 --- a/Banyan/src/cluster.jl +++ b/Banyan/src/cluster.jl @@ -1,6 +1,7 @@ struct Cluster name::String status::Symbol + status_explanation::String num_jobs_running::Int32 s3_bucket_arn::String end \ No newline at end of file diff --git a/Banyan/src/clusters.jl b/Banyan/src/clusters.jl index 6af534a1..066541aa 100644 --- a/Banyan/src/clusters.jl +++ b/Banyan/src/clusters.jl @@ -90,7 +90,7 @@ function create_cluster(; wait_for_cluster(name) end - return Cluster(name, get_cluster_status(name), 0, s3_bucket_arn) + return Cluster(name, get_cluster_status(name), "", 0, s3_bucket_arn) end function destroy_cluster(name::String; kwargs...) @@ -129,6 +129,7 @@ end struct Cluster name::String status::Symbol + status_explanation::String num_jobs_running::Int32 s3_bucket_arn::String end @@ -165,6 +166,7 @@ function get_clusters(; kwargs...) name => Cluster( name, parsestatus(c["status"]), + haskey(c, "status_explanation") ? c["status_explanation"] : "", c["num_jobs"], c["s3_read_write_resource"], ) for (name, c) in response["clusters"] @@ -179,10 +181,16 @@ end get_cluster(name::String=get_cluster_name(), kwargs...) = get_clusters(; kwargs...)[name] -get_cluster_status(name::String=get_cluster_name(), kwargs...) = get_clusters(; kwargs...)[name].status - get_running_clusters(args...; kwargs...) = filter(entry -> entry[2].status == :running, get_clusters(args...; kwargs...)) +function get_cluster_status(name::String=get_cluster_name(), kwargs...) + c = get_clusters(; kwargs...)[name] + if c.status == :failed + @info c.status_explanation + end + c.status +end + function wait_for_cluster(name::String=get_cluster_name(), kwargs...) t = 5 cluster_status = get_cluster_status(name; kwargs...) @@ -202,8 +210,8 @@ function wait_for_cluster(name::String=get_cluster_name(), kwargs...) @info "Cluster $(name) is running and ready for jobs" elseif cluster_status == :terminated @info "Cluster $(name) no longer exists" - elseif cluster_status!= :creating && cluster_status != :updating + elseif cluster_status != :creating && cluster_status != :updating @info "Cluster $(name) set up has failed" - delete_cluster(name) + # delete_cluster(name) end end \ No newline at end of file From 6bc5c11a7c3656ec96ac9f4f496b8df0b0051ea8 Mon Sep 17 00:00:00 2001 From: cailinw Date: Tue, 19 Oct 2021 07:20:37 +0000 Subject: [PATCH 13/14] Print out reason for cluster failure --- Banyan/src/cluster.jl | 1 + Banyan/src/clusters.jl | 18 +++++++++++++----- Banyan/test/clusters.jl | 4 ++-- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/Banyan/src/cluster.jl b/Banyan/src/cluster.jl index a4fee04d..12fd9dc9 100644 --- a/Banyan/src/cluster.jl +++ b/Banyan/src/cluster.jl @@ -1,6 +1,7 @@ struct Cluster name::String status::Symbol + status_explanation::String num_jobs_running::Int32 s3_bucket_arn::String end \ No newline at end of file diff --git a/Banyan/src/clusters.jl b/Banyan/src/clusters.jl index 6af534a1..066541aa 100644 --- a/Banyan/src/clusters.jl +++ b/Banyan/src/clusters.jl @@ -90,7 +90,7 @@ function create_cluster(; wait_for_cluster(name) end - return Cluster(name, get_cluster_status(name), 0, s3_bucket_arn) + return Cluster(name, get_cluster_status(name), "", 0, s3_bucket_arn) end function destroy_cluster(name::String; kwargs...) @@ -129,6 +129,7 @@ end struct Cluster name::String status::Symbol + status_explanation::String num_jobs_running::Int32 s3_bucket_arn::String end @@ -165,6 +166,7 @@ function get_clusters(; kwargs...) name => Cluster( name, parsestatus(c["status"]), + haskey(c, "status_explanation") ? c["status_explanation"] : "", c["num_jobs"], c["s3_read_write_resource"], ) for (name, c) in response["clusters"] @@ -179,10 +181,16 @@ end get_cluster(name::String=get_cluster_name(), kwargs...) = get_clusters(; kwargs...)[name] -get_cluster_status(name::String=get_cluster_name(), kwargs...) = get_clusters(; kwargs...)[name].status - get_running_clusters(args...; kwargs...) = filter(entry -> entry[2].status == :running, get_clusters(args...; kwargs...)) +function get_cluster_status(name::String=get_cluster_name(), kwargs...) + c = get_clusters(; kwargs...)[name] + if c.status == :failed + @info c.status_explanation + end + c.status +end + function wait_for_cluster(name::String=get_cluster_name(), kwargs...) t = 5 cluster_status = get_cluster_status(name; kwargs...) @@ -202,8 +210,8 @@ function wait_for_cluster(name::String=get_cluster_name(), kwargs...) @info "Cluster $(name) is running and ready for jobs" elseif cluster_status == :terminated @info "Cluster $(name) no longer exists" - elseif cluster_status!= :creating && cluster_status != :updating + elseif cluster_status != :creating && cluster_status != :updating @info "Cluster $(name) set up has failed" - delete_cluster(name) + # delete_cluster(name) end end \ No newline at end of file diff --git a/Banyan/test/clusters.jl b/Banyan/test/clusters.jl index aa36516b..c2ef0744 100644 --- a/Banyan/test/clusters.jl +++ b/Banyan/test/clusters.jl @@ -105,7 +105,7 @@ end end @testset "Benchmark create_cluster with $instance_type instance type" for instance_type in [ - "t3.2xlarge" #, "t3.2xlarge", "c5.2xlarge", "m4.4xlarge", "m4.10xlarge" + "t3.xlarge", "t3.2xlarge", "c5.2xlarge", "m4.4xlarge", "m4.10xlarge" ] Random.seed!() cluster_name = "cluster-$(Random.randstring(['a':'z'; '0':'9'], 6))" @@ -114,7 +114,7 @@ end c = create_cluster( name=cluster_name, instance_type=instance_type, - max_num_workers=32, + max_num_workers=16, initial_num_workers=1 ) end From 3d04a3bbe038609cbb45e8309185425d5985781f Mon Sep 17 00:00:00 2001 From: cailinw Date: Tue, 19 Oct 2021 23:04:22 +0000 Subject: [PATCH 14/14] Respond to comments --- Banyan/src/clusters.jl | 6 +++--- Banyan/src/jobs.jl | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Banyan/src/clusters.jl b/Banyan/src/clusters.jl index 066541aa..87c1a717 100644 --- a/Banyan/src/clusters.jl +++ b/Banyan/src/clusters.jl @@ -2,8 +2,8 @@ function create_cluster(; name::Union{String,Nothing} = nothing, instance_type::Union{String,Nothing} = "m4.4xlarge", - max_num_workers::Union{Int,Nothing} = 8, - initial_num_workers::Union{Int,Nothing} = 0, + max_num_workers::Union{Int,Nothing} = 2048, + initial_num_workers::Union{Int,Nothing} = 16, min_num_workers::Union{Int,Nothing} = 0, iam_policy_arn::Union{String,Nothing} = nothing, s3_bucket_arn::Union{String,Nothing} = nothing, @@ -211,7 +211,7 @@ function wait_for_cluster(name::String=get_cluster_name(), kwargs...) elseif cluster_status == :terminated @info "Cluster $(name) no longer exists" elseif cluster_status != :creating && cluster_status != :updating - @info "Cluster $(name) set up has failed" + @info "Cluster $(name) setup has failed" # delete_cluster(name) end end \ No newline at end of file diff --git a/Banyan/src/jobs.jl b/Banyan/src/jobs.jl index aa7cabca..ae1a3462 100644 --- a/Banyan/src/jobs.jl +++ b/Banyan/src/jobs.jl @@ -39,7 +39,7 @@ get_cluster_name() = get_job().cluster_name function create_job(; cluster_name::Union{String,Nothing} = nothing, - nworkers::Union{Integer,Nothing} = 2, + nworkers::Union{Integer,Nothing} = 16, print_logs::Union{Bool,Nothing} = false, store_logs_in_s3::Union{Bool,Nothing} = true, store_logs_on_cluster::Union{Bool,Nothing} = false,