diff --git a/README.md b/README.md index 50f593004..5a7edc3f5 100644 --- a/README.md +++ b/README.md @@ -86,6 +86,22 @@ module "wandb" { - Run `terraform init` and `terraform apply` +## Cluster Sizing + +By default, the type of kubernetes instances, number of instances, redis cluster size, and database instance sizes are +standardized via configurations in [./deployment-size.tf](deployment-size.tf), and is configured via the `size` input +variable. + +Available sizes are, `small`, `medium`, `large`, `xlarge`, and `xxlarge`. Default is `small`. + +All the values set via `deployment-size.tf` can be overridden by setting the appropriate input variables. + +- `kubernetes_instance_types` - The instance type for the EKS nodes +- `kubernetes_min_nodes_per_az` - The minimum number of nodes in each AZ for the EKS cluster +- `kubernetes_max_nodes_per_az` - The maximum number of nodes in each AZ for the EKS cluster +- `elasticache_node_type` - The instance type for the redis cluster +- `database_instance_class` - The instance type for the database + ## Examples We have included documentation and reference examples for additional common @@ -124,7 +140,7 @@ Upgrades must be executed in step-wise fashion from one version to the next. You | Name | Version | |------|---------| -| [aws](#provider\_aws) | ~> 4.0 | +| [aws](#provider\_aws) | 4.67.0 | ## Modules @@ -164,12 +180,14 @@ Upgrades must be executed in step-wise fashion from one version to the next. You | [bucket\_kms\_key\_arn](#input\_bucket\_kms\_key\_arn) | n/a | `string` | `""` | no | | [bucket\_name](#input\_bucket\_name) | n/a | `string` | `""` | no | | [bucket\_path](#input\_bucket\_path) | path of where to store data for the instance-level bucket | `string` | `""` | no | +| [clickhouse\_endpoint\_service\_id](#input\_clickhouse\_endpoint\_service\_id) | The service ID of the VPC endpoint service for Clickhouse | `string` | `""` | no | +| [controller\_image\_tag](#input\_controller\_image\_tag) | Tag of the controller image to deploy | `string` | `"1.14.0"` | no | | [create\_bucket](#input\_create\_bucket) | ######################################### External Bucket # ######################################### Most users will not need these settings. They are ment for users who want a bucket and sqs that are in a different account. | `bool` | `true` | no | | [create\_elasticache](#input\_create\_elasticache) | Boolean indicating whether to provision an elasticache instance (true) or not (false). | `bool` | `true` | no | | [create\_vpc](#input\_create\_vpc) | Boolean indicating whether to deploy a VPC (true) or not (false). | `bool` | `true` | no | | [custom\_domain\_filter](#input\_custom\_domain\_filter) | A custom domain filter to be used by external-dns instead of the default FQDN. If not set, the local FQDN is used. 
| `string` | `null` | no | | [database\_binlog\_format](#input\_database\_binlog\_format) | Specifies the binlog\_format value to set for the database | `string` | `"ROW"` | no | -| [database\_engine\_version](#input\_database\_engine\_version) | Version for MySQL Auora | `string` | `"8.0.mysql_aurora.3.05.2"` | no | +| [database\_engine\_version](#input\_database\_engine\_version) | Version for MySQL Aurora | `string` | `"8.0.mysql_aurora.3.07.1"` | no | | [database\_innodb\_lru\_scan\_depth](#input\_database\_innodb\_lru\_scan\_depth) | Specifies the innodb\_lru\_scan\_depth value to set for the database | `number` | `128` | no | | [database\_instance\_class](#input\_database\_instance\_class) | Instance type to use by database master instance. | `string` | `"db.r5.large"` | no | | [database\_kms\_key\_arn](#input\_database\_kms\_key\_arn) | n/a | `string` | `""` | no | @@ -183,14 +201,16 @@ Upgrades must be executed in step-wise fashion from one version to the next. You | [eks\_cluster\_version](#input\_eks\_cluster\_version) | EKS cluster kubernetes version | `string` | n/a | yes | | [eks\_policy\_arns](#input\_eks\_policy\_arns) | Additional IAM policy to apply to the EKS cluster | `list(string)` | `[]` | no | | [elasticache\_node\_type](#input\_elasticache\_node\_type) | The type of the redis cache node to deploy | `string` | `"cache.t2.medium"` | no | -| [enable\_dummy\_dns](#input\_enable\_dummy\_dns) | Boolean indicating whether or not to enable dummy DNS for the old alb | `bool` | `false` | no | -| [enable\_operator\_alb](#input\_enable\_operator\_alb) | Boolean indicating whether to use operatore ALB (true) or not (false). | `bool` | `false` | no | +| [enable\_clickhouse](#input\_enable\_clickhouse) | Provision clickhouse resources | `bool` | `false` | no | | [enable\_yace](#input\_enable\_yace) | deploy yet another cloudwatch exporter to fetch aws resources metrics | `bool` | `true` | no | | [external\_dns](#input\_external\_dns) | Using external DNS. A `subdomain` must also be specified if this value is true. | `bool` | `false` | no | | [extra\_fqdn](#input\_extra\_fqdn) | Additional fqdn's must be in the same hosted zone as `domain_name`. | `list(string)` | `[]` | no | +| [kms\_clickhouse\_key\_alias](#input\_kms\_clickhouse\_key\_alias) | KMS key alias for AWS KMS Customer managed key used by Clickhouse CMEK. | `string` | `null` | no | +| [kms\_clickhouse\_key\_policy](#input\_kms\_clickhouse\_key\_policy) | The policy that will define the permissions for the clickhouse kms key. | `string` | `""` | no | | [kms\_key\_alias](#input\_kms\_key\_alias) | KMS key alias for AWS KMS Customer managed key. | `string` | `null` | no | | [kms\_key\_deletion\_window](#input\_kms\_key\_deletion\_window) | Duration in days to destroy the key after it is deleted. Must be between 7 and 30 days. | `number` | `7` | no | | [kms\_key\_policy](#input\_kms\_key\_policy) | The policy that will define the permissions for the kms key. | `string` | `""` | no | +| [kms\_key\_policy\_administrator\_arn](#input\_kms\_key\_policy\_administrator\_arn) | The principal that will be allowed to manage the kms key. | `string` | `""` | no | | [kubernetes\_alb\_internet\_facing](#input\_kubernetes\_alb\_internet\_facing) | Indicates whether or not the ALB controlled by the Amazon ALB ingress controller is internet-facing or internal. | `bool` | `true` | no | | [kubernetes\_alb\_subnets](#input\_kubernetes\_alb\_subnets) | List of subnet ID's the ALB will use for ingress traffic. 
| `list(string)` | `[]` | no | | [kubernetes\_instance\_types](#input\_kubernetes\_instance\_types) | EC2 Instance type for primary node group. | `list(string)` | [
  "m5.large"
]
[| no | @@ -212,6 +232,7 @@ Upgrades must be executed in step-wise fashion from one version to the next. You | [network\_private\_subnets](#input\_network\_private\_subnets) | A list of the identities of the private subnetworks in which resources will be deployed. | `list(string)` | `[]` | no | | [network\_public\_subnet\_cidrs](#input\_network\_public\_subnet\_cidrs) | List of private subnet CIDR ranges to create in VPC. | `list(string)` |
"m5.large"
]
[| no | | [network\_public\_subnets](#input\_network\_public\_subnets) | A list of the identities of the public subnetworks in which resources will be deployed. | `list(string)` | `[]` | no | +| [operator\_chart\_version](#input\_operator\_chart\_version) | Version of the operator chart to deploy | `string` | `"1.3.4"` | no | | [other\_wandb\_env](#input\_other\_wandb\_env) | Extra environment variables for W&B | `map(any)` | `{}` | no | | [parquet\_wandb\_env](#input\_parquet\_wandb\_env) | Extra environment variables for W&B | `map(string)` | `{}` | no | | [private\_link\_allowed\_account\_ids](#input\_private\_link\_allowed\_account\_ids) | List of AWS account IDs allowed to access the VPC Endpoint Service | `list(string)` | `[]` | no | @@ -246,7 +267,7 @@ Upgrades must be executed in step-wise fashion from one version to the next. You | [eks\_node\_count](#output\_eks\_node\_count) | n/a | | [eks\_node\_instance\_type](#output\_eks\_node\_instance\_type) | n/a | | [elasticache\_connection\_string](#output\_elasticache\_connection\_string) | n/a | -| [internal\_app\_port](#output\_internal\_app\_port) | n/a | +| [kms\_clickhouse\_key\_arn](#output\_kms\_clickhouse\_key\_arn) | The Amazon Resource Name of the KMS key used to encrypt Weave data at rest in Clickhouse. | | [kms\_key\_arn](#output\_kms\_key\_arn) | The Amazon Resource Name of the KMS key used to encrypt data at rest. | | [network\_id](#output\_network\_id) | The identity of the VPC in which resources are deployed. | | [network\_private\_subnets](#output\_network\_private\_subnets) | The identities of the private subnetworks deployed within the VPC. | @@ -263,6 +284,45 @@ Upgrades must be executed in step-wise fashion from one version to the next. You See our upgrade guide [here](./docs/operator-migration/readme.md) +### Upgrading from 4.x -> 5.x + +5.0.0 introduced autoscaling to the EKS cluster and made the `size` variable the preferred way to set the cluster size. +Previously, unless the `size` variable was set explicitly, there were default values for the following variables: +- `kubernetes_instance_types` +- `kubernetes_node_count` +- `elasticache_node_type` +- `database_instance_class` + +The `size` variable is now defaulted to `small`, and the following values to can be used to partially override the values +set by the `size` variable: +- `kubernetes_instance_types` +- `kubernetes_min_nodes_per_az` +- `kubernetes_max_nodes_per_az` +- `elasticache_node_type` +- `database_instance_class` + +For more information on the available sizes, see the [Cluster Sizing](#cluster-sizing) section. + +If having the cluster scale nodes in and out is not desired, the `kubernetes_min_nodes_per_az` and +`kubernetes_max_nodes_per_az` can be set to the same value to prevent the cluster from scaling. + +This upgrade is also intended to be used when upgrading eks to 1.29. + +We have upgraded the following dependencies and Kubernetes addons: + +- MySQL Aurora (8.0.mysql_aurora.3.07.1) +- redis (7.1) +- external-dns helm chart (v1.15.0) +- aws-efs-csi-driver (v2.0.7-eksbuild.1) +- aws-ebs-csi-driver (v1.35.0-eksbuild.1) +- coredns (v1.11.3-eksbuild.1) +- kube-proxy (v1.29.7-eksbuild.9) +- vpc-cni (v1.18.3-eksbuild.3) + +> :warning: Please remove the `enable_dummy_dns` and `enable_operator_alb` variables +> as they are no longer valid flags. They were provided to support older versions of +> the module that relied on an alb not created by the ingress controller. 
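+
+As an illustrative sketch only (inputs abridged; adjust the source, version, and the remaining
+required inputs for your environment), the following selects the `medium` size and pins the per-AZ
+node counts so the node groups do not scale:
+
+```hcl
+module "wandb" {
+  source  = "wandb/wandb/aws"
+  version = "~> 5.0"
+
+  namespace           = "wandb"
+  domain_name         = "example.com"
+  subdomain           = "wandb"
+  eks_cluster_version = "1.29"
+
+  # Start from the standardized "medium" profile in deployment-size.tf.
+  size = "medium"
+
+  # Pin the per-AZ node count so the cluster autoscaler cannot scale the
+  # node groups in or out; other sizing values still come from `size`.
+  kubernetes_min_nodes_per_az = 1
+  kubernetes_max_nodes_per_az = 1
+}
+```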
+ ### Upgrading from 3.x -> 4.x - If egress access for retrieving the wandb/controller image is not available, Terraform apply may experience failures. diff --git a/deployment-size.tf b/deployment-size.tf index f6aedbe6d..1c04edad1 100644 --- a/deployment-size.tf +++ b/deployment-size.tf @@ -6,34 +6,39 @@ locals { deployment_size = { small = { - db = "db.r6g.large", - node_count = 2, - node_instance = "r6i.xlarge" - cache = "cache.m6g.large" + db = "db.r6g.large", + min_nodes_per_az = 1, + max_nodes_per_az = 2, + node_instance = "r6i.xlarge" + cache = "cache.m6g.large" }, medium = { - db = "db.r6g.xlarge", - node_count = 2, - node_instance = "r6i.xlarge" - cache = "cache.m6g.large" + db = "db.r6g.xlarge", + min_nodes_per_az = 1, + max_nodes_per_az = 2, + node_instance = "r6i.xlarge" + cache = "cache.m6g.large" }, large = { - db = "db.r6g.2xlarge", - node_count = 2, - node_instance = "r6i.2xlarge" - cache = "cache.m6g.xlarge" + db = "db.r6g.2xlarge", + min_nodes_per_az = 1, + max_nodes_per_az = 2, + node_instance = "r6i.2xlarge" + cache = "cache.m6g.xlarge" }, xlarge = { - db = "db.r6g.4xlarge", - node_count = 3, - node_instance = "r6i.2xlarge" - cache = "cache.m6g.xlarge" + db = "db.r6g.4xlarge", + min_nodes_per_az = 1, + max_nodes_per_az = 2, + node_instance = "r6i.2xlarge" + cache = "cache.m6g.xlarge" }, xxlarge = { - db = "db.r6g.8xlarge", - node_count = 3, - node_instance = "r6i.4xlarge" - cache = "cache.m6g.2xlarge" + db = "db.r6g.8xlarge", + min_nodes_per_az = 1, + max_nodes_per_az = 3, + node_instance = "r6i.4xlarge" + cache = "cache.m6g.2xlarge" } } } \ No newline at end of file diff --git a/examples/byo-vpc-eks-sql-redis/main.tf b/examples/byo-vpc-eks-sql-redis/main.tf index c8e51ecf1..f2be8eee6 100644 --- a/examples/byo-vpc-eks-sql-redis/main.tf +++ b/examples/byo-vpc-eks-sql-redis/main.tf @@ -118,20 +118,14 @@ locals { module "app_lb" { source = "../../modules/app_lb" - namespace = var.namespace - load_balancing_scheme = var.public_access ? "PUBLIC" : "PRIVATE" - acm_certificate_arn = local.acm_certificate_arn - zone_id = var.zone_id - - fqdn = local.full_fqdn - extra_fqdn = local.extra_fqdn + namespace = var.namespace allowed_inbound_cidr = var.allowed_inbound_cidr allowed_inbound_ipv6_cidr = var.allowed_inbound_ipv6_cidr - target_port = local.internal_app_port - network_id = local.network_id - network_private_subnets = local.network_private_subnets - network_public_subnets = local.network_public_subnets + private_endpoint_cidr = var.allowed_private_endpoint_cidr + enable_private_only_traffic = var.enable_private_only_traffic + + network_id = local.network_id } module "private_link" { @@ -145,6 +139,9 @@ module "private_link" { alb_name = local.lb_name_truncated vpc_id = local.network_id + enable_private_only_traffic = var.enable_private_only_traffic + nlb_security_group = module.app_lb.nlb_security_group + depends_on = [ module.wandb ] diff --git a/examples/byo-vpc-eks-sql-redis/variables.tf b/examples/byo-vpc-eks-sql-redis/variables.tf index 2c8ff6171..993e8b569 100644 --- a/examples/byo-vpc-eks-sql-redis/variables.tf +++ b/examples/byo-vpc-eks-sql-redis/variables.tf @@ -237,6 +237,19 @@ variable "network_private_subnets" { type = list(string) } +variable "allowed_private_endpoint_cidr" { + description = "Private CIDRs allowed to access wandb-server." 
+ nullable = false + type = list(string) + default = [] +} + +variable "enable_private_only_traffic" { + description = "Enable private only traffic from customer private network" + type = bool + default = false +} + variable "network_public_subnets" { description = "A list of the identities of the public subnetworks in which resources will be deployed." type = list(string) diff --git a/examples/byo-vpc-eks/main.tf b/examples/byo-vpc-eks/main.tf index d31013c9f..53f60ab50 100644 --- a/examples/byo-vpc-eks/main.tf +++ b/examples/byo-vpc-eks/main.tf @@ -75,11 +75,11 @@ module "wandb_infra" { } data "aws_eks_cluster" "app_cluster" { - name = module.wandb_infra.cluster_id + name = module.wandb_infra.cluster_name } data "aws_eks_cluster_auth" "app_cluster" { - name = module.wandb_infra.cluster_id + name = module.wandb_infra.cluster_name } provider "kubernetes" { diff --git a/examples/byo-vpc-eks/variables.tf b/examples/byo-vpc-eks/variables.tf index 1ab09894d..4ddffbe8e 100644 --- a/examples/byo-vpc-eks/variables.tf +++ b/examples/byo-vpc-eks/variables.tf @@ -70,14 +70,12 @@ variable "bucket_kms_key_arn" { default = "" } - variable "allowed_inbound_cidr" { default = ["0.0.0.0/0"] nullable = false type = list(string) } - variable "allowed_inbound_ipv6_cidr" { default = ["::/0"] nullable = false diff --git a/examples/byo-vpc-sql/main.tf b/examples/byo-vpc-sql/main.tf index 55fbaf5cf..6e0dbb05c 100644 --- a/examples/byo-vpc-sql/main.tf +++ b/examples/byo-vpc-sql/main.tf @@ -23,11 +23,11 @@ data "aws_sqs_queue" "file_storage" { } data "aws_eks_cluster" "app_cluster" { - name = module.app_eks.cluster_id + name = module.app_eks.cluster_name } data "aws_eks_cluster_auth" "app_cluster" { - name = module.app_eks.cluster_id + name = module.app_eks.cluster_name } provider "kubernetes" { @@ -161,13 +161,12 @@ module "app_eks" { namespace = var.namespace kms_key_arn = local.kms_key_arn - instance_types = try([local.deployment_size[var.size].node_instance], var.kubernetes_instance_types) - desired_capacity = try(local.deployment_size[var.size].node_count, var.kubernetes_node_count) - map_accounts = var.kubernetes_map_accounts - map_roles = var.kubernetes_map_roles - map_users = var.kubernetes_map_users + instance_types = try([local.deployment_size[var.size].node_instance], var.kubernetes_instance_types) + map_accounts = var.kubernetes_map_accounts + map_roles = var.kubernetes_map_roles + map_users = var.kubernetes_map_users - bucket_kms_key_arn = local.use_external_bucket ? var.bucket_kms_key_arn : local.kms_key_arn + bucket_kms_key_arns = local.use_external_bucket ? var.bucket_kms_key_arn : local.kms_key_arn bucket_arn = data.aws_s3_bucket.file_storage.arn bucket_sqs_queue_arn = local.use_internal_queue ? null : data.aws_sqs_queue.file_storage.0.arn @@ -202,20 +201,14 @@ locals { module "app_lb" { source = "../../modules/app_lb" - namespace = var.namespace - load_balancing_scheme = var.public_access ? 
"PUBLIC" : "PRIVATE" - acm_certificate_arn = local.acm_certificate_arn - zone_id = var.zone_id - - fqdn = local.full_fqdn - extra_fqdn = local.extra_fqdn + namespace = var.namespace allowed_inbound_cidr = var.allowed_inbound_cidr allowed_inbound_ipv6_cidr = var.allowed_inbound_ipv6_cidr - target_port = local.internal_app_port - network_id = local.network_id - network_private_subnets = local.network_private_subnets - network_public_subnets = local.network_public_subnets + private_endpoint_cidr = var.allowed_private_endpoint_cidr + enable_private_only_traffic = var.enable_private_only_traffic + + network_id = local.network_id } module "private_link" { @@ -229,6 +222,9 @@ module "private_link" { alb_name = local.lb_name_truncated vpc_id = local.network_id + enable_private_only_traffic = var.enable_private_only_traffic + nlb_security_group = module.app_lb.nlb_security_group + depends_on = [ module.wandb ] diff --git a/examples/byo-vpc-sql/variables.tf b/examples/byo-vpc-sql/variables.tf index aadff404f..7581ceb61 100644 --- a/examples/byo-vpc-sql/variables.tf +++ b/examples/byo-vpc-sql/variables.tf @@ -232,6 +232,19 @@ variable "network_private_subnets" { type = list(string) } +variable "allowed_private_endpoint_cidr" { + description = "Private CIDRs allowed to access wandb-server." + nullable = false + type = list(string) + default = [] +} + +variable "enable_private_only_traffic" { + description = "Enable private only traffic from customer private network" + type = bool + default = false +} + variable "network_public_subnets" { description = "A list of the identities of the public subnetworks in which resources will be deployed." type = list(string) @@ -411,6 +424,7 @@ variable "bucket_kms_key_arn" { default = "" } + ########################################## # Redis # ########################################## diff --git a/examples/byo-vpc/main.tf b/examples/byo-vpc/main.tf index 34896373d..85aaed5bf 100644 --- a/examples/byo-vpc/main.tf +++ b/examples/byo-vpc/main.tf @@ -19,9 +19,6 @@ module "wandb_infra" { public_access = true external_dns = true - enable_dummy_dns = var.enable_dummy_dns - enable_operator_alb = var.enable_operator_alb - deletion_protection = true create_vpc = false @@ -63,11 +60,11 @@ module "wandb_infra" { } data "aws_eks_cluster" "app_cluster" { - name = module.wandb_infra.cluster_id + name = module.wandb_infra.cluster_name } data "aws_eks_cluster_auth" "app_cluster" { - name = module.wandb_infra.cluster_id + name = module.wandb_infra.cluster_name } provider "kubernetes" { diff --git a/examples/byo-vpc/variables.tf b/examples/byo-vpc/variables.tf index c44730894..eaf0771b1 100644 --- a/examples/byo-vpc/variables.tf +++ b/examples/byo-vpc/variables.tf @@ -103,18 +103,6 @@ variable "other_wandb_env" { default = {} } -variable "enable_operator_alb" { - type = bool - default = false - description = "Boolean indicating whether to use operatore ALB (true) or not (false)." 
-} - -variable "enable_dummy_dns" { - type = bool - default = false - description = "Boolean indicating whether or not to enable dummy DNS for the old alb" -} - variable "vpc_id" { type = string description = "VPC network ID" diff --git a/examples/public-dns-external/main.tf b/examples/public-dns-external/main.tf index 04eeda8d5..df34e7650 100644 --- a/examples/public-dns-external/main.tf +++ b/examples/public-dns-external/main.tf @@ -28,7 +28,7 @@ module "wandb_infra" { allowed_inbound_cidr = var.allowed_inbound_cidr allowed_inbound_ipv6_cidr = ["::/0"] - eks_cluster_version = "1.26" + eks_cluster_version = "1.29" kubernetes_public_access = true kubernetes_public_access_cidrs = ["0.0.0.0/0"] @@ -53,11 +53,11 @@ module "wandb_infra" { } data "aws_eks_cluster" "app_cluster" { - name = module.wandb_infra.cluster_id + name = module.wandb_infra.cluster_name } data "aws_eks_cluster_auth" "app_cluster" { - name = module.wandb_infra.cluster_id + name = module.wandb_infra.cluster_name } provider "kubernetes" { @@ -84,35 +84,6 @@ provider "helm" { } } -module "wandb_app" { - source = "wandb/wandb/kubernetes" - version = "1.12.0" - - license = var.wandb_license - - host = module.wandb_infra.url - bucket = "s3://${module.wandb_infra.bucket_name}" - bucket_path = var.bucket_path - bucket_aws_region = module.wandb_infra.bucket_region - bucket_queue = "internal://" - bucket_kms_key_arn = module.wandb_infra.kms_key_arn - database_connection_string = "mysql://${module.wandb_infra.database_connection_string}" - redis_connection_string = "redis://${module.wandb_infra.elasticache_connection_string}?tls=true&ttlInSeconds=604800" - - wandb_image = var.wandb_image - wandb_version = var.wandb_version - - service_port = module.wandb_infra.internal_app_port - - # If we dont wait, tf will start trying to deploy while the work group is - # still spinning up - depends_on = [module.wandb_infra] - - other_wandb_env = merge({ - "GORILLA_CUSTOMER_SECRET_STORE_SOURCE" = "aws-secretmanager://${var.namespace}?namespace=${var.namespace}" - }, var.other_wandb_env) -} - output "bucket_name" { value = module.wandb_infra.bucket_name } diff --git a/examples/public-dns-with-route53/main.tf b/examples/public-dns-with-route53/main.tf index f3448dd3a..b1a0efc7d 100644 --- a/examples/public-dns-with-route53/main.tf +++ b/examples/public-dns-with-route53/main.tf @@ -27,11 +27,11 @@ module "wandb_infra" { } data "aws_eks_cluster" "app_cluster" { - name = module.wandb_infra.cluster_id + name = module.wandb_infra.cluster_name } data "aws_eks_cluster_auth" "app_cluster" { - name = module.wandb_infra.cluster_id + name = module.wandb_infra.cluster_name } provider "kubernetes" { diff --git a/examples/standard/main.tf b/examples/standard/main.tf index aae263ea9..5cdc964eb 100644 --- a/examples/standard/main.tf +++ b/examples/standard/main.tf @@ -42,11 +42,11 @@ module "wandb_infra" { } data "aws_eks_cluster" "app_cluster" { - name = module.wandb_infra.cluster_id + name = module.wandb_infra.cluster_name } data "aws_eks_cluster_auth" "app_cluster" { - name = module.wandb_infra.cluster_id + name = module.wandb_infra.cluster_name } provider "kubernetes" { @@ -73,7 +73,6 @@ provider "helm" { } } - output "bucket_name" { value = module.wandb_infra.bucket_name } diff --git a/main.tf b/main.tf index 8bece8a3b..5d41c2c31 100644 --- a/main.tf +++ b/main.tf @@ -21,6 +21,11 @@ locals { use_external_bucket = var.bucket_name != "" s3_kms_key_arn = local.use_external_bucket || var.bucket_kms_key_arn != "" ? 
var.bucket_kms_key_arn : local.default_kms_key use_internal_queue = local.use_external_bucket || var.use_internal_queue + elasticache_node_type = coalesce(var.elasticache_node_type, local.deployment_size[var.size].cache) + database_instance_class = coalesce(var.database_instance_class, local.deployment_size[var.size].db) + kubernetes_instance_types = coalesce(var.kubernetes_instance_types, [local.deployment_size[var.size].node_instance]) + kubernetes_min_nodes_per_az = coalesce(var.kubernetes_min_nodes_per_az, local.deployment_size[var.size].min_nodes_per_az) + kubernetes_max_nodes_per_az = coalesce(var.kubernetes_max_nodes_per_az, local.deployment_size[var.size].max_nodes_per_az) } module "file_storage" { @@ -84,7 +89,7 @@ module "database" { database_name = var.database_name master_username = var.database_master_username - instance_class = try(local.deployment_size[var.size].db, var.database_instance_class) + instance_class = local.database_instance_class engine_version = var.database_engine_version snapshot_identifier = var.database_snapshot_identifier sort_buffer_size = var.database_sort_buffer_size @@ -136,11 +141,13 @@ module "app_eks" { namespace = var.namespace kms_key_arn = local.default_kms_key - instance_types = try([local.deployment_size[var.size].node_instance], var.kubernetes_instance_types) - desired_capacity = try(local.deployment_size[var.size].node_count, var.kubernetes_node_count) - map_accounts = var.kubernetes_map_accounts - map_roles = var.kubernetes_map_roles - map_users = var.kubernetes_map_users + instance_types = local.kubernetes_instance_types + min_nodes = local.kubernetes_min_nodes_per_az + max_nodes = local.kubernetes_max_nodes_per_az + + map_accounts = var.kubernetes_map_accounts + map_roles = var.kubernetes_map_roles + map_users = var.kubernetes_map_users bucket_kms_key_arns = compact([ local.default_kms_key, @@ -173,30 +180,17 @@ module "app_eks" { aws_loadbalancer_controller_tags = var.aws_loadbalancer_controller_tags } -locals { - full_fqdn = var.enable_dummy_dns ? "old.${local.fqdn}" : local.fqdn - extra_fqdn = var.enable_dummy_dns ? [for fqdn in var.extra_fqdn : "old.${fqdn}"] : var.extra_fqdn -} module "app_lb" { source = "./modules/app_lb" - namespace = var.namespace - load_balancing_scheme = var.public_access ? 
"PUBLIC" : "PRIVATE" - acm_certificate_arn = local.acm_certificate_arn - zone_id = var.zone_id + namespace = var.namespace - fqdn = local.full_fqdn - extra_fqdn = local.extra_fqdn allowed_inbound_cidr = var.allowed_inbound_cidr allowed_inbound_ipv6_cidr = var.allowed_inbound_ipv6_cidr - target_port = local.internal_app_port network_id = local.network_id - network_private_subnets = local.network_private_subnets - network_public_subnets = local.network_public_subnets enable_private_only_traffic = var.private_only_traffic private_endpoint_cidr = var.allowed_private_endpoint_cidr - } module "private_link" { @@ -211,16 +205,7 @@ module "private_link" { vpc_id = local.network_id enable_private_only_traffic = var.private_only_traffic nlb_security_group = module.app_lb.nlb_security_group - depends_on = [ - module.app_lb, - module.wandb - ] -} - -resource "aws_autoscaling_attachment" "autoscaling_attachment" { - for_each = module.app_eks.autoscaling_group_names - autoscaling_group_name = each.value - lb_target_group_arn = module.app_lb.tg_app_arn + depends_on = [module.app_lb] } locals { @@ -240,7 +225,7 @@ module "redis" { vpc_id = local.network_id redis_subnet_group_name = local.network_elasticache_subnet_group_name vpc_subnets_cidr_blocks = local.network_elasticache_subnet_cidrs - node_type = try(local.deployment_size[var.size].cache, var.elasticache_node_type) + node_type = local.elasticache_node_type kms_key_arn = local.database_kms_key_arn } @@ -316,12 +301,12 @@ module "wandb" { "alb.ingress.kubernetes.io/listen-ports" = "[{\\\"HTTPS\\\": 443}]" "alb.ingress.kubernetes.io/certificate-arn" = local.acm_certificate_arn }, - length(var.extra_fqdn) > 0 && var.enable_dummy_dns ? { + length(var.extra_fqdn) > 0 ? { "external-dns.alpha.kubernetes.io/hostname" = <<-EOF ${local.fqdn}\,${join("\\,", var.extra_fqdn)}\,${local.fqdn} EOF } : { - "external-dns.alpha.kubernetes.io/hostname" = var.enable_operator_alb ? local.fqdn : "" + "external-dns.alpha.kubernetes.io/hostname" = local.fqdn }, length(var.kubernetes_alb_subnets) > 0 ? { "alb.ingress.kubernetes.io/subnets" = <<-EOF @@ -331,11 +316,7 @@ module "wandb" { } - app = var.enable_operator_alb ? {} : { - extraEnv = merge({ - "GORILLA_GLUE_LIST" = "true" - }, var.app_wandb_env) - } + app = {} # To support otel rds and redis metrics, we need operator-wandb chart min version 0.13.8 (yace subchart) yace = var.enable_yace ? 
{ @@ -371,4 +352,3 @@ module "wandb" { } } } - diff --git a/modules/app_eks/add-ons.tf b/modules/app_eks/add-ons.tf index 3db67c0f7..d41b908ef 100644 --- a/modules/app_eks/add-ons.tf +++ b/modules/app_eks/add-ons.tf @@ -27,48 +27,45 @@ resource "aws_iam_role" "oidc" { assume_role_policy = data.aws_iam_policy_document.oidc_assume_role.json } - - -### add-ons for eks version 1.28 - +### add-ons for eks version 1.29 resource "aws_eks_addon" "aws_efs_csi_driver" { - depends_on = [ - aws_eks_addon.vpc_cni - ] - cluster_name = var.namespace - addon_name = "aws-efs-csi-driver" - addon_version = "v2.0.4-eksbuild.1" - resolve_conflicts = "OVERWRITE" + depends_on = [ + aws_eks_addon.vpc_cni + ] + cluster_name = var.namespace + addon_name = "aws-efs-csi-driver" + addon_version = "v2.0.7-eksbuild.1" + resolve_conflicts = "OVERWRITE" } resource "aws_eks_addon" "aws_ebs_csi_driver" { depends_on = [ aws_eks_addon.vpc_cni ] - cluster_name = var.namespace - addon_name = "aws-ebs-csi-driver" - addon_version = "v1.31.0-eksbuild.1" - resolve_conflicts = "OVERWRITE" + cluster_name = var.namespace + addon_name = "aws-ebs-csi-driver" + addon_version = "v1.35.0-eksbuild.1" + resolve_conflicts = "OVERWRITE" } resource "aws_eks_addon" "coredns" { depends_on = [ aws_eks_addon.vpc_cni ] - cluster_name = var.namespace - addon_name = "coredns" - addon_version = "v1.10.1-eksbuild.11" - resolve_conflicts = "OVERWRITE" + cluster_name = var.namespace + addon_name = "coredns" + addon_version = "v1.11.3-eksbuild.1" + resolve_conflicts = "OVERWRITE" } resource "aws_eks_addon" "kube_proxy" { depends_on = [ aws_eks_addon.vpc_cni ] - cluster_name = var.namespace - addon_name = "kube-proxy" - addon_version = "v1.28.8-eksbuild.5" - resolve_conflicts = "OVERWRITE" + cluster_name = var.namespace + addon_name = "kube-proxy" + addon_version = "v1.29.7-eksbuild.9" + resolve_conflicts = "OVERWRITE" } resource "aws_eks_addon" "vpc_cni" { @@ -77,7 +74,7 @@ resource "aws_eks_addon" "vpc_cni" { ] cluster_name = var.namespace addon_name = "vpc-cni" - addon_version = "v1.18.2-eksbuild.1" + addon_version = "v1.18.3-eksbuild.3" resolve_conflicts = "OVERWRITE" service_account_role_arn = aws_iam_role.oidc.arn } diff --git a/modules/app_eks/cluster_autoscaler/ClusterAutoscaler.json b/modules/app_eks/cluster_autoscaler/ClusterAutoscaler.json new file mode 100644 index 000000000..28dbbe8f5 --- /dev/null +++ b/modules/app_eks/cluster_autoscaler/ClusterAutoscaler.json @@ -0,0 +1,34 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "autoscaling:DescribeAutoScalingGroups", + "autoscaling:DescribeAutoScalingInstances", + "autoscaling:DescribeLaunchConfigurations", + "autoscaling:DescribeScalingActivities", + "ec2:DescribeImages", + "ec2:DescribeInstanceTypes", + "ec2:DescribeLaunchTemplateVersions", + "ec2:GetInstanceTypesFromInstanceRequirements", + "eks:DescribeNodegroup" + ], + "Resource": ["*"] + }, + { + "Effect": "Allow", + "Action": [ + "autoscaling:SetDesiredCapacity", + "autoscaling:TerminateInstanceInAutoScalingGroup" + ], + "Resource": ["*"], + "Condition": { + "StringEquals": { + "aws:ResourceTag/k8s.io/cluster-autoscaler/enabled": "true", + "aws:ResourceTag/k8s.io/cluster-autoscaler/${namespace}": "owned" + } + } + } + ] +} diff --git a/modules/app_eks/cluster_autoscaler/cluster_autoscaler.tf b/modules/app_eks/cluster_autoscaler/cluster_autoscaler.tf new file mode 100644 index 000000000..b7930fc76 --- /dev/null +++ b/modules/app_eks/cluster_autoscaler/cluster_autoscaler.tf @@ -0,0 +1,49 @@ +data 
"aws_region" "current" {} + +resource "helm_release" "cluster-autoscaler" { + chart = "cluster-autoscaler" + name = "cluster-autoscaler" + repository = "https://kubernetes.github.io/autoscaler" + namespace = "cluster-autoscaler" + create_namespace = true + + set { + name = "fullnameOverride" + value = "cluster-autoscaler" + } + + set { + name = "autoDiscovery.clusterName" + value = var.namespace + } + + set { + name = "awsRegion" + value = data.aws_region.current.name + } + + set { + name = "rbac.serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn" + value = aws_iam_role.default.arn + } + + set { + name = "extraArgs.balance-similar-node-groups" + value = "true" + } + + set { + name = "extraArgs.balancing-ignore-label" + value = "eks.amazonaws.com/nodegroup" + } + + set { + name = "extraArgs.balancing-ignore-label" + value = "eks.amazonaws.com/sourceLaunchTemplateId" + } + + set { + name = "extraArgs.balancing-ignore-label" + value = "topology.ebs.csi.aws.com/zone" + } +} \ No newline at end of file diff --git a/modules/app_eks/cluster_autoscaler/iam.tf b/modules/app_eks/cluster_autoscaler/iam.tf new file mode 100644 index 000000000..8e880a783 --- /dev/null +++ b/modules/app_eks/cluster_autoscaler/iam.tf @@ -0,0 +1,32 @@ +data "aws_iam_policy_document" "default" { + statement { + actions = ["sts:AssumeRoleWithWebIdentity"] + effect = "Allow" + + condition { + test = "StringLike" + variable = "${replace(var.oidc_provider.url, "https://", "")}:sub" + values = ["system:serviceaccount:cluster-autoscaler:*"] + } + + principals { + identifiers = [var.oidc_provider.arn] + type = "Federated" + } + } +} + +resource "aws_iam_role" "default" { + assume_role_policy = data.aws_iam_policy_document.default.json + name = "${var.namespace}-cluster-autoscaler" +} + +resource "aws_iam_policy" "default" { + policy = templatefile("${path.module}/ClusterAutoscaler.json", { namespace = var.namespace }) + name = "${var.namespace}-cluster-autoscaler" +} + +resource "aws_iam_role_policy_attachment" "default" { + role = aws_iam_role.default.name + policy_arn = aws_iam_policy.default.arn +} \ No newline at end of file diff --git a/modules/app_eks/cluster_autoscaler/variables.tf b/modules/app_eks/cluster_autoscaler/variables.tf new file mode 100644 index 000000000..49fe5944b --- /dev/null +++ b/modules/app_eks/cluster_autoscaler/variables.tf @@ -0,0 +1,10 @@ +variable "namespace" { + type = string +} + +variable "oidc_provider" { + type = object({ + arn = string + url = string + }) +} diff --git a/modules/app_eks/external_dns/external_dns.tf b/modules/app_eks/external_dns/external_dns.tf index f23d67d80..53b7f29f1 100644 --- a/modules/app_eks/external_dns/external_dns.tf +++ b/modules/app_eks/external_dns/external_dns.tf @@ -2,7 +2,7 @@ resource "helm_release" "external_dns" { name = "external-dns" namespace = "kube-system" chart = "external-dns" - version = "1.14.1" + version = "1.15.0" repository = "https://kubernetes-sigs.github.io/external-dns" set { diff --git a/modules/app_eks/main.tf b/modules/app_eks/main.tf index 2e8620b51..652c19292 100644 --- a/modules/app_eks/main.tf +++ b/modules/app_eks/main.tf @@ -13,6 +13,10 @@ locals { create_launch_template = (local.encrypt_ebs_volume || local.system_reserved != "") } +data "aws_subnet" "private" { + count = length(var.network_private_subnets) + id = var.network_private_subnets[count.index] +} module "eks" { source = "terraform-aws-modules/eks/aws" @@ -41,25 +45,30 @@ module "eks" { } ] : null + # node_security_group_enable_recommended_rules = false 
worker_additional_security_group_ids = [aws_security_group.primary_workers.id] + node_groups_defaults = { + create_launch_template = local.create_launch_template, + disk_encrypted = local.encrypt_ebs_volume, + disk_kms_key_id = var.kms_key_arn, + disk_type = "gp3" + enable_monitoring = true + force_update_version = local.encrypt_ebs_volume, + iam_role_arn = aws_iam_role.node.arn, + instance_types = var.instance_types, + kubelet_extra_args = local.system_reserved != "" ? "--system-reserved=${local.system_reserved}" : "", + metadata_http_put_response_hop_limit = 2 + metadata_http_tokens = "required", + version = var.cluster_version, + } node_groups = { - primary = { - create_launch_template = local.create_launch_template, - desired_capacity = var.desired_capacity, - disk_encrypted = local.encrypt_ebs_volume, - disk_kms_key_id = var.kms_key_arn, - disk_type = "gp3" - enable_monitoring = true - force_update_version = local.encrypt_ebs_volume, - iam_role_arn = aws_iam_role.node.arn, - instance_types = var.instance_types, - kubelet_extra_args = local.system_reserved != "" ? "--system-reserved=${local.system_reserved}" : "", - max_capacity = 5, - metadata_http_put_response_hop_limit = 2 - metadata_http_tokens = "required", - min_capacity = var.desired_capacity, - version = var.cluster_version, + for idx, subnet in data.aws_subnet.private : "ng-${idx}" => { + subnets = [subnet.id] + name_prefix = "${var.namespace}-${regex(".*[[:digit:]]([[:alpha:]])", subnet.availability_zone)[0]}" + desired_capacity = var.min_nodes + max_capacity = var.max_nodes + min_capacity = var.min_nodes } } @@ -75,7 +84,7 @@ resource "kubernetes_annotations" "gp2" { api_version = "storage.k8s.io/v1" kind = "StorageClass" force = "true" - depends_on = [module.eks] + depends_on = [module.eks] metadata { name = "gp2" @@ -92,14 +101,14 @@ resource "kubernetes_storage_class" "gp3" { "storageclass.kubernetes.io/is-default-class" = "true" } } - depends_on = [kubernetes_annotations.gp2] + depends_on = [kubernetes_annotations.gp2] storage_provisioner = "kubernetes.io/aws-ebs" parameters = { fsType = "ext4" - type = "gp3" + type = "gp3" } - reclaim_policy = "Delete" - volume_binding_mode = "WaitForFirstConsumer" + reclaim_policy = "Delete" + volume_binding_mode = "WaitForFirstConsumer" allow_volume_expansion = true } @@ -169,3 +178,12 @@ module "external_dns" { depends_on = [module.eks] } + +module "cluster_autoscaler" { + source = "./cluster_autoscaler" + + namespace = var.namespace + oidc_provider = aws_iam_openid_connect_provider.eks + + depends_on = [module.eks] +} diff --git a/modules/app_eks/outputs.tf b/modules/app_eks/outputs.tf index cc791455f..b50376a3d 100644 --- a/modules/app_eks/outputs.tf +++ b/modules/app_eks/outputs.tf @@ -1,7 +1,7 @@ output "autoscaling_group_names" { value = { for name, value in module.eks.node_groups : name => lookup(lookup(lookup(value, "resources")[0], "autoscaling_groups")[0], "name") } } -output "cluster_id" { +output "cluster_name" { value = module.eks.cluster_id description = "ID of the created EKS cluster" } @@ -20,4 +20,4 @@ output "primary_workers_security_group_id" { output "aws_iam_openid_connect_provider" { value = aws_iam_openid_connect_provider.eks.url -} \ No newline at end of file +} diff --git a/modules/app_eks/variables.tf b/modules/app_eks/variables.tf index ff2d4ce6c..46171f791 100644 --- a/modules/app_eks/variables.tf +++ b/modules/app_eks/variables.tf @@ -116,7 +116,13 @@ variable "service_port" { default = 32543 } -variable "desired_capacity" { +variable "min_nodes" { + 
description = "Desired number of worker nodes." + type = number + default = 2 +} + +variable "max_nodes" { description = "Desired number of worker nodes." type = number default = 2 diff --git a/modules/app_lb/main.tf b/modules/app_lb/main.tf index 7a52be369..3ba70bd79 100644 --- a/modules/app_lb/main.tf +++ b/modules/app_lb/main.tf @@ -76,137 +76,3 @@ resource "aws_security_group_rule" "alb_https_traffic" { security_group_id = aws_security_group.inbound.id source_security_group_id = aws_security_group.inbound_private[0].id } - - -resource "aws_security_group" "outbound" { - name = "${var.namespace}-alb-outbound" - vpc_id = var.network_id - description = "Allow all traffic outbound from W&B" - - egress { - from_port = 0 - to_port = 0 - protocol = "-1" - cidr_blocks = ["0.0.0.0/0"] - ipv6_cidr_blocks = ["::/0"] - } -} - -resource "aws_lb" "alb" { - name = "${var.namespace}-alb" - internal = (var.load_balancing_scheme == "PRIVATE") - load_balancer_type = "application" - security_groups = [aws_security_group.inbound.id, aws_security_group.outbound.id] - subnets = var.load_balancing_scheme == "PRIVATE" ? var.network_private_subnets : var.network_public_subnets -} - -locals { - https_enabled = var.acm_certificate_arn != null -} - -# The acm_certificate_arn is conditionally created depending on other resources. -# Terraform needs to know how many resources to create at apply time. Therefore, -# we must always create a http and https listener. - -# Create http target group if http is not enabled -resource "aws_lb_listener" "http" { - load_balancer_arn = aws_lb.alb.arn - port = local.http_port - protocol = "HTTP" - - # HTTPS Enabled - dynamic "default_action" { - for_each = local.https_enabled ? [1] : [] - content { - type = "redirect" - - redirect { - port = local.https_port - protocol = "HTTPS" - status_code = "HTTP_301" - } - } - } - - # HTTPS Disabled - dynamic "default_action" { - for_each = local.https_enabled ? [] : [1] - content { - type = "forward" - target_group_arn = aws_lb_target_group.app.arn - } - } -} - -resource "aws_lb_listener" "https" { - load_balancer_arn = aws_lb.alb.arn - port = local.https_port - - protocol = local.https_enabled ? "HTTPS" : "HTTP" - ssl_policy = local.https_enabled ? var.ssl_policy : null - certificate_arn = local.https_enabled ? var.acm_certificate_arn : null - - # HTTPS Enabled - dynamic "default_action" { - for_each = local.https_enabled ? [1] : [] - content { - type = "forward" - target_group_arn = aws_lb_target_group.app.arn - } - } - - # HTTPS Disabled - dynamic "default_action" { - for_each = local.https_enabled ? [] : [1] - content { - type = "redirect" - - redirect { - port = local.http_port - protocol = "HTTP" - status_code = "HTTP_301" - } - } - } -} - -resource "aws_lb_target_group" "app" { - name = "${var.namespace}-tg-app" - port = var.target_port - vpc_id = var.network_id - protocol = "HTTP" - - health_check { - path = "/healthz" - protocol = "HTTP" - healthy_threshold = 5 - unhealthy_threshold = 2 - matcher = "200" - } -} - -# Create record for route53 zone. 
-resource "aws_route53_record" "alb" { - zone_id = var.zone_id - name = var.fqdn - type = "A" - - alias { - name = aws_lb.alb.dns_name - zone_id = aws_lb.alb.zone_id - evaluate_target_health = true - } -} - -resource "aws_route53_record" "extra" { - for_each = toset(var.extra_fqdn) - zone_id = var.zone_id - name = each.value - type = "A" - - alias { - name = aws_lb.alb.dns_name - zone_id = aws_lb.alb.zone_id - evaluate_target_health = true - } -} diff --git a/modules/app_lb/outputs.tf b/modules/app_lb/outputs.tf index 20724c32c..b74bedf7e 100644 --- a/modules/app_lb/outputs.tf +++ b/modules/app_lb/outputs.tf @@ -1,23 +1,6 @@ -output "dns_name" { - value = aws_lb.alb.dns_name -} - output "security_group_inbound_id" { value = aws_security_group.inbound.id } - -output "lb_arn" { - value = aws_lb.alb.arn -} - -output "tg_app_arn" { - value = aws_lb_target_group.app.arn -} - -output "alb_name" { -value = aws_lb.alb.arn -} - output "nlb_security_group" { - value = var.enable_private_only_traffic? aws_security_group.inbound_private[0].id : null -} \ No newline at end of file + value = var.enable_private_only_traffic ? aws_security_group.inbound_private[0].id : null +} diff --git a/modules/app_lb/variables.tf b/modules/app_lb/variables.tf index bc1ab76d8..8e2f2d7b9 100644 --- a/modules/app_lb/variables.tf +++ b/modules/app_lb/variables.tf @@ -4,44 +4,12 @@ variable "namespace" { description = "(Required) String used for prefix resources." } -variable "acm_certificate_arn" { - type = string - description = "(Optional) The ARN of an existing ACM certificate." - default = null -} - variable "ssl_policy" { type = string default = "ELBSecurityPolicy-FS-1-2-Res-2020-10" description = "(Optional) SSL policy to use on ALB listener" } -variable "zone_id" { - type = string - description = "(Required) The zone ID of the route53 to create the application A record in." -} - -variable "fqdn" { - type = string - description = "(Required) Fully qualified domain name." -} - -variable "extra_fqdn" { - type = list(string) - default = [] -} - -variable "load_balancing_scheme" { - default = "PRIVATE" - description = "(Optional) Load Balancing Scheme. Supported values are: \"PRIVATE\"; \"PUBLIC\"." - type = string - - validation { - condition = contains(["PRIVATE", "PUBLIC"], var.load_balancing_scheme) - error_message = "The load_balancer_scheme value must be one of: \"PRIVATE\"; \"PUBLIC\"." - } -} - variable "allowed_inbound_cidr" { description = "CIDRs allowed to access wandb-server." type = list(string) @@ -59,23 +27,6 @@ variable "network_id" { type = string } -variable "network_private_subnets" { - description = "(Required) A list of the identities of the private subnetworks in which the MySQL Aurora instances will be deployed." - type = list(string) -} - -variable "network_public_subnets" { - default = [] - description = "(Optional) A list of the identities of the public subnetworks in which resources will be deployed." 
- type = list(string) -} - -variable "target_port" { - type = number - default = 32543 -} - - variable "private_endpoint_cidr" { description = "List of CIDR blocks allowed to access the wandb-server" type = list(string) @@ -84,4 +35,4 @@ variable "private_endpoint_cidr" { variable "enable_private_only_traffic" { description = "Boolean flag to create sg" type = bool -} \ No newline at end of file +} diff --git a/modules/file_storage/main.tf b/modules/file_storage/main.tf index 39ced492a..3afb858fa 100644 --- a/modules/file_storage/main.tf +++ b/modules/file_storage/main.tf @@ -11,7 +11,6 @@ resource "aws_sqs_queue" "file_storage" { # kms_master_key_id = var.kms_key_arn } - resource "aws_s3_bucket" "file_storage" { bucket = "${var.namespace}-file-storage-${random_pet.file_storage.id}" @@ -67,9 +66,6 @@ resource "aws_s3_bucket_server_side_encryption_configuration" "file_storage" { } } - - - # Give the bucket permission to send messages onto the queue. Looks like we # overide this value. resource "aws_sqs_queue_policy" "file_storage" { diff --git a/modules/iam_role/main.tf b/modules/iam_role/main.tf index 42c15c0cb..899943bc3 100644 --- a/modules/iam_role/main.tf +++ b/modules/iam_role/main.tf @@ -23,7 +23,6 @@ resource "aws_iam_role" "irsa" { }) } - resource "aws_iam_policy" "irsa" { name = "${var.namespace}-yace-irsa-policy" description = "IRSA IAM Policy" @@ -48,4 +47,4 @@ resource "aws_iam_policy" "irsa" { resource "aws_iam_role_policy_attachment" "default" { role = aws_iam_role.irsa.name policy_arn = aws_iam_policy.irsa.arn -} \ No newline at end of file +} diff --git a/modules/kms/main.tf b/modules/kms/main.tf index bc0519daa..9763ceecf 100644 --- a/modules/kms/main.tf +++ b/modules/kms/main.tf @@ -136,8 +136,6 @@ resource "aws_kms_key" "clickhouse_key" { } } - - resource "aws_kms_alias" "clickhouse_key" { count = var.create_clickhouse_key ? 1 : 0 diff --git a/modules/private_link/main.tf b/modules/private_link/main.tf index bb2989c07..ed68098d5 100644 --- a/modules/private_link/main.tf +++ b/modules/private_link/main.tf @@ -1,6 +1,6 @@ locals { max_lb_name_length = 32 - length("-nlb") - lb_name_truncated = var.enable_private_only_traffic ? "${substr(var.namespace, 0, local.max_lb_name_length)}-private-link-nlb" : "${substr(var.namespace, 0, local.max_lb_name_length)}-nlb" + lb_name_truncated = var.enable_private_only_traffic ? "${substr(var.namespace, 0, local.max_lb_name_length)}-private-link-nlb" : "${substr(var.namespace, 0, local.max_lb_name_length)}-nlb" } resource "aws_lb" "nlb" { @@ -9,10 +9,10 @@ resource "aws_lb" "nlb" { load_balancer_type = "network" subnets = var.network_private_subnets enable_deletion_protection = var.deletion_protection - security_groups = var.enable_private_only_traffic ? [var.nlb_security_group] : [] -lifecycle { - create_before_destroy = true -} + security_groups = var.enable_private_only_traffic ? 
[var.nlb_security_group] : [] + lifecycle { + create_before_destroy = true + } } resource "aws_lb_target_group" "nlb" { diff --git a/modules/redis/main.tf b/modules/redis/main.tf index 649207b54..b7b1be4c9 100644 --- a/modules/redis/main.tf +++ b/modules/redis/main.tf @@ -1,5 +1,5 @@ locals { - redis_version = "6.x" + redis_version = "7.1" } resource "aws_elasticache_subnet_group" "default" { @@ -15,7 +15,7 @@ resource "aws_elasticache_replication_group" "default" { port = 6379 node_type = var.node_type - parameter_group_name = "default.redis6.x" + parameter_group_name = "default.redis7" engine_version = local.redis_version automatic_failover_enabled = true diff --git a/outputs.tf b/outputs.tf index 04d1c6af9..0ba202835 100644 --- a/outputs.tf +++ b/outputs.tf @@ -11,8 +11,8 @@ output "bucket_region" { value = data.aws_s3_bucket.file_storage.region } -output "cluster_id" { - value = module.app_eks.cluster_id +output "cluster_name" { + value = module.app_eks.cluster_name } output "cluster_node_role" { @@ -32,23 +32,23 @@ output "database_password" { } output "database_instance_type" { - value = try(local.deployment_size[var.size].db, var.database_instance_class) + value = local.database_instance_class } output "elasticache_connection_string" { value = var.create_elasticache ? module.redis.0.connection_string : null } -output "eks_node_count" { - value = try(local.deployment_size[var.size].node_count, var.kubernetes_node_count) +output "eks_min_nodes_per_az" { + value = local.kubernetes_min_nodes_per_az } -output "eks_node_instance_type" { - value = try([local.deployment_size[var.size].node_instance], var.kubernetes_instance_types) +output "eks_max_nodes_per_az" { + value = local.kubernetes_max_nodes_per_az } -output "internal_app_port" { - value = local.internal_app_port +output "eks_node_instance_type" { + value = local.kubernetes_instance_types } output "kms_key_arn" { @@ -73,12 +73,12 @@ output "network_private_subnets" { } output "network_public_subnets" { - value = local.network_public_subnets + value = var.allowed_inbound_cidr description = "The identities of the public subnetworks deployed within the VPC." } output "redis_instance_type" { - value = try(local.deployment_size[var.size].cache, var.elasticache_node_type) + value = local.elasticache_node_type } output "standardized_size" { diff --git a/variables.tf b/variables.tf index e7ee4174c..1c295a5a7 100644 --- a/variables.tf +++ b/variables.tf @@ -18,10 +18,9 @@ variable "use_internal_queue" { } variable "size" { - default = null - description = "Deployment size" - nullable = true + description = "Deployment size for the instance" type = string + default = "small" } ########################################## @@ -43,15 +42,15 @@ variable "controller_image_tag" { # Database # ########################################## variable "database_engine_version" { - description = "Version for MySQL Auora" + description = "Version for MySQL Aurora" type = string - default = "8.0.mysql_aurora.3.05.2" + default = "8.0.mysql_aurora.3.07.1" } variable "database_instance_class" { - description = "Instance type to use by database master instance." + description = "Instance type to use by database master instance. Defaults to null and value from deployment-size.tf is used" type = string - default = "db.r5.large" + default = null } variable "database_snapshot_identifier" { @@ -146,19 +145,6 @@ variable "subdomain" { description = "Subdomain for accessing the Weights & Biases UI. Default creates record at Route53 Route." 
} -variable "enable_dummy_dns" { - type = bool - default = false - description = "Boolean indicating whether or not to enable dummy DNS for the old alb" -} - - -variable "enable_operator_alb" { - type = bool - default = false - description = "Boolean indicating whether to use operatore ALB (true) or not (false)." -} - variable "extra_fqdn" { type = list(string) description = "Additional fqdn's must be in the same hosted zone as `domain_name`." @@ -192,7 +178,6 @@ variable "allowed_inbound_ipv6_cidr" { type = list(string) } - ########################################## # KMS # ########################################## @@ -346,7 +331,6 @@ variable "kubernetes_public_access" { default = false } - variable "kubernetes_public_access_cidrs" { description = "List of CIDR blocks which can access the Amazon EKS public API server endpoint." type = list(string) @@ -380,15 +364,21 @@ variable "kubernetes_map_users" { } variable "kubernetes_instance_types" { - description = "EC2 Instance type for primary node group." + description = "EC2 Instance type for primary node group. Defaults to null and value from deployment-size.tf is used" type = list(string) - default = ["m5.large"] + default = null +} + +variable "kubernetes_min_nodes_per_az" { + description = "Minimum number of nodes for the EKS cluster. Defaults to null and value from deployment-size.tf is used" + type = number + default = null } -variable "kubernetes_node_count" { - description = "Number of nodes" +variable "kubernetes_max_nodes_per_az" { + description = "Maximum number of nodes for the EKS cluster. Defaults to null and value from deployment-size.tf is used" type = number - default = 2 + default = null } variable "eks_policy_arns" { @@ -472,9 +462,9 @@ variable "create_elasticache" { } variable "elasticache_node_type" { - description = "The type of the redis cache node to deploy" + description = "The type of the redis cache node to deploy. Defaults to null and value from deployment-size.tf is used" type = string - default = "cache.t2.medium" + default = null } ##########################################
"10.10.0.0/24",
"10.10.1.0/24"
]
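
A note on how the new `null` defaults above resolve: the root `main.tf` uses `coalesce()`, so an unset input falls back to the `size`-derived value from `deployment-size.tf`. A minimal, self-contained sketch of that fallback (names and values here are illustrative, not part of the patch):

```hcl
variable "size" {
  type    = string
  default = "small"
}

variable "elasticache_node_type" {
  type    = string
  default = null
}

locals {
  # Illustrative stand-in for the profiles defined in deployment-size.tf.
  deployment_size = {
    small = { cache = "cache.m6g.large" }
  }

  # First non-null value wins: an explicit override beats the size profile.
  elasticache_node_type = coalesce(var.elasticache_node_type, local.deployment_size[var.size].cache)
}

output "resolved_elasticache_node_type" {
  value = local.elasticache_node_type # "cache.m6g.large" unless overridden
}
```

Setting `elasticache_node_type` (or any of the other sizing inputs) explicitly in the calling configuration therefore takes precedence over the `size`-derived value.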