From 94a594ae33e653115efaa02f54bb076f32c07f11 Mon Sep 17 00:00:00 2001 From: amanpruthi Date: Tue, 4 Jun 2024 13:29:10 +0530 Subject: [PATCH 1/5] feat: added support for stackdriver and otel metrics --- main.tf | 78 ++++++++++++++++++++++----- modules/app_gke/main.tf | 15 +++++- modules/app_gke/variables.tf | 5 ++ modules/service_accounts/main.tf | 24 +++++++++ modules/service_accounts/outputs.tf | 4 ++ modules/service_accounts/variables.tf | 15 ++++++ variables.tf | 15 ++++++ 7 files changed, 143 insertions(+), 13 deletions(-) diff --git a/main.tf b/main.tf index d26147a9..62e3e177 100644 --- a/main.tf +++ b/main.tf @@ -29,10 +29,13 @@ locals { } module "service_accounts" { - source = "./modules/service_accounts" - namespace = var.namespace - bucket_name = var.bucket_name - depends_on = [module.project_factory_project_services] + source = "./modules/service_accounts" + namespace = var.namespace + bucket_name = var.bucket_name + account_id = var.workload_account_id + service_account_name = var.service_account_name + enable_stackdriver = var.enable_stackdriver + depends_on = [module.project_factory_project_services] } module "kms" { @@ -77,14 +80,15 @@ locals { } module "app_gke" { - source = "./modules/app_gke" - namespace = var.namespace - machine_type = coalesce(try(local.deployment_size[var.size].node_instance, null), var.gke_machine_type) - node_count = coalesce(try(local.deployment_size[var.size].node_count, null), var.gke_node_count) - network = local.network - subnetwork = local.subnetwork - service_account = module.service_accounts.service_account - depends_on = [module.project_factory_project_services] + source = "./modules/app_gke" + namespace = var.namespace + machine_type = coalesce(try(local.deployment_size[var.size].node_instance, null), var.gke_machine_type) + node_count = coalesce(try(local.deployment_size[var.size].node_count, null), var.gke_node_count) + network = local.network + subnetwork = local.subnetwork + service_account = module.service_accounts.service_account + create_workload_identity = var.enable_stackdriver + depends_on = [module.project_factory_project_services] } module "app_lb" { @@ -186,6 +190,8 @@ locals { } : {} } +data "google_client_config" "current" {} + module "wandb" { source = "wandb/wandb/helm" version = "1.2.0" @@ -241,6 +247,54 @@ module "wandb" { "ingress.gcp.kubernetes.io/pre-shared-cert" = module.app_lb.certificate } } + # To support otel rds and redis metrics need operator-wandb chart minimum version 0.13.8 ( stackdriver subchart) + stackdriver = var.enable_stackdriver ? { + install = true + stackdriver = { + projectId = data.google_client_config.current.project + } + serviceAccount = { annotations = { "iam.gke.io/gcp-service-account" = module.service_accounts.monitoring_role } } + } : { + install = false + stackdriver = {} + serviceAccount = {} + } + + otel = { + daemonset = var.enable_stackdriver ? { + config = { + receivers = { + prometheus = { + config = { + scrape_configs = [ + { job_name = "stackdriver" + scheme = "http" + metrics_path = "/metrics" + dns_sd_configs = [ + { names = ["stackdriver"] + type = "A" + port = 9255 + } + ] + } + ] + } + } + } + service = { + pipelines = { + metrics = { + receivers = ["hostmetrics", "k8s_cluster", "kubeletstats", "prometheus"] + } + } + } + } + } : { config = { + receivers = {} + service = {} + } + } + } redis = { install = false } mysql = { install = false } diff --git a/modules/app_gke/main.tf b/modules/app_gke/main.tf index e57cbea6..5027a223 100644 --- a/modules/app_gke/main.tf +++ b/modules/app_gke/main.tf @@ -1,3 +1,9 @@ +data "google_client_config" "current" {} + +locals { + project_id = data.google_client_config.current.project +} + resource "google_container_cluster" "default" { name = "${var.namespace}-cluster" @@ -11,7 +17,14 @@ resource "google_container_cluster" "default" { evaluation_mode = "PROJECT_SINGLETON_POLICY_ENFORCE" } - + # Conditionally enable workload identity + dynamic "workload_identity_config" { + for_each = var.create_workload_identity == true ? [1] : [] + content { + workload_pool = "${local.project_id}.svc.id.goog" + } + } + ip_allocation_policy { cluster_ipv4_cidr_block = "/14" services_ipv4_cidr_block = "/19" diff --git a/modules/app_gke/variables.tf b/modules/app_gke/variables.tf index a9ec7408..fa502bb7 100644 --- a/modules/app_gke/variables.tf +++ b/modules/app_gke/variables.tf @@ -43,4 +43,9 @@ variable "parquet_wandb_env" { variable "node_count" { type = number +} + +variable "create_workload_identity" { + description = "Flag to indicate whether to enable workload identity for the service account." + type = bool } \ No newline at end of file diff --git a/modules/service_accounts/main.tf b/modules/service_accounts/main.tf index 724e7d73..ca85630e 100644 --- a/modules/service_accounts/main.tf +++ b/modules/service_accounts/main.tf @@ -1,4 +1,5 @@ data "google_client_config" "current" {} +data "google_project" "project" {} resource "random_id" "main" { # 30 bytes ensures that enough characters are generated to satisfy the service account ID requirements, regardless of @@ -60,3 +61,26 @@ resource "google_project_iam_member" "secretmanager_admin" { member = local.sa_member role = "roles/secretmanager.admin" } + + +resource "google_service_account" "workload-identity-user-sa" { + count = var.enable_stackdriver == true ? 1 : 0 + account_id = "stackdriver" + display_name = "Service Account For Workload Identity" + +} + +resource "google_project_iam_member" "monitoring-role" { + count = var.enable_stackdriver == true ? 1 : 0 + project = local.project_id + role = "roles/monitoring.viewer" + member = "serviceAccount:${google_service_account.workload-identity-user-sa[count.index].email}" +} + + +resource "google_project_iam_member" "workload_identity-role" { + count = var.enable_stackdriver == true ? 1 : 0 + project = local.project_id + role = "roles/iam.workloadIdentityUser" + member = "serviceAccount:${local.project_id}.svc.id.goog[default/${var.service_account_name}]" +} \ No newline at end of file diff --git a/modules/service_accounts/outputs.tf b/modules/service_accounts/outputs.tf index 0ed66faa..ba84de5d 100644 --- a/modules/service_accounts/outputs.tf +++ b/modules/service_accounts/outputs.tf @@ -2,4 +2,8 @@ output "service_account" { value = google_service_account.main description = "The service account." +} + +output "monitoring_role" { + value = var.enable_stackdriver == true ? google_service_account.workload-identity-user-sa[0].email : null } \ No newline at end of file diff --git a/modules/service_accounts/variables.tf b/modules/service_accounts/variables.tf index e4d4bb8d..6cc76753 100644 --- a/modules/service_accounts/variables.tf +++ b/modules/service_accounts/variables.tf @@ -7,4 +7,19 @@ variable "bucket_name" { type = string description = "Existing bucket the service account will access" default = "" +} + +variable "account_id" { + description = "The ID of the Google Cloud Platform (GCP) account." + type = string +} + +variable "service_account_name" { + description = "The name of the service account." + type = string +} + +variable "enable_stackdriver" { + description = "Flag to indicate whether to enable workload identity for the service account." + type = bool } \ No newline at end of file diff --git a/variables.tf b/variables.tf index a2cbff87..57aa6588 100644 --- a/variables.tf +++ b/variables.tf @@ -253,3 +253,18 @@ variable "parquet_wandb_env" { description = "Extra environment variables for W&B" default = {} } + +variable "enable_stackdriver" { + type = bool + default = false +} + +variable "workload_account_id" { + type = string + default = "stackdriver" +} + +variable "service_account_name" { + type = string + default = "stackdriver" +} \ No newline at end of file From 521da673ab3ab0f2e8a31970c7be63c6cee45031 Mon Sep 17 00:00:00 2001 From: amanpruthi Date: Tue, 4 Jun 2024 13:31:31 +0530 Subject: [PATCH 2/5] fixed checks --- main.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/main.tf b/main.tf index 62e3e177..80d81bd7 100644 --- a/main.tf +++ b/main.tf @@ -255,9 +255,9 @@ module "wandb" { } serviceAccount = { annotations = { "iam.gke.io/gcp-service-account" = module.service_accounts.monitoring_role } } } : { - install = false - stackdriver = {} - serviceAccount = {} + install = false + stackdriver = {} + serviceAccount = {} } otel = { From 5f7cf05457ad68426a0d20221d20b4f807aaf72b Mon Sep 17 00:00:00 2001 From: amanpruthi Date: Thu, 6 Jun 2024 18:05:05 +0530 Subject: [PATCH 3/5] fixed tmp service account issue --- main.tf | 8 ++++---- modules/service_accounts/main.tf | 3 +-- variables.tf | 8 +++++++- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/main.tf b/main.tf index 80d81bd7..d00db54e 100644 --- a/main.tf +++ b/main.tf @@ -87,7 +87,7 @@ module "app_gke" { network = local.network subnetwork = local.subnetwork service_account = module.service_accounts.service_account - create_workload_identity = var.enable_stackdriver + create_workload_identity = var.create_workload_identity depends_on = [module.project_factory_project_services] } @@ -255,9 +255,9 @@ module "wandb" { } serviceAccount = { annotations = { "iam.gke.io/gcp-service-account" = module.service_accounts.monitoring_role } } } : { - install = false - stackdriver = {} - serviceAccount = {} + install = false + stackdriver = {} + serviceAccount = {} } otel = { diff --git a/modules/service_accounts/main.tf b/modules/service_accounts/main.tf index ca85630e..b498ef29 100644 --- a/modules/service_accounts/main.tf +++ b/modules/service_accounts/main.tf @@ -74,10 +74,9 @@ resource "google_project_iam_member" "monitoring-role" { count = var.enable_stackdriver == true ? 1 : 0 project = local.project_id role = "roles/monitoring.viewer" - member = "serviceAccount:${google_service_account.workload-identity-user-sa[count.index].email}" + member = local.sa_member } - resource "google_project_iam_member" "workload_identity-role" { count = var.enable_stackdriver == true ? 1 : 0 project = local.project_id diff --git a/variables.tf b/variables.tf index 57aa6588..36c30a30 100644 --- a/variables.tf +++ b/variables.tf @@ -256,7 +256,7 @@ variable "parquet_wandb_env" { variable "enable_stackdriver" { type = bool - default = false + default = true } variable "workload_account_id" { @@ -267,4 +267,10 @@ variable "workload_account_id" { variable "service_account_name" { type = string default = "stackdriver" +} + +variable "create_workload_identity" { + description = "Flag to indicate whether to create a workload identity for the service account." + type = bool + default = true } \ No newline at end of file From 7005447315ad68bad35b2d49ed25374b837bed84 Mon Sep 17 00:00:00 2001 From: amanpruthi Date: Thu, 6 Jun 2024 18:11:17 +0530 Subject: [PATCH 4/5] fixed lint --- main.tf | 2 +- variables.tf | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/main.tf b/main.tf index d00db54e..62e3e177 100644 --- a/main.tf +++ b/main.tf @@ -87,7 +87,7 @@ module "app_gke" { network = local.network subnetwork = local.subnetwork service_account = module.service_accounts.service_account - create_workload_identity = var.create_workload_identity + create_workload_identity = var.enable_stackdriver depends_on = [module.project_factory_project_services] } diff --git a/variables.tf b/variables.tf index 36c30a30..5b5a5813 100644 --- a/variables.tf +++ b/variables.tf @@ -267,10 +267,4 @@ variable "workload_account_id" { variable "service_account_name" { type = string default = "stackdriver" -} - -variable "create_workload_identity" { - description = "Flag to indicate whether to create a workload identity for the service account." - type = bool - default = true } \ No newline at end of file From 368f0cc3ee46b0f8c2cd52da6824eac782735ca6 Mon Sep 17 00:00:00 2001 From: amanpruthi Date: Fri, 7 Jun 2024 12:02:07 +0530 Subject: [PATCH 5/5] fixed sa issue --- modules/service_accounts/main.tf | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/modules/service_accounts/main.tf b/modules/service_accounts/main.tf index b498ef29..3dc03374 100644 --- a/modules/service_accounts/main.tf +++ b/modules/service_accounts/main.tf @@ -67,19 +67,25 @@ resource "google_service_account" "workload-identity-user-sa" { count = var.enable_stackdriver == true ? 1 : 0 account_id = "stackdriver" display_name = "Service Account For Workload Identity" - } resource "google_project_iam_member" "monitoring-role" { count = var.enable_stackdriver == true ? 1 : 0 project = local.project_id role = "roles/monitoring.viewer" - member = local.sa_member + member = "serviceAccount:${google_service_account.workload-identity-user-sa[count.index].email}" } -resource "google_project_iam_member" "workload_identity-role" { +resource "google_service_account_iam_member" "monitoring-role" { count = var.enable_stackdriver == true ? 1 : 0 - project = local.project_id + service_account_id = google_service_account.workload-identity-user-sa[count.index].id + role = "roles/iam.serviceAccountTokenCreator" + member = "serviceAccount:${google_service_account.workload-identity-user-sa[count.index].email}" +} + +resource "google_service_account_iam_member" "workload_identity-role" { + count = var.enable_stackdriver == true ? 1 : 0 + service_account_id = google_service_account.workload-identity-user-sa[count.index].id role = "roles/iam.workloadIdentityUser" member = "serviceAccount:${local.project_id}.svc.id.goog[default/${var.service_account_name}]" } \ No newline at end of file