From ede7b800dacc26554ed9684d9a6a53d3b4188235 Mon Sep 17 00:00:00 2001 From: snoopy82481 Date: Tue, 4 Oct 2022 15:37:06 -0600 Subject: [PATCH 01/18] feat(monitoring): Add charts --- k8s/global/flux/repositories/helm/grafana-charts.yaml | 10 ++++++++++ k8s/global/flux/repositories/helm/kustomization.yaml | 1 + 2 files changed, 11 insertions(+) create mode 100644 k8s/global/flux/repositories/helm/grafana-charts.yaml diff --git a/k8s/global/flux/repositories/helm/grafana-charts.yaml b/k8s/global/flux/repositories/helm/grafana-charts.yaml new file mode 100644 index 0000000000..7b1df329e9 --- /dev/null +++ b/k8s/global/flux/repositories/helm/grafana-charts.yaml @@ -0,0 +1,10 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1beta2 +kind: HelmRepository +metadata: + name: grafana + namespace: flux-system +spec: + interval: 30m + url: https://grafana.github.io/helm-charts + timeout: 3m diff --git a/k8s/global/flux/repositories/helm/kustomization.yaml b/k8s/global/flux/repositories/helm/kustomization.yaml index d8087279e5..fbb10f4df9 100644 --- a/k8s/global/flux/repositories/helm/kustomization.yaml +++ b/k8s/global/flux/repositories/helm/kustomization.yaml @@ -15,3 +15,4 @@ resources: - sealed-secrets-charts.yaml - bitnami-charts.yaml - kubereboot-charts.yaml + - grafana-charts.yaml From 3336d3b072e7b981e4da144fec73b1325b22a8a2 Mon Sep 17 00:00:00 2001 From: snoopy82481 Date: Tue, 4 Oct 2022 15:42:40 -0600 Subject: [PATCH 02/18] feat(postgres): Add database for monitoring --- .../base/databases/kustomization.yaml | 5 ++ .../clusters/base/databases/namespace.yaml | 7 +++ .../base/databases/postgres/cluster.yaml | 18 ++++++ .../base/databases/postgres/helm-release.yaml | 33 +++++++++++ .../databases/postgres/kustomization.yaml | 17 ++++++ .../base/databases/postgres/secret.sops.yaml | 58 +++++++++++++++++++ 6 files changed, 138 insertions(+) create mode 100644 k8s/namespaces/clusters/base/databases/kustomization.yaml create mode 100644 
k8s/namespaces/clusters/base/databases/namespace.yaml create mode 100644 k8s/namespaces/clusters/base/databases/postgres/cluster.yaml create mode 100644 k8s/namespaces/clusters/base/databases/postgres/helm-release.yaml create mode 100644 k8s/namespaces/clusters/base/databases/postgres/kustomization.yaml create mode 100644 k8s/namespaces/clusters/base/databases/postgres/secret.sops.yaml diff --git a/k8s/namespaces/clusters/base/databases/kustomization.yaml b/k8s/namespaces/clusters/base/databases/kustomization.yaml new file mode 100644 index 0000000000..809cbe53b4 --- /dev/null +++ b/k8s/namespaces/clusters/base/databases/kustomization.yaml @@ -0,0 +1,5 @@ +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - namespace.yaml diff --git a/k8s/namespaces/clusters/base/databases/namespace.yaml b/k8s/namespaces/clusters/base/databases/namespace.yaml new file mode 100644 index 0000000000..139865c6c8 --- /dev/null +++ b/k8s/namespaces/clusters/base/databases/namespace.yaml @@ -0,0 +1,7 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: databases + labels: + kustomize.toolkit.fluxcd.io/prune: disabled diff --git a/k8s/namespaces/clusters/base/databases/postgres/cluster.yaml b/k8s/namespaces/clusters/base/databases/postgres/cluster.yaml new file mode 100644 index 0000000000..3bc15113a6 --- /dev/null +++ b/k8s/namespaces/clusters/base/databases/postgres/cluster.yaml @@ -0,0 +1,18 @@ +--- +apiVersion: postgresql.cnpg.io/v1 +kind: Cluster +metadata: + name: postgres + namespace: databases + annotations: + kyverno.io/ignore: "true" +spec: + instances: 3 + primaryUpdateStrategy: unsupervised + storage: + size: 10Gi + storageClass: ceph-block + superuserSecret: + name: postgres-superuser + monitoring: + enablePodMonitor: true diff --git a/k8s/namespaces/clusters/base/databases/postgres/helm-release.yaml b/k8s/namespaces/clusters/base/databases/postgres/helm-release.yaml new file mode 100644 index 0000000000..6df1dad927 --- /dev/null +++ 
b/k8s/namespaces/clusters/base/databases/postgres/helm-release.yaml @@ -0,0 +1,33 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2beta1 +kind: HelmRelease +metadata: + name: postgres + namespace: databases +spec: + interval: 15m + chart: + spec: + chart: cloudnative-pg + version: 0.15.0 + sourceRef: + kind: HelmRepository + name: cloudnative-pg-charts + namespace: flux-system + test: + enable: false + install: + remediation: + retries: 5 + upgrade: + remediation: + retries: 5 + remediateLastFailure: true + cleanupOnFail: true + rollback: + timeout: 10m + recreate: true + cleanupOnFail: true + values: + crds: + create: false diff --git a/k8s/namespaces/clusters/base/databases/postgres/kustomization.yaml b/k8s/namespaces/clusters/base/databases/postgres/kustomization.yaml new file mode 100644 index 0000000000..afc49a4461 --- /dev/null +++ b/k8s/namespaces/clusters/base/databases/postgres/kustomization.yaml @@ -0,0 +1,17 @@ +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - secret.sops.yaml + - helm-release.yaml + - cluster.yaml +configMapGenerator: + - name: cloudnative-pg-dashboard + files: + - cloudnative-pg-dashboard.json=https://raw.githubusercontent.com/cloudnative-pg/charts/main/charts/cnpg-sandbox/dashboard.json +generatorOptions: + disableNameSuffixHash: true + annotations: + kustomize.toolkit.fluxcd.io/substitute: disabled + labels: + grafana_dashboard: "true" diff --git a/k8s/namespaces/clusters/base/databases/postgres/secret.sops.yaml b/k8s/namespaces/clusters/base/databases/postgres/secret.sops.yaml new file mode 100644 index 0000000000..3f9c185ba0 --- /dev/null +++ b/k8s/namespaces/clusters/base/databases/postgres/secret.sops.yaml @@ -0,0 +1,58 @@ +# yamllint disable +apiVersion: v1 +kind: Secret +metadata: + name: postgres-superuser + namespace: databases +stringData: + username: ENC[AES256_GCM,data:FD8BSiM5fBts4Aw=,iv:Kq6646pYuvWxOv3PWATY3NMnkZhIA9/rMJugal/nDjE=,tag:XxmFZrnvQ/RUyWXQ/C2JQQ==,type:str] + password: 
ENC[AES256_GCM,data:UuNEPCydjrIMBe0=,iv:FQ/CxQq9Mt0bXkeGsB/7vxsrZ+olSFy/Rlt1T/l1kYE=,tag:+/ebf9ZjQ/npUsRjzE5h6g==,type:str] +sops: + kms: [] + gcp_kms: [] + azure_kv: [] + hc_vault: [] + age: + - recipient: age1eynu35v0tpg9remal6zeecfeg9e84a2qxake027wwgdn02rdfcls7nyv8r + enc: | + -----BEGIN AGE ENCRYPTED FILE----- + YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSA0TENNN3V6MnVxSWlBRmVz + ekNaRjZ3MlJXVk1oZ2pZWExYYVlLM0oveDJnCklpYnVTWUQwZHJHNDR2S3U3MEQ0 + WWg4SGZWK2IrQ1FyeFk5Qmd4clc3L00KLS0tIGlvWTRLMjVjVTFlTlREVm5SRUJG + Z1R3RFV5bER5Q0VBb3RnK0diNXpNcVkK3U21Y5GWvnmPA8hxi8Us7TkNGsCYAvlD + QmY8mT6ApdiczqVo1DgFmKDSMIYNGL2wlyyriu9MLCU8a9tGxmj47g== + -----END AGE ENCRYPTED FILE----- + lastmodified: "2022-10-04T21:03:20Z" + mac: ENC[AES256_GCM,data:iAgS0g0u73yTGjLl4OyQsuoqYYe5CE9Ad1QQ0+HYRiBS7+GEy2A5DhqwlYJuJlVTz1fdAmWq/O/3pJkNyPqOU3o/tIAuyQTwpw4tLqqJ0Dut3LnyjzzMV7lE5cMODmNfx+WhpcjKRAC2iCT50bDskEc6k4FGexfi0fs0wkUPbEw=,iv:zOLY8Xi0f+PY+6EX8vM8gc/HxOw5cRzdSa2OAzqolsM=,tag:oxeflPdBwjfmmM7viFO1FQ==,type:str] + pgp: [] + encrypted_regex: ((?i)(pass|secret($|[^N])|key|token|^data$|^stringData)) + version: 3.7.3 +--- +apiVersion: v1 +kind: Secret +metadata: + name: grafana + namespace: databases +stringData: + GF_DATABASE_USER: ENC[AES256_GCM,data:41KaDmtK5BkuTh0=,iv:dTL9A5UvgvTYvUSmNpBiuPnKRIumFz0uRYCMWRBcdRM=,tag:ZVjBTuvRzQMkEXjhYOfsTw==,type:str] + GF_DATABASE_PASSWORD: ENC[AES256_GCM,data:1lVcej/PghZeSKg=,iv:Sbu7yX8y/pSHucdUetVeiIohcCyrkBox4EPO37GJfBo=,tag:eNixQE4plTppva+hoZJXeQ==,type:str] +sops: + kms: [] + gcp_kms: [] + azure_kv: [] + hc_vault: [] + age: + - recipient: age1eynu35v0tpg9remal6zeecfeg9e84a2qxake027wwgdn02rdfcls7nyv8r + enc: | + -----BEGIN AGE ENCRYPTED FILE----- + YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSA0TENNN3V6MnVxSWlBRmVz + ekNaRjZ3MlJXVk1oZ2pZWExYYVlLM0oveDJnCklpYnVTWUQwZHJHNDR2S3U3MEQ0 + WWg4SGZWK2IrQ1FyeFk5Qmd4clc3L00KLS0tIGlvWTRLMjVjVTFlTlREVm5SRUJG + Z1R3RFV5bER5Q0VBb3RnK0diNXpNcVkK3U21Y5GWvnmPA8hxi8Us7TkNGsCYAvlD + 
QmY8mT6ApdiczqVo1DgFmKDSMIYNGL2wlyyriu9MLCU8a9tGxmj47g== + -----END AGE ENCRYPTED FILE----- + lastmodified: "2022-10-04T21:03:20Z" + mac: ENC[AES256_GCM,data:iAgS0g0u73yTGjLl4OyQsuoqYYe5CE9Ad1QQ0+HYRiBS7+GEy2A5DhqwlYJuJlVTz1fdAmWq/O/3pJkNyPqOU3o/tIAuyQTwpw4tLqqJ0Dut3LnyjzzMV7lE5cMODmNfx+WhpcjKRAC2iCT50bDskEc6k4FGexfi0fs0wkUPbEw=,iv:zOLY8Xi0f+PY+6EX8vM8gc/HxOw5cRzdSa2OAzqolsM=,tag:oxeflPdBwjfmmM7viFO1FQ==,type:str] + pgp: [] + encrypted_regex: ((?i)(pass|secret($|[^N])|key|token|^data$|^stringData)) + version: 3.7.3 From 1aa019b04a428148af66d07c3a9271ac8b0220c8 Mon Sep 17 00:00:00 2001 From: snoopy82481 Date: Tue, 4 Oct 2022 15:43:51 -0600 Subject: [PATCH 03/18] feat(monitoring): Namespace creation --- k8s/namespaces/clusters/base/monitoring/kustomization.yaml | 5 +++++ k8s/namespaces/clusters/base/monitoring/namespace.yaml | 7 +++++++ 2 files changed, 12 insertions(+) create mode 100644 k8s/namespaces/clusters/base/monitoring/kustomization.yaml create mode 100644 k8s/namespaces/clusters/base/monitoring/namespace.yaml diff --git a/k8s/namespaces/clusters/base/monitoring/kustomization.yaml b/k8s/namespaces/clusters/base/monitoring/kustomization.yaml new file mode 100644 index 0000000000..809cbe53b4 --- /dev/null +++ b/k8s/namespaces/clusters/base/monitoring/kustomization.yaml @@ -0,0 +1,5 @@ +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - namespace.yaml diff --git a/k8s/namespaces/clusters/base/monitoring/namespace.yaml b/k8s/namespaces/clusters/base/monitoring/namespace.yaml new file mode 100644 index 0000000000..ef4dd87a43 --- /dev/null +++ b/k8s/namespaces/clusters/base/monitoring/namespace.yaml @@ -0,0 +1,7 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: monitoring + labels: + kustomize.toolkit.fluxcd.io/prune: disabled From 6d6cb97a3b7fcd14e6c49dee72feaafce53deaf3 Mon Sep 17 00:00:00 2001 From: snoopy82481 Date: Tue, 4 Oct 2022 15:45:19 -0600 Subject: [PATCH 04/18] feat(grafana): Add grafana --- 
.../base/monitoring/grafana/helm-release.yaml | 327 ++++++++++++++++++ .../monitoring/grafana/kustomization.yaml | 7 + .../base/monitoring/grafana/secret.sops.yaml | 29 ++ 3 files changed, 363 insertions(+) create mode 100644 k8s/namespaces/clusters/base/monitoring/grafana/helm-release.yaml create mode 100644 k8s/namespaces/clusters/base/monitoring/grafana/kustomization.yaml create mode 100644 k8s/namespaces/clusters/base/monitoring/grafana/secret.sops.yaml diff --git a/k8s/namespaces/clusters/base/monitoring/grafana/helm-release.yaml b/k8s/namespaces/clusters/base/monitoring/grafana/helm-release.yaml new file mode 100644 index 0000000000..1702d8cbe9 --- /dev/null +++ b/k8s/namespaces/clusters/base/monitoring/grafana/helm-release.yaml @@ -0,0 +1,327 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2beta1 +kind: HelmRelease +metadata: + name: &app grafana + namespace: monitoring +spec: + interval: 15m + chart: + spec: + chart: grafana + version: 6.40.1 + sourceRef: + kind: HelmRepository + name: grafana + namespace: flux-system + test: + enable: false + install: + remediation: + retries: 5 + upgrade: + remediation: + retries: 5 + remediateLastFailure: true + cleanupOnFail: true + rollback: + timeout: 10m + recreate: true + cleanupOnFail: true + values: + replicas: 1 + deploymentStrategy: + type: Recreate + + env: + GF_EXPLORE_ENABLED: true + GF_PANELS_DISABLE_SANITIZE_HTML: true + GF_LOG_FILTERS: rendering:debug + GF_DATE_FORMATS_USE_BROWSER_LOCALE: true + GF_DATE_FORMATS_FULL_DATE: "MMM Do, YYYY hh:mm:ss a" + GF_SECURITY_ALLOW_EMBEDDING: true + + admin: + existingSecret: grafana-admin + + grafana.ini: + server: + root_url: "https://grafana.${SECRET_DOMAIN}" + paths: + data: /var/lib/grafana/data + logs: /var/log/grafana + plugins: /var/lib/grafana/plugins + provisioning: /etc/grafana/provisioning + analytics: + check_for_updates: false + log: + mode: console + grafana_net: + url: https://grafana.net + auth.basic: + enabled: true + disable_login_form: false + + 
dashboardProviders: + dashboardproviders.yaml: + apiVersion: 1 + providers: + - name: "default" + orgId: 1 + folder: "" + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/default + - name: "flux" + orgId: 1 + folder: "flux" + type: file + updateIntervalSeconds: 10 + disableDeletion: false + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards/flux + - name: "loki" + orgId: 1 + folder: "loki" + type: file + updateIntervalSeconds: 10 + disableDeletion: false + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards/loki + - name: "media" + orgId: 1 + folder: "media" + type: file + updateIntervalSeconds: 10 + disableDeletion: false + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards/media + - name: "networking" + orgId: 1 + folder: "networking" + type: file + updateIntervalSeconds: 10 + disableDeletion: false + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards/networking + - name: "storage" + orgId: 1 + folder: "storage" + type: file + updateIntervalSeconds: 10 + disableDeletion: false + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards/storage + - name: "thanos" + orgId: 1 + folder: "thanos" + type: file + updateIntervalSeconds: 10 + disableDeletion: false + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards/thanos + + datasources: + datasources.yaml: + apiVersion: 1 + deleteDatasources: + - name: Loki + orgId: 1 + datasources: + - name: Prometheus + type: prometheus + url: http://thanos-query.monitoring:9090/ + access: proxy + isDefault: true + - name: Loki + type: loki + access: proxy + url: http://loki.monitoring:3100 + + dashboards: + default: + node-exporter-full: + url: https://grafana.com/api/dashboards/1860/revisions/22/download + datasource: Prometheus + zfs: + gnetId: 7845 + revision: 4 + datasource: Prometheus + flux: + flux-cluster: + url: 
https://raw.githubusercontent.com/fluxcd/flux2/main/manifests/monitoring/monitoring-config/dashboards/cluster.json + datasource: Prometheus + flux-control-plane: + url: https://raw.githubusercontent.com/fluxcd/flux2/main/manifests/monitoring/monitoring-config/dashboards/control-plane.json + datasource: Prometheus + flux-logs: + url: https://raw.githubusercontent.com/fluxcd/flux2/main/manifests/monitoring/monitoring-config/dashboards/logs.json + datasource: Loki + + storage: + # Ref: https://grafana.com/grafana/dashboards/2842 + ceph-cluster: + gnetId: 2842 + revision: 14 + datasource: Prometheus + # Ref: https://grafana.com/grafana/dashboards/5336 + ceph-osd: + gnetId: 5336 + revision: 5 + datasource: Prometheus + # Ref: https://grafana.com/grafana/dashboards/5342 + ceph-pools: + gnetId: 5342 + revision: 5 + datasource: Prometheus + # Ref: https://grafana.com/grafana/dashboards/7845 + zfs: + gnetId: 7845 + revision: 4 + datasource: Prometheus + # Ref: https://grafana.com/grafana/dashboards/7107 + netdata: + gnetId: 7107 + revision: 1 + datasource: Prometheus + + media: + radarr: + url: https://raw.githubusercontent.com/k8s-at-home/grafana-dashboards/main/radarr.json + datasource: Prometheus + sonarr: + url: https://raw.githubusercontent.com/k8s-at-home/grafana-dashboards/main/sonarr.json + datasource: Prometheus + + networking: + blackbox: + url: https://raw.githubusercontent.com/jr0dd/grafana-dashboards/main/blackbox.json + datasource: Prometheus + cert-manager: + url: https://raw.githubusercontent.com/monitoring-mixins/website/master/assets/cert-manager/dashboards/cert-manager.json + datasource: Prometheus + # Ref: https://grafana.com/grafana/dashboards/15513 + cilium-agent: + gnetId: 15513 + revision: 1 + datasource: Prometheus + # Ref: https://grafana.com/grafana/dashboards/15514 + cilium-operator: + gnetId: 15514 + revision: 1 + datasource: Prometheus + # Ref: https://grafana.com/grafana/dashboards/15515 + cilium-hubble: + gnetId: 15515 + revision: 1 +
datasource: Prometheus + # Ref: https://grafana.com/grafana/dashboards/13665 + speedtest: + gnetId: 13665 + revision: 4 + datasource: Prometheus + nginx-dashboard: + url: https://raw.githubusercontent.com/kubernetes/ingress-nginx/master/deploy/grafana/dashboards/nginx.json + datasource: Prometheus + + thanos: + bucket-replicate: + url: https://raw.githubusercontent.com/thanos-io/thanos/main/examples/dashboards/bucket-replicate.json + datasource: Prometheus + compact: + url: https://raw.githubusercontent.com/thanos-io/thanos/main/examples/dashboards/compact.json + datasource: Prometheus + overview: + url: https://raw.githubusercontent.com/thanos-io/thanos/main/examples/dashboards/overview.json + datasource: Prometheus + query: + url: https://raw.githubusercontent.com/thanos-io/thanos/main/examples/dashboards/query.json + datasource: Prometheus + query-frontend: + url: https://raw.githubusercontent.com/thanos-io/thanos/main/examples/dashboards/query-frontend.json + datasource: Prometheus + receive: + url: https://raw.githubusercontent.com/thanos-io/thanos/main/examples/dashboards/receive.json + datasource: Prometheus + rule: + url: https://raw.githubusercontent.com/thanos-io/thanos/main/examples/dashboards/rule.json + datasource: Prometheus + sidecar: + url: https://raw.githubusercontent.com/thanos-io/thanos/main/examples/dashboards/sidecar.json + datasource: Prometheus + store: + url: https://raw.githubusercontent.com/thanos-io/thanos/main/examples/dashboards/store.json + datasource: Prometheus + + sidecar: + dashboards: + enabled: true + searchNamespace: ALL + datasources: + enabled: true + searchNamespace: ALL + + imageRenderer: + enabled: true + + plugins: + - natel-discrete-panel + - pr0ps-trackmap-panel + - vonage-status-panel + - grafana-piechart-panel + - grafana-polystat-panel + - grafana-worldmap-panel + - grafana-clock-panel + - grafana-singlestat-panel + - mxswat-separator-panel + - farski-blendstat-panel + - speakyourcode-button-panel + - 
snuids-trafficlights-panel + + serviceMonitor: + enabled: true + + ingress: + enabled: true + annotations: + cert-manager.io/cluster-issuer: "letsencrypt-staging" + ingressClassName: nginx + hosts: + - "grafana.${SECRET_DOMAIN}" + tls: + - secretName: grafana-tls + hosts: + - "grafana.${SECRET_DOMAIN}" + + persistence: + enabled: false + + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: ["grafana"] + topologyKey: kubernetes.io/hostname + + resources: + requests: + cpu: 23m + memory: 110M + limits: + memory: 152M + + podAnnotations: + configmap.reloader.stakater.com/reload: "grafana" diff --git a/k8s/namespaces/clusters/base/monitoring/grafana/kustomization.yaml b/k8s/namespaces/clusters/base/monitoring/grafana/kustomization.yaml new file mode 100644 index 0000000000..966cf96e8f --- /dev/null +++ b/k8s/namespaces/clusters/base/monitoring/grafana/kustomization.yaml @@ -0,0 +1,7 @@ +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - config-pvc.yaml + - helm-release.yaml + - secret.sops.yaml diff --git a/k8s/namespaces/clusters/base/monitoring/grafana/secret.sops.yaml b/k8s/namespaces/clusters/base/monitoring/grafana/secret.sops.yaml new file mode 100644 index 0000000000..1f93b96b36 --- /dev/null +++ b/k8s/namespaces/clusters/base/monitoring/grafana/secret.sops.yaml @@ -0,0 +1,29 @@ +# yamllint disable +apiVersion: v1 +kind: Secret +metadata: + name: grafana-admin + namespace: monitoring +stringData: + admin-user: ENC[AES256_GCM,data:kuidUh1LPnS28yk=,iv:7e25n5CWeLyIhsAG5xMBOIbJQsc9VFz65Ro1AD+ZDcA=,tag:33OHQ7Q5MMr0MClo8DGP3w==,type:str] + admin-password: ENC[AES256_GCM,data:OUc2cGtG9AM++DY=,iv:KJTepyEQsxZVtG6AbyQch/2hXzZVOEGPsCWCXX1oRCg=,tag:EYgaf07OKJUwI+WmMG9rhQ==,type:str] +sops: + kms: [] + gcp_kms: [] + azure_kv: [] + hc_vault: [] + age: + - recipient:
age1eynu35v0tpg9remal6zeecfeg9e84a2qxake027wwgdn02rdfcls7nyv8r + enc: | + -----BEGIN AGE ENCRYPTED FILE----- + YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSA0TENNN3V6MnVxSWlBRmVz + ekNaRjZ3MlJXVk1oZ2pZWExYYVlLM0oveDJnCklpYnVTWUQwZHJHNDR2S3U3MEQ0 + WWg4SGZWK2IrQ1FyeFk5Qmd4clc3L00KLS0tIGlvWTRLMjVjVTFlTlREVm5SRUJG + Z1R3RFV5bER5Q0VBb3RnK0diNXpNcVkK3U21Y5GWvnmPA8hxi8Us7TkNGsCYAvlD + QmY8mT6ApdiczqVo1DgFmKDSMIYNGL2wlyyriu9MLCU8a9tGxmj47g== + -----END AGE ENCRYPTED FILE----- + lastmodified: "2022-10-04T21:04:23Z" + mac: ENC[AES256_GCM,data:ATOlcCk4vD784adyd+nExlhU1zRKeHjiRCD7oTeW0aMvP/0oW4yZhdp+afdTpMojSO3xcgUHekEKfwwVbtrmOsxFBPfvItmvvFS6hloJe5hLJbPx2Wu9z1wQptzap6ViVWvdYy9gmHDYVaGA15CT9wML6UOCsl1pANEMRT4/vCs=,iv:dD8+5XNVd3uMXMRSXtFw8IQKyA4OHBMc++inln1hCLE=,tag:0WI/2A86wPtdfhJZLgzy5A==,type:str] + pgp: [] + encrypted_regex: ((?i)(pass|secret($|[^N])|key|token|^data$|^stringData)) + version: 3.7.3 From 3e4f357341838480fd774ed2a52ad7d2c6a7ab33 Mon Sep 17 00:00:00 2001 From: snoopy82481 Date: Tue, 4 Oct 2022 15:46:14 -0600 Subject: [PATCH 05/18] feat(prometheus): Add Prometheus --- .../kube-prometheus-stack/helm-release.yaml | 223 ++++++++++++++++++ .../kube-prometheus-stack/kustomization.yaml | 5 + 2 files changed, 228 insertions(+) create mode 100644 k8s/namespaces/clusters/base/monitoring/kube-prometheus-stack/helm-release.yaml create mode 100644 k8s/namespaces/clusters/base/monitoring/kube-prometheus-stack/kustomization.yaml diff --git a/k8s/namespaces/clusters/base/monitoring/kube-prometheus-stack/helm-release.yaml b/k8s/namespaces/clusters/base/monitoring/kube-prometheus-stack/helm-release.yaml new file mode 100644 index 0000000000..ef7a1e96a4 --- /dev/null +++ b/k8s/namespaces/clusters/base/monitoring/kube-prometheus-stack/helm-release.yaml @@ -0,0 +1,223 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2beta1 +kind: HelmRelease +metadata: + name: &app kube-prometheus-stack + namespace: monitoring +spec: + interval: 15m + chart: + spec: + chart: kube-prometheus-stack + 
version: 40.3.1 + sourceRef: + kind: HelmRepository + name: prometheus-community + namespace: flux-system + test: + enable: false + install: + remediation: + retries: 5 + upgrade: + remediation: + retries: 5 + remediateLastFailure: true + cleanupOnFail: true + rollback: + timeout: 10m + recreate: true + cleanupOnFail: true + values: + kubeApiServer: + enabled: true + + kubeControllerManager: + enabled: true + endpoints: + - 192.168.20.5 + - 192.168.20.6 + - 192.168.20.7 + + kubeScheduler: + enabled: true + endpoints: + - 192.168.20.5 + - 192.168.20.6 + - 192.168.20.7 + + kubeEtcd: + enabled: true + endpoints: + - 192.168.20.5 + - 192.168.20.6 + - 192.168.20.7 + service: + enabled: true + port: 2381 + targetPort: 2381 + + kubelet: + enabled: true + serviceMonitor: + metricRelabelings: + - action: replace + sourceLabels: + - node + targetLabel: instance + + kubeProxy: + enabled: false + + kubeStateMetrics: + enabled: true + + kube-state-metrics: + metricLabelsAllowlist: + - "persistentvolumeclaims=[*]" + prometheus: + monitor: + enabled: true + relabelings: + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_pod_node_name + targetLabel: kubernetes_node + resources: + requests: + cpu: 15m + memory: 127M + limits: + memory: 153M + + grafana: + enabled: false + forceDeployDashboards: true + sidecar: + dashboards: + multicluster: + etcd: + enabled: true + + nodeExporter: + enabled: true + + prometheus-node-exporter: + resources: + requests: + cpu: 23m + memory: 64M + limits: + memory: 64M + + prometheus: + monitor: + enabled: true + relabelings: + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_pod_node_name + targetLabel: kubernetes_node + + prometheusOperator: + resources: + requests: + cpu: 35m + memory: 273M + limits: + memory: 326M + + prometheusConfigReloader: + resources: + requests: + cpu: 11m + memory: 32M + limits: + memory: 32M + + prometheus: + ingress: + enabled: true + pathType: 
Prefix + ingressClassName: "nginx" + annotations: + # cert-manager.io/cluster-issuer: ${CLUSTER_CERT} + cert-manager.io/cluster-issuer: "letsencrypt-staging" + hosts: + - &host "prometheus.${SECRET_DOMAIN}" + tls: + - secretName: "prometheus-tls" + hosts: + - *host + + thanosService: + enabled: true + + thanosServiceMonitor: + enabled: true + + thanosIngress: + enabled: true + pathType: Prefix + ingressClassName: "nginx" + annotations: + # cert-manager.io/cluster-issuer: ${CLUSTER_CERT} + cert-manager.io/cluster-issuer: "letsencrypt-staging" + nginx.ingress.kubernetes.io/ssl-redirect: "true" + nginx.ingress.kubernetes.io/backend-protocol: "GRPC" + hosts: + - &host "thanos-sidecar.${SECRET_DOMAIN}" + tls: + - secretName: "thanos-sidecar-tls" + hosts: + - *host + + prometheusSpec: + replicas: 3 + replicaExternalLabelName: __replica__ + externalLabels: + cluster: betty + podAntiAffinity: hard + ruleSelectorNilUsesHelmValues: false + serviceMonitorSelectorNilUsesHelmValues: false + podMonitorSelectorNilUsesHelmValues: false + probeSelectorNilUsesHelmValues: false + retentionSize: "6GB" + retention: 2d + enableAdminAPI: true + walCompression: true + storageSpec: + volumeClaimTemplate: + spec: + storageClassName: ceph-block + resources: + requests: + storage: 10Gi + resources: + requests: + cpu: 10m + memory: 500Mi + limits: + memory: 1000Mi + + thanos: + image: quay.io/thanos/thanos:v0.28.0 + version: v0.28.0 + + additionalScrapeConfigs: + - job_name: node-exporter + scrape_interval: 1m + scrape_timeout: 10s + honor_timestamps: true + static_configs: + - targets: + - "kmaster1.${SECRET_DOMAIN}:9100" + - "kmaster2.${SECRET_DOMAIN}:9100" + - "kmaster3.${SECRET_DOMAIN}:9100" + - "kworker1.${SECRET_DOMAIN}:9100" + - "kworker2.${SECRET_DOMAIN}:9100" + - "kworker3.${SECRET_DOMAIN}:9100" diff --git a/k8s/namespaces/clusters/base/monitoring/kube-prometheus-stack/kustomization.yaml b/k8s/namespaces/clusters/base/monitoring/kube-prometheus-stack/kustomization.yaml new file mode 
100644 index 0000000000..2fa2de20ca --- /dev/null +++ b/k8s/namespaces/clusters/base/monitoring/kube-prometheus-stack/kustomization.yaml @@ -0,0 +1,5 @@ +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - helm-release.yaml From f19f72f192faa3fdfbdc84768579efd9c32963ab Mon Sep 17 00:00:00 2001 From: snoopy82481 Date: Tue, 4 Oct 2022 15:46:57 -0600 Subject: [PATCH 06/18] feat(thanos): Add Thanos --- .../base/monitoring/thanos/helm-release.yaml | 69 +++++++++++++++++++ .../base/monitoring/thanos/kustomization.yaml | 5 ++ 2 files changed, 74 insertions(+) create mode 100644 k8s/namespaces/clusters/base/monitoring/thanos/helm-release.yaml create mode 100644 k8s/namespaces/clusters/base/monitoring/thanos/kustomization.yaml diff --git a/k8s/namespaces/clusters/base/monitoring/thanos/helm-release.yaml b/k8s/namespaces/clusters/base/monitoring/thanos/helm-release.yaml new file mode 100644 index 0000000000..1ac73f3e7e --- /dev/null +++ b/k8s/namespaces/clusters/base/monitoring/thanos/helm-release.yaml @@ -0,0 +1,69 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2beta1 +kind: HelmRelease +metadata: + name: thanos + namespace: monitoring +spec: + interval: 15m + chart: + spec: + chart: thanos + version: 11.5.4 + sourceRef: + kind: HelmRepository + name: bitnami + namespace: flux-system + test: + enable: false + install: + remediation: + retries: 5 + upgrade: + remediation: + retries: 5 + remediateLastFailure: true + cleanupOnFail: true + rollback: + timeout: 10m + recreate: true + cleanupOnFail: true + dependsOn: + - name: kube-prometheus-stack + namespace: monitoring + values: + query: + enabled: true + replicaCount: 3 + podAntiAffinityPreset: hard + replicaLabel: + - __replica__ + dnsDiscovery: + sidecarsService: kube-prometheus-stack-thanos-discovery + sidecarsNamespace: monitoring + ingress: + enabled: true + hostname: &host "thanos-query.${SECRET_DOMAIN}" + ingressClassName: "nginx" + annotations: + # cert-manager.io/cluster-issuer: 
${CLUSTER_CERT} + cert-manager.io/cluster-issuer: "letsencrypt-staging" + tls: true + extraTls: + - secretName: "thanos-query-tls" + hosts: + - *host + queryFrontend: + enabled: false + bucketweb: + enabled: false + compactor: + enabled: false + storegateway: + enabled: false + ruler: + enabled: false + metrics: + enabled: true + serviceMonitor: + enabled: true diff --git a/k8s/namespaces/clusters/base/monitoring/thanos/kustomization.yaml b/k8s/namespaces/clusters/base/monitoring/thanos/kustomization.yaml new file mode 100644 index 0000000000..2fa2de20ca --- /dev/null +++ b/k8s/namespaces/clusters/base/monitoring/thanos/kustomization.yaml @@ -0,0 +1,5 @@ +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - helm-release.yaml From 28357cf5633368576dde976d2e5b1aa53786a4cd Mon Sep 17 00:00:00 2001 From: snoopy82481 Date: Wed, 5 Oct 2022 09:30:34 -0600 Subject: [PATCH 07/18] chore(monitoring): :wrench: Configuration updates --- .../base/monitoring/grafana/helm-release.yaml | 33 +--- .../base/monitoring/grafana/patches/env.yaml | 27 ++++ .../monitoring/grafana/patches/postgres.yaml | 31 ++++ .../base/monitoring/grafana/secret.sops.yaml | 33 +++- k8s/namespaces/overlays/base/database.yaml | 46 ++++++ k8s/namespaces/overlays/base/monitoring.yaml | 144 ++++++++++++++++++ 6 files changed, 285 insertions(+), 29 deletions(-) create mode 100644 k8s/namespaces/clusters/base/monitoring/grafana/patches/env.yaml create mode 100644 k8s/namespaces/clusters/base/monitoring/grafana/patches/postgres.yaml create mode 100644 k8s/namespaces/overlays/base/database.yaml create mode 100644 k8s/namespaces/overlays/base/monitoring.yaml diff --git a/k8s/namespaces/clusters/base/monitoring/grafana/helm-release.yaml b/k8s/namespaces/clusters/base/monitoring/grafana/helm-release.yaml index 1702d8cbe9..26bfb82c6e 100644 --- a/k8s/namespaces/clusters/base/monitoring/grafana/helm-release.yaml +++ b/k8s/namespaces/clusters/base/monitoring/grafana/helm-release.yaml @@ 
-33,31 +33,13 @@ spec: deploymentStrategy: type: Recreate - env: - GF_EXPLORE_ENABLED: true - GF_PANELS_DISABLE_SANITIZE_HTML: true - GF_LOG_FILTERS: rendering:debug - GF_DATE_FORMATS_USE_BROWSER_LOCALE: true - GF_DATE_FORMATS_FULL_DATE: "MMM Do, YYYY hh:mm:ss a" - GF_SECURITY_ALLOW_EMBEDDING: true - admin: existingSecret: grafana-admin + envFromSecrets: + - name: grafana + grafana.ini: - server: - root_url: "https://grafana.${SECRET_DOMAIN}" - paths: - data: /var/lib/grafana/data - logs: /var/log/grafana - plugins: /var/lib/grafana/plugins - provisioning: /etc/grafana/provisioning - analytics: - check_for_updates: false - log: - mode: console - grafana_net: - url: https://grafana.net auth.basic: enabled: true disable_login_form: false @@ -138,23 +120,20 @@ spec: datasources: - name: Prometheus type: prometheus - url: http://thanos-query.monitoring:9090/ + url: http://thanos-query.monitoring.svc.cluster.local:9090/ access: proxy isDefault: true - name: Loki type: loki access: proxy - url: http://loki.monitoring:3100 + url: http://loki.monitoring.svc.cluster.local:3100 dashboards: default: node-exporter-full: url: https://grafana.com/api/dashboards/1860/revisions/22/download datasource: Prometheus - zfs: - gnetId: 7845 - revision: 4 - datasource: Prometheus + flux: flux-cluster: url: https://raw.githubusercontent.com/fluxcd/flux2/main/manifests/monitoring/monitoring-config/dashboards/cluster.json diff --git a/k8s/namespaces/clusters/base/monitoring/grafana/patches/env.yaml b/k8s/namespaces/clusters/base/monitoring/grafana/patches/env.yaml new file mode 100644 index 0000000000..77942c380d --- /dev/null +++ b/k8s/namespaces/clusters/base/monitoring/grafana/patches/env.yaml @@ -0,0 +1,27 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2beta1 +kind: HelmRelease +metadata: + name: grafana + namespace: monitoring +spec: + values: + env: + GF_ANALYTICS_CHECK_FOR_UPDATES: false + GF_DATABASE_HOST: postgres-rw.databases.svc.cluster.local:5432 + GF_DATABASE_NAME: grafana +
GF_DATABASE_SSL_MODE: disable + GF_DATABASE_TYPE: postgres + GF_DATE_FORMATS_FULL_DATE: "MMM Do, YYYY hh:mm:ss a" + GF_EXPLORE_ENABLED: true + GF_GRAFANA_NET_URL: https://grafana.net + GF_LOG_FILTERS: rendering:debug + GF_LOG_MODE: console + GF_PANELS_DISABLE_SANITIZE_HTML: true + GF_PATHS_DATA: /var/lib/grafana/data + GF_PATHS_LOGS: /var/log/grafana + GF_PATHS_PLUGINS: /var/lib/grafana/plugins + GF_PATHS_PROVISIONING: /etc/grafana/provisioning + GF_SECURITY_ALLOW_EMBEDDING: true + GF_SECURITY_COOKIE_SAMESITE: lax + GF_SERVER_ROOT_URL: "https://grafana.${SECRET_DOMAIN}" diff --git a/k8s/namespaces/clusters/base/monitoring/grafana/patches/postgres.yaml b/k8s/namespaces/clusters/base/monitoring/grafana/patches/postgres.yaml new file mode 100644 index 0000000000..ad3cd31ce5 --- /dev/null +++ b/k8s/namespaces/clusters/base/monitoring/grafana/patches/postgres.yaml @@ -0,0 +1,31 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2beta1 +kind: HelmRelease +metadata: + name: grafana + namespace: monitoring +spec: + values: + extraInitContainers: + - name: init-db + image: ghcr.io/onedr0p/postgres-initdb:14.5 + env: + - name: POSTGRES_HOST + value: postgres-rw.databases.svc.cluster.local + - name: POSTGRES_DB + value: grafana + - name: POSTGRES_SUPER_PASS + valueFrom: + secretKeyRef: + name: postgres-superuser + key: password + - name: POSTGRES_USER + valueFrom: + secretKeyRef: + name: grafana + key: GF_DATABASE_USER + - name: POSTGRES_PASS + valueFrom: + secretKeyRef: + name: grafana + key: GF_DATABASE_PASSWORD diff --git a/k8s/namespaces/clusters/base/monitoring/grafana/secret.sops.yaml b/k8s/namespaces/clusters/base/monitoring/grafana/secret.sops.yaml index 1f93b96b36..609bc6e20e 100644 --- a/k8s/namespaces/clusters/base/monitoring/grafana/secret.sops.yaml +++ b/k8s/namespaces/clusters/base/monitoring/grafana/secret.sops.yaml @@ -22,8 +22,37 @@ sops: Z1R3RFV5bER5Q0VBb3RnK0diNXpNcVkK3U21Y5GWvnmPA8hxi8Us7TkNGsCYAvlD QmY8mT6ApdiczqVo1DgFmKDSMIYNGL2wlyyriu9MLCU8a9tGxmj47g==
-----END AGE ENCRYPTED FILE----- - lastmodified: "2022-10-04T21:04:23Z" - mac: ENC[AES256_GCM,data:ATOlcCk4vD784adyd+nExlhU1zRKeHjiRCD7oTeW0aMvP/0oW4yZhdp+afdTpMojSO3xcgUHekEKfwwVbtrmOsxFBPfvItmvvFS6hloJe5hLJbPx2Wu9z1wQptzap6ViVWvdYy9gmHDYVaGA15CT9wML6UOCsl1pANEMRT4/vCs=,iv:dD8+5XNVd3uMXMRSXtFw8IQKyA4OHBMc++inln1hCLE=,tag:0WI/2A86wPtdfhJZLgzy5A==,type:str] + lastmodified: "2022-10-05T15:05:14Z" + mac: ENC[AES256_GCM,data:KBjuXHhmBJi3kQZiu5xAagXBTaMeuyUlv1sTwcaWb9IzE/4qTssuyF3sA31Jokjy8d6uEqbTrpb9eoPmDkH30mX9M3E7uorTPyVu3fEhtMCsn/yWDVNZeX9JicwFxdEoU127ox5qFSfn0u7UYxl0b58LU1YT9kH/ea2MKd4bV/k=,iv:LRiCgC/aA1BUtIev6eYdkPpzruAURuHNO5aoB8UGNbM=,tag:d2QEUNoORWoLkgWt0PXIfA==,type:str] + pgp: [] + encrypted_regex: ((?i)(pass|secret($|[^N])|key|token|^data$|^stringData)) + version: 3.7.3 +--- +apiVersion: v1 +kind: Secret +metadata: + name: grafana + namespace: monitoring +stringData: + GF_DATABASE_USER: ENC[AES256_GCM,data:66EKHH1SNTbhQOA=,iv:ISp1bxw+62g/+kg6iYOM2NrU5youD7WYuJ/WE3l3Feg=,tag:N2yqF47NKOKtWxj21fODqg==,type:str] + GF_DATABASE_PASSWORD: ENC[AES256_GCM,data:/1QtvaJrJywR0g4=,iv:UKwhiLq05kATf6QJgapk4NrJ/WGRAFj9FpERTfe1Cdk=,tag:COKoQ86J71gladhxZEChlA==,type:str] +sops: + kms: [] + gcp_kms: [] + azure_kv: [] + hc_vault: [] + age: + - recipient: age1eynu35v0tpg9remal6zeecfeg9e84a2qxake027wwgdn02rdfcls7nyv8r + enc: | + -----BEGIN AGE ENCRYPTED FILE----- + YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSA0TENNN3V6MnVxSWlBRmVz + ekNaRjZ3MlJXVk1oZ2pZWExYYVlLM0oveDJnCklpYnVTWUQwZHJHNDR2S3U3MEQ0 + WWg4SGZWK2IrQ1FyeFk5Qmd4clc3L00KLS0tIGlvWTRLMjVjVTFlTlREVm5SRUJG + Z1R3RFV5bER5Q0VBb3RnK0diNXpNcVkK3U21Y5GWvnmPA8hxi8Us7TkNGsCYAvlD + QmY8mT6ApdiczqVo1DgFmKDSMIYNGL2wlyyriu9MLCU8a9tGxmj47g== + -----END AGE ENCRYPTED FILE----- + lastmodified: "2022-10-05T15:05:14Z" + mac: 
ENC[AES256_GCM,data:KBjuXHhmBJi3kQZiu5xAagXBTaMeuyUlv1sTwcaWb9IzE/4qTssuyF3sA31Jokjy8d6uEqbTrpb9eoPmDkH30mX9M3E7uorTPyVu3fEhtMCsn/yWDVNZeX9JicwFxdEoU127ox5qFSfn0u7UYxl0b58LU1YT9kH/ea2MKd4bV/k=,iv:LRiCgC/aA1BUtIev6eYdkPpzruAURuHNO5aoB8UGNbM=,tag:d2QEUNoORWoLkgWt0PXIfA==,type:str] pgp: [] encrypted_regex: ((?i)(pass|secret($|[^N])|key|token|^data$|^stringData)) version: 3.7.3 diff --git a/k8s/namespaces/overlays/base/database.yaml b/k8s/namespaces/overlays/base/database.yaml new file mode 100644 index 0000000000..025319bd93 --- /dev/null +++ b/k8s/namespaces/overlays/base/database.yaml @@ -0,0 +1,46 @@ +--- +apiVersion: kustomize.toolkit.fluxcd.io/v1beta2 +kind: Kustomization +metadata: + name: database-namespace + namespace: flux-system +spec: + interval: 5m + path: "./k8s/namespaces/clusters/base/databases" + prune: true + wait: true + sourceRef: + kind: GitRepository + name: home-cluster +--- +apiVersion: kustomize.toolkit.fluxcd.io/v1beta2 +kind: Kustomization +metadata: + name: networking-origin-ca-issuer + namespace: flux-system +spec: + interval: 5m + path: "./k8s/namespaces/clusters/base/databases/postgres" + prune: true + wait: true + sourceRef: + kind: GitRepository + name: home-cluster + decryption: + provider: sops + secretRef: + name: sops-age + dependsOn: + - name: networking-cert-manager + namespace: flux-system + - name: database-namespace + namespace: flux-system + postBuild: + substitute: {} + substituteFrom: + - kind: ConfigMap + name: cluster-config + - kind: ConfigMap + name: global-config + - kind: Secret + name: cluster-secrets diff --git a/k8s/namespaces/overlays/base/monitoring.yaml b/k8s/namespaces/overlays/base/monitoring.yaml new file mode 100644 index 0000000000..2e032c98aa --- /dev/null +++ b/k8s/namespaces/overlays/base/monitoring.yaml @@ -0,0 +1,144 @@ +--- +apiVersion: kustomize.toolkit.fluxcd.io/v1beta2 +kind: Kustomization +metadata: + name: monitoring-namespace + namespace: flux-system +spec: + interval: 5m + path: 
"./k8s/namespaces/clusters/base/monitoring" + prune: true + wait: true + sourceRef: + kind: GitRepository + name: home-cluster +--- +apiVersion: kustomize.toolkit.fluxcd.io/v1beta2 +kind: Kustomization +metadata: + name: monitoring-kube-prometheus-stack + namespace: flux-system +spec: + interval: 5m + path: "./k8s/namespaces/clusters/base/monitoring/kube-prometheus-stack" + prune: true + wait: true + sourceRef: + kind: GitRepository + name: home-cluster + decryption: + provider: sops + secretRef: + name: sops-age + dependsOn: + - name: networking-cert-manager + namespace: flux-system + - name: monitoring-namespace + namespace: flux-system + postBuild: + substitute: {} + substituteFrom: + - kind: ConfigMap + name: cluster-config + - kind: ConfigMap + name: global-config + - kind: Secret + name: cluster-secrets +--- +apiVersion: kustomize.toolkit.fluxcd.io/v1beta2 +kind: Kustomization +metadata: + name: monitoring-grafana + namespace: flux-system +spec: + interval: 5m + path: "./k8s/namespaces/clusters/base/monitoring/grafana" + prune: true + wait: true + sourceRef: + kind: GitRepository + name: home-cluster + decryption: + provider: sops + secretRef: + name: sops-age + dependsOn: + - name: networking-cert-manager + namespace: flux-system + - name: monitoring-namespace + namespace: flux-system + postBuild: + substitute: {} + substituteFrom: + - kind: ConfigMap + name: cluster-config + - kind: ConfigMap + name: global-config + - kind: Secret + name: cluster-secrets +--- +apiVersion: kustomize.toolkit.fluxcd.io/v1beta2 +kind: Kustomization +metadata: + name: monitoring-grafana-patches + namespace: flux-system +spec: + interval: 5m + path: "./k8s/namespaces/clusters/base/monitoring/grafana/patches" + prune: true + wait: true + sourceRef: + kind: GitRepository + name: home-cluster + decryption: + provider: sops + secretRef: + name: sops-age + dependsOn: + - name: networking-cert-manager + namespace: flux-system + - name: monitoring-namespace + namespace: flux-system + - 
name: monitoring-grafana + namespace: flux-system + postBuild: + substitute: {} + substituteFrom: + - kind: ConfigMap + name: cluster-config + - kind: ConfigMap + name: global-config + - kind: Secret + name: cluster-secrets +--- +apiVersion: kustomize.toolkit.fluxcd.io/v1beta2 +kind: Kustomization +metadata: + name: monitoring-thanos + namespace: flux-system +spec: + interval: 5m + path: "./k8s/namespaces/clusters/base/monitoring/thanos" + prune: true + wait: true + sourceRef: + kind: GitRepository + name: home-cluster + decryption: + provider: sops + secretRef: + name: sops-age + dependsOn: + - name: networking-cert-manager + namespace: flux-system + - name: monitoring-namespace + namespace: flux-system + postBuild: + substitute: {} + substituteFrom: + - kind: ConfigMap + name: cluster-config + - kind: ConfigMap + name: global-config + - kind: Secret + name: cluster-secrets From 02dda620d9fb48b3d6383e68dda29a8c33c1f36a Mon Sep 17 00:00:00 2001 From: snoopy82481 Date: Wed, 5 Oct 2022 10:19:43 -0600 Subject: [PATCH 08/18] chore(monitoring): :wrench: Change config --- .../monitoring/kube-prometheus-stack/helm-release.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/k8s/namespaces/clusters/base/monitoring/kube-prometheus-stack/helm-release.yaml b/k8s/namespaces/clusters/base/monitoring/kube-prometheus-stack/helm-release.yaml index ef7a1e96a4..50bb3bc001 100644 --- a/k8s/namespaces/clusters/base/monitoring/kube-prometheus-stack/helm-release.yaml +++ b/k8s/namespaces/clusters/base/monitoring/kube-prometheus-stack/helm-release.yaml @@ -186,17 +186,17 @@ spec: serviceMonitorSelectorNilUsesHelmValues: false podMonitorSelectorNilUsesHelmValues: false probeSelectorNilUsesHelmValues: false - retentionSize: "6GB" - retention: 2d + retention: 14d + retentionSize: 45GB enableAdminAPI: true walCompression: true storageSpec: volumeClaimTemplate: spec: - storageClassName: ceph-block + storageClassName: "ceph-block" resources: requests: - storage: 10Gi 
+ storage: 50Gi resources: requests: cpu: 10m From e4aa8d40771e00ef24097f08c2cac38e2ee1e2a2 Mon Sep 17 00:00:00 2001 From: snoopy82481 Date: Wed, 5 Oct 2022 11:56:23 -0600 Subject: [PATCH 09/18] feat(rook-ceph): :memo: Add monitoring support --- .../clusters/base/rook-ceph/rook-cluster/helm-release.yaml | 3 ++- .../clusters/base/rook-ceph/rook-operator/helm-release.yaml | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/k8s/namespaces/clusters/base/rook-ceph/rook-cluster/helm-release.yaml b/k8s/namespaces/clusters/base/rook-ceph/rook-cluster/helm-release.yaml index e860e53610..d94cb2bf8e 100644 --- a/k8s/namespaces/clusters/base/rook-ceph/rook-cluster/helm-release.yaml +++ b/k8s/namespaces/clusters/base/rook-ceph/rook-cluster/helm-release.yaml @@ -29,7 +29,8 @@ spec: cleanupOnFail: true values: monitoring: - enabled: false + enabled: true + createPrometheusRules: true ingress: dashboard: ingressClassName: "nginx" diff --git a/k8s/namespaces/clusters/base/rook-ceph/rook-operator/helm-release.yaml b/k8s/namespaces/clusters/base/rook-ceph/rook-operator/helm-release.yaml index d16fe52a7f..2cd42a6042 100644 --- a/k8s/namespaces/clusters/base/rook-ceph/rook-operator/helm-release.yaml +++ b/k8s/namespaces/clusters/base/rook-ceph/rook-operator/helm-release.yaml @@ -30,6 +30,8 @@ spec: values: crds: enabled: false + monitoring: + enabled: true resources: requests: cpu: 10m From 25512b95149faa75b9c5f0ca2ede3bc366b0d96f Mon Sep 17 00:00:00 2001 From: snoopy82481 Date: Wed, 5 Oct 2022 12:41:40 -0600 Subject: [PATCH 10/18] feat(monitoring): :hammer: Add loki --- .../base/monitoring/loki/helm-release.yaml | 200 ++++++++++++++++++ .../base/monitoring/loki/kustomization.yaml | 5 + 2 files changed, 205 insertions(+) create mode 100644 k8s/namespaces/clusters/base/monitoring/loki/helm-release.yaml create mode 100644 k8s/namespaces/clusters/base/monitoring/loki/kustomization.yaml diff --git a/k8s/namespaces/clusters/base/monitoring/loki/helm-release.yaml 
b/k8s/namespaces/clusters/base/monitoring/loki/helm-release.yaml new file mode 100644 index 0000000000..ff6c6b80db --- /dev/null +++ b/k8s/namespaces/clusters/base/monitoring/loki/helm-release.yaml @@ -0,0 +1,200 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2beta1 +kind: HelmRelease +metadata: + name: &app loki + namespace: monitoring +spec: + interval: 5m + chart: + spec: + chart: loki + version: 3.2.1 + sourceRef: + kind: HelmRepository + name: grafana + namespace: flux-system + interval: 5m + test: + enable: false + install: + remediation: + retries: 5 + upgrade: + remediation: + retries: 5 + remediateLastFailure: true + cleanupOnFail: true + rollback: + timeout: 10m + recreate: true + cleanupOnFail: true + dependsOn: + - name: kube-prometheus-stack + namespace: monitoring + - name: ingress-nginx + namespace: networking + # https://github.com/grafana/loki/blob/main/production/helm/loki/values.yaml + values: + loki: + structuredConfig: + auth_enabled: false + server: + log_level: info + http_listen_port: 3100 + grpc_listen_port: 9095 + + memberlist: + join_members: ["loki-memberlist"] + + limits_config: + retention_period: 14d + enforce_metric_name: false + reject_old_samples: true + reject_old_samples_max_age: 168h + max_cache_freshness_per_query: 10m + split_queries_by_interval: 15m + ingestion_rate_mb: 8 + ingestion_burst_size_mb: 16 + + schema_config: + configs: + - from: "2021-08-01" + store: boltdb-shipper + object_store: s3 + schema: v11 + index: + prefix: loki_index_ + period: 24h + common: + path_prefix: /var/loki + replication_factor: 3 + storage: + s3: + s3: null + insecure: true + s3forcepathstyle: true + ring: + kvstore: + store: memberlist + ruler: + enable_api: true + enable_alertmanager_v2: true + alertmanager_url: http://kube-prometheus-stack-alertmanager:9093 + storage: + type: local + local: + directory: /rules + rule_path: /tmp/scratch + ring: + kvstore: + store: memberlist + distributor: + ring: + kvstore: + store: memberlist + compactor: + 
working_directory: /var/loki/boltdb-shipper-compactor + shared_store: s3 + compaction_interval: 10m + retention_enabled: true + retention_delete_delay: 2h + retention_delete_worker_count: 150 + ingester: + max_chunk_age: 1h + lifecycler: + ring: + kvstore: + store: memberlist + analytics: + reporting_enabled: false + gateway: + enabled: true + replicas: 3 + affinity: | + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + podAffinityTerm: + labelSelector: + matchLabels: + {{- include "loki.gatewaySelectorLabels" . | nindent 12 }} + topologyKey: kubernetes.io/hostname + ingress: + enabled: true + ingressClassName: "nginx" + hosts: + - host: &host "loki.${EXTERNAL_DOMAIN}" + paths: + - path: / + pathType: Prefix + tls: + - hosts: + - *host + write: + replicas: 3 + affinity: | + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + podAffinityTerm: + labelSelector: + matchLabels: + {{- include "loki.writeSelectorLabels" . | nindent 12 }} + topologyKey: kubernetes.io/hostname + persistence: + size: 10Gi + storageClass: ceph-block + + read: + replicas: 3 + affinity: | + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + podAffinityTerm: + labelSelector: + matchLabels: + {{- include "loki.readSelectorLabels" . 
| nindent 12 }} + topologyKey: kubernetes.io/hostname + extraVolumeMounts: + - name: loki-rules + mountPath: /rules/fake + - name: loki-rules-tmp + mountPath: /tmp/scratch + - name: loki-tmp + mountPath: /tmp/loki-tmp + extraVolumes: + - name: loki-rules + configMap: + name: loki-alerting-rules + - name: loki-rules-tmp + emptyDir: {} + - name: loki-tmp + emptyDir: {} + persistence: + size: 10Gi + storageClass: ceph-block + monitoring: + selfMonitoring: + enabled: false + grafanaAgent: + installOperator: false + + valuesFrom: + - kind: Secret + name: loki-secret + valuesKey: S3_BUCKET_NAME + targetPath: loki.structuredConfig.common.storage.s3.bucketnames + - kind: Secret + name: loki-secret + valuesKey: S3_BUCKET_HOST + targetPath: loki.structuredConfig.common.storage.s3.endpoint + - kind: Secret + name: loki-secret + valuesKey: S3_ACCESS_KEY + targetPath: loki.structuredConfig.common.storage.s3.access_key_id + - kind: Secret + name: loki-secret + valuesKey: S3_SECRET_KEY + targetPath: loki.structuredConfig.common.storage.s3.secret_access_key diff --git a/k8s/namespaces/clusters/base/monitoring/loki/kustomization.yaml b/k8s/namespaces/clusters/base/monitoring/loki/kustomization.yaml new file mode 100644 index 0000000000..2fa2de20ca --- /dev/null +++ b/k8s/namespaces/clusters/base/monitoring/loki/kustomization.yaml @@ -0,0 +1,5 @@ +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - helm-release.yaml From 602db5d882a382d12cbd8e7cd7dd76fd9bcb7ef1 Mon Sep 17 00:00:00 2001 From: snoopy82481 Date: Wed, 5 Oct 2022 14:37:10 -0600 Subject: [PATCH 11/18] chore(monitoring): Update deployment --- .../base/monitoring/loki/helm-release.yaml | 73 ++----------------- 1 file changed, 5 insertions(+), 68 deletions(-) diff --git a/k8s/namespaces/clusters/base/monitoring/loki/helm-release.yaml b/k8s/namespaces/clusters/base/monitoring/loki/helm-release.yaml index ff6c6b80db..d0d4b624ec 100644 --- 
a/k8s/namespaces/clusters/base/monitoring/loki/helm-release.yaml +++ b/k8s/namespaces/clusters/base/monitoring/loki/helm-release.yaml @@ -5,36 +5,22 @@ metadata: name: &app loki namespace: monitoring spec: - interval: 5m + interval: 15m chart: spec: chart: loki - version: 3.2.1 + version: 3.2.0 sourceRef: kind: HelmRepository - name: grafana + name: grafana-charts namespace: flux-system - interval: 5m - test: - enable: false install: + createNamespace: true remediation: retries: 5 upgrade: remediation: retries: 5 - remediateLastFailure: true - cleanupOnFail: true - rollback: - timeout: 10m - recreate: true - cleanupOnFail: true - dependsOn: - - name: kube-prometheus-stack - namespace: monitoring - - name: ingress-nginx - namespace: networking - # https://github.com/grafana/loki/blob/main/production/helm/loki/values.yaml values: loki: structuredConfig: @@ -43,10 +29,8 @@ spec: log_level: info http_listen_port: 3100 grpc_listen_port: 9095 - memberlist: join_members: ["loki-memberlist"] - limits_config: retention_period: 14d enforce_metric_name: false @@ -56,7 +40,6 @@ spec: split_queries_by_interval: 15m ingestion_rate_mb: 8 ingestion_burst_size_mb: 16 - schema_config: configs: - from: "2021-08-01" @@ -111,20 +94,11 @@ spec: gateway: enabled: true replicas: 3 - affinity: | - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 1 - podAffinityTerm: - labelSelector: - matchLabels: - {{- include "loki.gatewaySelectorLabels" . | nindent 12 }} - topologyKey: kubernetes.io/hostname ingress: enabled: true ingressClassName: "nginx" hosts: - - host: &host "loki.${EXTERNAL_DOMAIN}" + - host: &host "loki.${SECRET_DOMAIN}" paths: - path: / pathType: Prefix @@ -133,30 +107,11 @@ spec: - *host write: replicas: 3 - affinity: | - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 1 - podAffinityTerm: - labelSelector: - matchLabels: - {{- include "loki.writeSelectorLabels" . 
| nindent 12 }} - topologyKey: kubernetes.io/hostname persistence: size: 10Gi storageClass: ceph-block - read: replicas: 3 - affinity: | - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 1 - podAffinityTerm: - labelSelector: - matchLabels: - {{- include "loki.readSelectorLabels" . | nindent 12 }} - topologyKey: kubernetes.io/hostname extraVolumeMounts: - name: loki-rules mountPath: /rules/fake @@ -180,21 +135,3 @@ spec: enabled: false grafanaAgent: installOperator: false - - valuesFrom: - - kind: Secret - name: loki-secret - valuesKey: S3_BUCKET_NAME - targetPath: loki.structuredConfig.common.storage.s3.bucketnames - - kind: Secret - name: loki-secret - valuesKey: S3_BUCKET_HOST - targetPath: loki.structuredConfig.common.storage.s3.endpoint - - kind: Secret - name: loki-secret - valuesKey: S3_ACCESS_KEY - targetPath: loki.structuredConfig.common.storage.s3.access_key_id - - kind: Secret - name: loki-secret - valuesKey: S3_SECRET_KEY - targetPath: loki.structuredConfig.common.storage.s3.secret_access_key From b42f15aab5b19170f40bdc32807ceb8510f17be9 Mon Sep 17 00:00:00 2001 From: snoopy82481 Date: Wed, 5 Oct 2022 14:44:03 -0600 Subject: [PATCH 12/18] chore(monitoring): --- .../base/monitoring/loki/config-map.yaml | 46 +++++++++++++++++++ .../base/monitoring/loki/helm-release.yaml | 6 ++- .../base/monitoring/loki/kustomization.yaml | 1 + 3 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 k8s/namespaces/clusters/base/monitoring/loki/config-map.yaml diff --git a/k8s/namespaces/clusters/base/monitoring/loki/config-map.yaml b/k8s/namespaces/clusters/base/monitoring/loki/config-map.yaml new file mode 100644 index 0000000000..4dfcc1f5c2 --- /dev/null +++ b/k8s/namespaces/clusters/base/monitoring/loki/config-map.yaml @@ -0,0 +1,46 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: loki-alerting-rules + namespace: monitoring +data: + loki-alerting-rules.yaml: |- + groups: + # + # SMART Failures + # + - name: 
smart-failure + rules: + - alert: SmartFailures + expr: | + sum by (hostname) (count_over_time({hostname=~".+"} | json | _SYSTEMD_UNIT = "smartmontools.service" !~ "(?i)previous self-test completed without error" !~ "(?i)Prefailure" |~ "(?i)(error|fail)"[2m])) > 0 + for: 2m + labels: + severity: critical + category: logs + annotations: + summary: "SMART has reported failures on host {{ $labels.hostname }}" + # + # *arr + # + - name: arr + rules: + - alert: ArrDatabaseIsLocked + expr: | + sum by (app) (count_over_time({app=~".*arr"} |~ "(?i)database is locked"[2m])) > 0 + for: 2m + labels: + severity: critical + category: logs + annotations: + summary: "{{ $labels.app }} is experiencing locked database issues" + - alert: ArrDatabaseIsMalformed + expr: | + sum by (app) (count_over_time({app=~".*arr"} |~ "(?i)database disk image is malformed"[2m])) > 0 + for: 2m + labels: + severity: critical + category: logs + annotations: + summary: "{{ $labels.app }} is experiencing malformed database disk image issues" diff --git a/k8s/namespaces/clusters/base/monitoring/loki/helm-release.yaml b/k8s/namespaces/clusters/base/monitoring/loki/helm-release.yaml index d0d4b624ec..904cc7c872 100644 --- a/k8s/namespaces/clusters/base/monitoring/loki/helm-release.yaml +++ b/k8s/namespaces/clusters/base/monitoring/loki/helm-release.yaml @@ -97,13 +97,17 @@ spec: ingress: enabled: true ingressClassName: "nginx" + annotations: + # cert-manager.io/cluster-issuer: ${CLUSTER_CERT} + cert-manager.io/cluster-issuer: "letsencrypt-staging" hosts: - host: &host "loki.${SECRET_DOMAIN}" paths: - path: / pathType: Prefix tls: - - hosts: + - secretName: "loki-tls" + hosts: - *host write: replicas: 3 diff --git a/k8s/namespaces/clusters/base/monitoring/loki/kustomization.yaml b/k8s/namespaces/clusters/base/monitoring/loki/kustomization.yaml index 2fa2de20ca..c2b538cdda 100644 --- a/k8s/namespaces/clusters/base/monitoring/loki/kustomization.yaml +++ 
b/k8s/namespaces/clusters/base/monitoring/loki/kustomization.yaml @@ -2,4 +2,5 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: + - config-map.yaml - helm-release.yaml From b6c634af51e6ff7aa7651562d71a68b7c4997b37 Mon Sep 17 00:00:00 2001 From: snoopy82481 Date: Wed, 5 Oct 2022 16:01:48 -0600 Subject: [PATCH 13/18] feat(monitoring): :hammer: Add vector --- .../flux/repositories/helm/kustomization.yaml | 1 + .../flux/repositories/helm/vector-charts.yaml | 9 + .../base/monitoring/loki/helm-release.yaml | 9 +- .../monitoring/vector/agent/helm-release.yaml | 73 +++++++ .../vector/agent/kustomization.yaml | 5 + .../vector/aggregator/filterlog-regex.txt | 20 ++ .../vector/aggregator/helm-release.yaml | 186 ++++++++++++++++++ .../vector/aggregator/kustomization.yaml | 5 + .../vector/geoipupdate/config-pvc.yaml | 15 ++ .../vector/geoipupdate/cron-job.yaml | 46 +++++ .../vector/geoipupdate/kustomization.yaml | 5 + .../vector/geoipupdate/secrets.sops.yaml | 29 +++ k8s/namespaces/overlays/base/monitoring.yaml | 96 +++++++++ 13 files changed, 498 insertions(+), 1 deletion(-) create mode 100644 k8s/global/flux/repositories/helm/vector-charts.yaml create mode 100644 k8s/namespaces/clusters/base/monitoring/vector/agent/helm-release.yaml create mode 100644 k8s/namespaces/clusters/base/monitoring/vector/agent/kustomization.yaml create mode 100644 k8s/namespaces/clusters/base/monitoring/vector/aggregator/filterlog-regex.txt create mode 100644 k8s/namespaces/clusters/base/monitoring/vector/aggregator/helm-release.yaml create mode 100644 k8s/namespaces/clusters/base/monitoring/vector/aggregator/kustomization.yaml create mode 100644 k8s/namespaces/clusters/base/monitoring/vector/geoipupdate/config-pvc.yaml create mode 100644 k8s/namespaces/clusters/base/monitoring/vector/geoipupdate/cron-job.yaml create mode 100644 k8s/namespaces/clusters/base/monitoring/vector/geoipupdate/kustomization.yaml create mode 100644 
k8s/namespaces/clusters/base/monitoring/vector/geoipupdate/secrets.sops.yaml diff --git a/k8s/global/flux/repositories/helm/kustomization.yaml b/k8s/global/flux/repositories/helm/kustomization.yaml index fbb10f4df9..0aad3ccc81 100644 --- a/k8s/global/flux/repositories/helm/kustomization.yaml +++ b/k8s/global/flux/repositories/helm/kustomization.yaml @@ -16,3 +16,4 @@ resources: - bitnami-charts.yaml - kubereboot-charts.yaml - grafana-charts.yaml + - vector-charts.yaml diff --git a/k8s/global/flux/repositories/helm/vector-charts.yaml b/k8s/global/flux/repositories/helm/vector-charts.yaml new file mode 100644 index 0000000000..e5090d6b23 --- /dev/null +++ b/k8s/global/flux/repositories/helm/vector-charts.yaml @@ -0,0 +1,9 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1beta2 +kind: HelmRepository +metadata: + name: vector + namespace: flux-system +spec: + interval: 1h + url: https://helm.vector.dev diff --git a/k8s/namespaces/clusters/base/monitoring/loki/helm-release.yaml b/k8s/namespaces/clusters/base/monitoring/loki/helm-release.yaml index 904cc7c872..eb37297074 100644 --- a/k8s/namespaces/clusters/base/monitoring/loki/helm-release.yaml +++ b/k8s/namespaces/clusters/base/monitoring/loki/helm-release.yaml @@ -14,13 +14,20 @@ spec: kind: HelmRepository name: grafana-charts namespace: flux-system + test: + enable: false install: - createNamespace: true remediation: retries: 5 upgrade: remediation: retries: 5 + remediateLastFailure: true + cleanupOnFail: true + rollback: + timeout: 10m + recreate: true + cleanupOnFail: true values: loki: structuredConfig: diff --git a/k8s/namespaces/clusters/base/monitoring/vector/agent/helm-release.yaml b/k8s/namespaces/clusters/base/monitoring/vector/agent/helm-release.yaml new file mode 100644 index 0000000000..dbd0a391d8 --- /dev/null +++ b/k8s/namespaces/clusters/base/monitoring/vector/agent/helm-release.yaml @@ -0,0 +1,73 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2beta1 +kind: HelmRelease +metadata: + name: vector-agent + 
namespace: monitoring +spec: + interval: 15m + chart: + spec: + chart: vector + version: 0.16.0 + sourceRef: + kind: HelmRepository + name: vector + namespace: flux-system + test: + enable: false + install: + remediation: + retries: 5 + upgrade: + remediation: + retries: 5 + remediateLastFailure: true + cleanupOnFail: true + rollback: + timeout: 10m + recreate: true + cleanupOnFail: true + dependsOn: + - name: loki + namespace: monitoring + - name: vector-aggregator + namespace: monitoring + values: + image: + repository: timberio/vector + tag: 0.24.1-debian + role: Agent + customConfig: + data_dir: /vector-data-dir + api: + enabled: false + sources: + journal_logs: + type: journald + journal_directory: /var/log/journal + kubernetes_logs: + type: kubernetes_logs + pod_annotation_fields: + container_image: container_image + container_name: container_name + pod_annotations: pod_annotations + pod_labels: pod_labels + pod_name: pod_name + sinks: + loki_journal_sink: + type: vector + inputs: + - journal_logs + address: vector-aggregator:6000 + version: "2" + loki_kubernetes_sink: + type: vector + inputs: + - kubernetes_logs + address: vector-aggregator:6010 + version: "2" + service: + enabled: false + securityContext: + privileged: true diff --git a/k8s/namespaces/clusters/base/monitoring/vector/agent/kustomization.yaml b/k8s/namespaces/clusters/base/monitoring/vector/agent/kustomization.yaml new file mode 100644 index 0000000000..2fa2de20ca --- /dev/null +++ b/k8s/namespaces/clusters/base/monitoring/vector/agent/kustomization.yaml @@ -0,0 +1,5 @@ +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - helm-release.yaml diff --git a/k8s/namespaces/clusters/base/monitoring/vector/aggregator/filterlog-regex.txt b/k8s/namespaces/clusters/base/monitoring/vector/aggregator/filterlog-regex.txt new file mode 100644 index 0000000000..59d572d034 --- /dev/null +++ b/k8s/namespaces/clusters/base/monitoring/vector/aggregator/filterlog-regex.txt @@ 
-0,0 +1,20 @@ +# +# IPv4: TCP +# Regex: ^(?P(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?Ptcp),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*))$ +# Example: 94,,,ef794793b2e3764b938bd04cba88e8a3,igb0,match,pass,out,4,0x0,,62,16800,0,DF,6,tcp,60,xxx.xxx.xxx.xxx,xxx.xxx.xxx.xxx,11715,443,0,S,3876953207,,64240,,mss;sackOK;TS;nop;wscale +# +# IPv6: TCP +# Regex: ? +# Example: ? +# +# IPv4 / IPv6: UDP +# Regex: ^(?P(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?Pudp),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*))$ +# Example: 90,,,91e2443ae2e8caf012f9a6e5a8a455c8,lo0,match,pass,in,4,0x4,,255,4660,0,none,17,udp,914,xxx.xxx.xxx.xxx,xxx.xxx.xxx.xxx,5353,5353,894 +# Example: 15,,,91515c100a3692cb94121964974ce513,igb1_vlan150,match,block,in,6,0x00,0x00000,255,udp,17,391,xxxx::xxxx:xxxx:xxxx:xxxx,xxxx::xx,5353,5353,391 +# +# IPv4: ICMP / IGMP / GRE +# Regex: ^(?P(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?Picmp|igmp|gre),(?P[^,]*),(?P[^,]*),(?P[^,]*),(?P[^,]*))$ +# Example: 94,,,ef794793b2e3764b938bd04cba88e8a3,igb0,match,pass,out,4,0x0,,63,44871,0,DF,1,icmp,84,xxx.xxx.xxx.xxx,xxx.xxx.xxx.xxx,datalength=64 +# Example: 16,,,02f4bab031b57d1e30553ce08e0ec131,igb1_vlan150,match,block,in,4,0xc0,,1,15472,0,none,2,igmp,32,xxx.xxx.xxx.xxx,xxx.xxx.xxx.xxx,datalength=8 +# Example: 16,,,02f4bab031b57d1e30553ce08e0ec131,igb0,match,block,in,4,0x0,,57,20354,0,DF,47,gre,564,xxx.xxx.xxx.xxx,xxx.xxx.xxx.xxx,datalength=544 +# diff --git a/k8s/namespaces/clusters/base/monitoring/vector/aggregator/helm-release.yaml 
b/k8s/namespaces/clusters/base/monitoring/vector/aggregator/helm-release.yaml new file mode 100644 index 0000000000..43e9638539 --- /dev/null +++ b/k8s/namespaces/clusters/base/monitoring/vector/aggregator/helm-release.yaml @@ -0,0 +1,186 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2beta1 +kind: HelmRelease +metadata: + name: vector-aggregator + namespace: monitoring +spec: + interval: 15m + chart: + spec: + chart: vector + version: 0.16.0 + sourceRef: + kind: HelmRepository + name: vector + namespace: flux-system + test: + enable: false + install: + remediation: + retries: 5 + upgrade: + remediation: + retries: 5 + remediateLastFailure: true + cleanupOnFail: true + rollback: + timeout: 10m + recreate: true + cleanupOnFail: true + dependsOn: + - name: loki + namespace: monitoring + values: + image: + repository: timberio/vector + tag: 0.24.1-debian + role: Stateless-Aggregator + replicas: 3 + customConfig: + data_dir: /vector-data-dir + api: + enabled: false + sources: + journal_logs: + type: vector + address: 0.0.0.0:6000 + version: "2" + kubernetes_logs: + type: vector + address: 0.0.0.0:6010 + version: "2" + opnsense_filterlog_logs: + type: syslog + address: 0.0.0.0:5140 + mode: udp + transforms: + kubernetes_logs_remap: + type: remap + inputs: + - kubernetes_logs + source: | + # Standardize 'app' index + .custom_app_name = .pod_labels."app.kubernetes.io/name" || .pod_labels.app || .pod_labels."k8s-app" || "unknown" + opnsense_filterlog_remap: + type: remap + inputs: + - opnsense_filterlog_logs + source: | + msg = parse_csv!(string!(.message)) + # Only parse IPv4 / IPv6 + if msg[8] == "4" || msg[8] == "6" { + .filter_interface = msg[4] + .filter_direction = msg[7] + .filter_action = msg[6] + .filter_ip_version = msg[8] + .filter_protocol = msg[16] + .filter_source_ip = msg[18] + .filter_destination_ip = msg[19] + if (msg[16] == "icmp" || msg[16] == "igmp" || msg[16] == "gre") { + .filter_data = msg[20] + } else { + .filter_source_port = msg[20] + 
.filter_destination_port = msg[21] + .filter_data_length = msg[22] + if msg[8] == "4" && msg[16] == "tcp" { + .filter_tcp_flags = msg[23] + } + } + } + opnsense_filterlog_route: + type: route + inputs: + - opnsense_filterlog_remap + route: + pass_action: >- + .filter_action == "pass" + opnsense_filterlog_geoip: + type: geoip + inputs: + - opnsense_filterlog_route.pass_action + database: /geoip/GeoLite2-City.mmdb + source: filter_source_ip + target: geoip + sinks: + loki_journal: + type: loki + inputs: + - journal_logs + endpoint: http://loki-gateway:80 + encoding: + codec: json + batch: + max_bytes: 2049000 + out_of_order_action: accept + remove_label_fields: true + remove_timestamp: true + labels: + hostname: >- + {{`{{ host }}`}} + loki_kubernetes: + type: loki + inputs: + - kubernetes_logs_remap + endpoint: http://loki-gateway:80 + encoding: + codec: json + batch: + max_bytes: 2049000 + out_of_order_action: accept + remove_label_fields: true + remove_timestamp: true + labels: + app: >- + {{`{{ custom_app_name }}`}} + namespace: >- + {{`{{ kubernetes.pod_namespace }}`}} + node: >- + {{`{{ kubernetes.pod_node_name }}`}} + loki_opnsense_filterlog: + type: loki + inputs: + - opnsense_filterlog_route._unmatched + - opnsense_filterlog_geoip + endpoint: http://loki-gateway:80 + encoding: + codec: json + batch: + max_bytes: 2049000 + out_of_order_action: accept + labels: + hostname: opnsense + extraVolumeMounts: + - name: geoip + mountPath: /geoip + extraVolumes: + - name: geoip + persistentVolumeClaim: + claimName: vector-geoipupdate-config-v1 + service: + enabled: true + type: LoadBalancer + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/component + operator: In + values: ["Stateless-Aggregator"] + topologyKey: kubernetes.io/hostname + postRenderers: + - kustomize: + patchesJson6902: + - target: + kind: Service + name: vector-aggregator + 
patch: + - op: add + path: /spec/externalIPs + value: ["${SVC_SYSLOG_ADDR}"] + - op: replace + path: /spec/externalTrafficPolicy + value: Local diff --git a/k8s/namespaces/clusters/base/monitoring/vector/aggregator/kustomization.yaml b/k8s/namespaces/clusters/base/monitoring/vector/aggregator/kustomization.yaml new file mode 100644 index 0000000000..2fa2de20ca --- /dev/null +++ b/k8s/namespaces/clusters/base/monitoring/vector/aggregator/kustomization.yaml @@ -0,0 +1,5 @@ +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - helm-release.yaml diff --git a/k8s/namespaces/clusters/base/monitoring/vector/geoipupdate/config-pvc.yaml b/k8s/namespaces/clusters/base/monitoring/vector/geoipupdate/config-pvc.yaml new file mode 100644 index 0000000000..66f4d04c37 --- /dev/null +++ b/k8s/namespaces/clusters/base/monitoring/vector/geoipupdate/config-pvc.yaml @@ -0,0 +1,15 @@ +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: vector-geoipupdate-config-v1 + namespace: monitoring + labels: + excluded_from_alerts: "true" +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 1Gi + storageClassName: ceph-filesystem diff --git a/k8s/namespaces/clusters/base/monitoring/vector/geoipupdate/cron-job.yaml b/k8s/namespaces/clusters/base/monitoring/vector/geoipupdate/cron-job.yaml new file mode 100644 index 0000000000..cc25f3ca0a --- /dev/null +++ b/k8s/namespaces/clusters/base/monitoring/vector/geoipupdate/cron-job.yaml @@ -0,0 +1,54 @@ +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: &app vector-geoipupdate + namespace: monitoring +spec: + schedule: "@daily" + concurrencyPolicy: Forbid + successfulJobsHistoryLimit: 1 + failedJobsHistoryLimit: 2 + jobTemplate: + spec: + ttlSecondsAfterFinished: 86400 + template: + spec: + automountServiceAccountToken: false + restartPolicy: Never + containers: + - name: *app + image: docker.io/maxmindinc/geoipupdate:v4.10 + imagePullPolicy: IfNotPresent + lifecycle: +
preStop: + exec: + command: + - /bin/sh + - -c + - rm -rf /usr/share/GeoIP/.geoipupdate.lock + env: + - name: GEOIPUPDATE_EDITION_IDS + value: GeoLite2-City + - name: GEOIPUPDATE_FREQUENCY + value: "0" + - name: GEOIPUPDATE_VERBOSE + value: "true" + # The Secret stores the credentials under GEOUPDATE_* keys; map them to the GEOIPUPDATE_* names the image reads. + - name: GEOIPUPDATE_ACCOUNT_ID + valueFrom: + secretKeyRef: + name: *app + key: GEOUPDATE_ACCOUNT_ID + - name: GEOIPUPDATE_LICENSE_KEY + valueFrom: + secretKeyRef: + name: *app + key: GEOUPDATE_LICENSE_KEY + volumeMounts: + - name: *app + mountPath: /usr/share/GeoIP + volumes: + - name: *app + persistentVolumeClaim: + claimName: vector-geoipupdate-config-v1 diff --git a/k8s/namespaces/clusters/base/monitoring/vector/geoipupdate/kustomization.yaml b/k8s/namespaces/clusters/base/monitoring/vector/geoipupdate/kustomization.yaml new file mode 100644 index 0000000000..2fa2de20ca --- /dev/null +++ b/k8s/namespaces/clusters/base/monitoring/vector/geoipupdate/kustomization.yaml @@ -0,0 +1,7 @@ +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - config-pvc.yaml + - cron-job.yaml + - secrets.sops.yaml diff --git a/k8s/namespaces/clusters/base/monitoring/vector/geoipupdate/secrets.sops.yaml b/k8s/namespaces/clusters/base/monitoring/vector/geoipupdate/secrets.sops.yaml new file mode 100644 index 0000000000..658b2fb917 --- /dev/null +++ b/k8s/namespaces/clusters/base/monitoring/vector/geoipupdate/secrets.sops.yaml @@ -0,0 +1,29 @@ +apiVersion: v1 +kind: Secret +metadata: + name: vector-geoipupdate + namespace: monitoring +type: Opaque +stringData: + GEOUPDATE_ACCOUNT_ID: ENC[AES256_GCM,data:Sjq0DHmf,iv:0MsyPvCY511ZsvPcg99Keles1NIBgjNwlEb3qKpXn44=,tag:FoVYjet3KFaqQOnaUVB59Q==,type:str] + GEOUPDATE_LICENSE_KEY: ENC[AES256_GCM,data:ouZ8q4mLmbFVvXnefC53ug==,iv:s0a6pfHRriE6CV7c8KwYRn8zPsfnl4EhKUl2fiyHfE4=,tag:VRhVwjHvCQepq6QHRdayMQ==,type:str] +sops: + kms: [] + gcp_kms: [] + azure_kv: [] + hc_vault: [] + age: + - recipient: age1eynu35v0tpg9remal6zeecfeg9e84a2qxake027wwgdn02rdfcls7nyv8r + enc: | + -----BEGIN AGE ENCRYPTED FILE----- + YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBrRkJ4VTViM0NTS254NU1V + QW95MGJqV1NGNVplRXB1aDJ5WFlaOWNaRGtFCnhUeGR5aCtDZFV2b2c4Q2phbHFJ +
dzU2MURjV0NBWkk0a2N6WCt3cTFKRjQKLS0tIDlEQU5KeUNQakNJRkdtK0RWdnc4 + TGxoQ2RBRVRRbW9Ib3lpcUpheDlTTHMKfzDVtap+CWIqOOo7SOe2GvKQPGyazB9/ + AJpzXvIV6uxKjd5KhSxdgtXz4wJPXN93MalwC90gZdDMi3WegZOKLg== + -----END AGE ENCRYPTED FILE----- + lastmodified: "2022-10-05T21:12:26Z" + mac: ENC[AES256_GCM,data:Q+5BEI/bG2upEUMGlnOMt0C9oDHzZPLuzYyVGRzCSt8zOwj+ZmDYcTdfO88oVuPLZqO+0b89KPSvqrYevSXHChR5APqwGoQBcJWejotAfYfn1bPLkxCSMCv0b7pCFY+IOP3M0ocsCm83DyboIeQc1ftfkZOBuMxNQesp/Sz6x08=,iv:Us8OUzwa90eZ+SpV1U6h9r0ebQyfjlNP+xBU+VI6kF8=,tag:NDfaKBNcoysm5gz2HSh5Pg==,type:str] + pgp: [] + encrypted_regex: ((?i)(pass|secret($|[^N])|key|token|^data$|^stringData)) + version: 3.7.3 diff --git a/k8s/namespaces/overlays/base/monitoring.yaml b/k8s/namespaces/overlays/base/monitoring.yaml index 2e032c98aa..0fbc657c75 100644 --- a/k8s/namespaces/overlays/base/monitoring.yaml +++ b/k8s/namespaces/overlays/base/monitoring.yaml @@ -142,3 +142,99 @@ spec: name: global-config - kind: Secret name: cluster-secrets +--- +apiVersion: kustomize.toolkit.fluxcd.io/v1beta2 +kind: Kustomization +metadata: + name: monitoring-vector-agent + namespace: flux-system +spec: + interval: 5m + path: "./k8s/namespaces/clusters/base/monitoring/vector/agent" + prune: true + wait: true + sourceRef: + kind: GitRepository + name: home-cluster + decryption: + provider: sops + secretRef: + name: sops-age + dependsOn: + - name: networking-cert-manager + namespace: flux-system + - name: monitoring-namespace + namespace: flux-system + postBuild: + substitute: {} + substituteFrom: + - kind: ConfigMap + name: cluster-config + - kind: ConfigMap + name: global-config + - kind: Secret + name: cluster-secrets +--- +apiVersion: kustomize.toolkit.fluxcd.io/v1beta2 +kind: Kustomization +metadata: + name: monitoring-vector-aggregator + namespace: flux-system +spec: + interval: 5m + path: "./k8s/namespaces/clusters/base/monitoring/vector/aggregator" + prune: true + wait: true + sourceRef: + kind: GitRepository + name: home-cluster + decryption: + 
provider: sops + secretRef: + name: sops-age + dependsOn: + - name: networking-cert-manager + namespace: flux-system + - name: monitoring-namespace + namespace: flux-system + postBuild: + substitute: {} + substituteFrom: + - kind: ConfigMap + name: cluster-config + - kind: ConfigMap + name: global-config + - kind: Secret + name: cluster-secrets +--- +apiVersion: kustomize.toolkit.fluxcd.io/v1beta2 +kind: Kustomization +metadata: + name: monitoring-vector-geoipupdate + namespace: flux-system +spec: + interval: 5m + path: "./k8s/namespaces/clusters/base/monitoring/vector/geoipupdate" + prune: true + wait: true + sourceRef: + kind: GitRepository + name: home-cluster + decryption: + provider: sops + secretRef: + name: sops-age + dependsOn: + - name: networking-cert-manager + namespace: flux-system + - name: monitoring-namespace + namespace: flux-system + postBuild: + substitute: {} + substituteFrom: + - kind: ConfigMap + name: cluster-config + - kind: ConfigMap + name: global-config + - kind: Secret + name: cluster-secrets From 8b9a347883562088fc9f04b3b8886200db5880b0 Mon Sep 17 00:00:00 2001 From: snoopy82481 Date: Thu, 6 Oct 2022 13:47:01 -0600 Subject: [PATCH 14/18] feat(monitoring): :wrench: Add alertmanager-discord --- .vscode/settings.json | 6 ++- .../alertmanager-discord/helm-release.yaml | 49 +++++++++++++++++++ .../alertmanager-discord/kustomization.yaml | 6 +++ .../alertmanager-discord/secret.sops.yaml | 31 ++++++++++++ .../base/monitoring/grafana/secret.sops.yaml | 8 +-- 5 files changed, 94 insertions(+), 6 deletions(-) create mode 100644 k8s/namespaces/clusters/base/monitoring/alertmanager-discord/helm-release.yaml create mode 100644 k8s/namespaces/clusters/base/monitoring/alertmanager-discord/kustomization.yaml create mode 100644 k8s/namespaces/clusters/base/monitoring/alertmanager-discord/secret.sops.yaml diff --git a/.vscode/settings.json b/.vscode/settings.json index 79e2aa83db..246bd92bac 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json 
@@ -16,8 +16,10 @@ "editor.guides.bracketPairs":"active", "cSpell.enabled": false, "yaml.schemas": { - "Kubernetes": "k8s/**/*.yaml", - "recyclarr": "https://raw.githubusercontent.com/recyclarr/recyclarr/master/schemas/config-schema.json" + "https://raw.githubusercontent.com/recyclarr/recyclarr/master/schemas/config-schema.json": "recyclarr.yaml", + "https://raw.githubusercontent.com/recyclarr/recyclarr/master/schemas/settings-schema.json": "settings.yaml", + "https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/v1.25.2-standalone/all.json": "k8s/**/*.yaml", + "recyclarr": "https://raw.githubusercontent.com/recyclarr/recyclarr/master/schemas/config-schema.json", }, "material-icon-theme.folders.associations": { ".taskfiles": "utils", diff --git a/k8s/namespaces/clusters/base/monitoring/alertmanager-discord/helm-release.yaml b/k8s/namespaces/clusters/base/monitoring/alertmanager-discord/helm-release.yaml new file mode 100644 index 0000000000..7ccc93cfe2 --- /dev/null +++ b/k8s/namespaces/clusters/base/monitoring/alertmanager-discord/helm-release.yaml @@ -0,0 +1,49 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2beta1 +kind: HelmRelease +metadata: + name: &app alertmanager-discord + namespace: monitoring +spec: + interval: 15m + chart: + spec: + chart: app-template + version: 0.2.2 + sourceRef: + kind: HelmRepository + name: bjw-s-charts + namespace: flux-system + interval: 15m + test: + enable: false + install: + remediation: + retries: 5 + upgrade: + remediation: + retries: 5 + remediateLastFailure: true + cleanupOnFail: true + rollback: + timeout: 10m + recreate: true + cleanupOnFail: true + values: + global: + nameOverride: *app + image: + repository: benjojo/alertmanager-discord + tag: latest + service: + main: + ports: + http: + port: 9094 + resources: + requests: + cpu: 20m + memory: 100Mi + envFrom: + - secretRef: + name: alertmanager-discord-secret diff --git a/k8s/namespaces/clusters/base/monitoring/alertmanager-discord/kustomization.yaml 
b/k8s/namespaces/clusters/base/monitoring/alertmanager-discord/kustomization.yaml new file mode 100644 index 0000000000..73941365cd --- /dev/null +++ b/k8s/namespaces/clusters/base/monitoring/alertmanager-discord/kustomization.yaml @@ -0,0 +1,6 @@ +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - helm-release.yaml + - secret.sops.yaml diff --git a/k8s/namespaces/clusters/base/monitoring/alertmanager-discord/secret.sops.yaml b/k8s/namespaces/clusters/base/monitoring/alertmanager-discord/secret.sops.yaml new file mode 100644 index 0000000000..ffdebf55e1 --- /dev/null +++ b/k8s/namespaces/clusters/base/monitoring/alertmanager-discord/secret.sops.yaml @@ -0,0 +1,31 @@ +# yamllint disable +apiVersion: v1 +kind: Secret +metadata: + name: alertmanager-discord-secret + namespace: monitoring + labels: + app.kubernetes.io/name: alertmanager-discord +type: Opaque +stringData: + DISCORD_WEBHOOK: ENC[AES256_GCM,data:X4wSVMjNcBzHamUy4gDDwdBlwHf+9zBm0u8TBryEHIU39w7k9PW8kDlQC6Cn5EInm85zkDydP5PNTp8z6f1ypJhAwEKcGu7daODXXpHi0UoWqYcVIkuxoxUgajAO8nNxERQ+iUavLduH1QHDJ67fgyuy6iJyUteTaQ==,iv:u89wssUCEltfsSDRIQa64dtphs16NO19ZZOvCQozhA4=,tag:oimjzmsZWO2ukAsVxXPRzA==,type:str] +sops: + kms: [] + gcp_kms: [] + azure_kv: [] + hc_vault: [] + age: + - recipient: age1eynu35v0tpg9remal6zeecfeg9e84a2qxake027wwgdn02rdfcls7nyv8r + enc: | + -----BEGIN AGE ENCRYPTED FILE----- + YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBPMFI2WVlGOWhhdHdaNlkr + MFVPczFFcmJ6TDBtNDNiWFQxcGpEY3o4Z2owCnlRcW1qZUVQb2pzWEx2b3dpZDd4 + ZUVpb1orWXQ0ZkMvV0wzS0F2djRSZDQKLS0tIHA2cEl0Q05qbU51RXh4SWprajhr + Z0hDTEgyeE94azhVSkVjb3NxS29Zc28K1tSWQXfEdMZX/HcitWShuTyaRD26VeHL + N4+LidD6V69SHblAiCIf2rbWWgemobiwbuIgGxE1VqLi3KWdKCEwQQ== + -----END AGE ENCRYPTED FILE----- + lastmodified: "2022-10-06T19:20:52Z" + mac: 
ENC[AES256_GCM,data:09yhy8Yl95MPVSBVdh5MWohoOumOU5UV+DktKxqQG1VM1bwAd1gF70/Yh3ok2vcHKREb7sIzwzlMI0n+y1pI2bRXLJT7M0ru7LsJDJ+xs6vj8OXL2S88TE+FRUDlxali4XBLkY94DKmBrfIK3Uqc7rClOOHMv81eeodplQmi7Zw=,iv:Pv3Og4etVefQzxCCXn+QjWCBYxXOy9OHMKPPEL5Jtq8=,tag:1ypZQi7qYJN0AlSiGtOKKw==,type:str] + pgp: [] + encrypted_regex: ((?i)(pass|secret($|[^N])|key|token|^data$|^stringData)) + version: 3.7.3 diff --git a/k8s/namespaces/clusters/base/monitoring/grafana/secret.sops.yaml b/k8s/namespaces/clusters/base/monitoring/grafana/secret.sops.yaml index 609bc6e20e..91361a5a86 100644 --- a/k8s/namespaces/clusters/base/monitoring/grafana/secret.sops.yaml +++ b/k8s/namespaces/clusters/base/monitoring/grafana/secret.sops.yaml @@ -22,8 +22,8 @@ sops: Z1R3RFV5bER5Q0VBb3RnK0diNXpNcVkK3U21Y5GWvnmPA8hxi8Us7TkNGsCYAvlD QmY8mT6ApdiczqVo1DgFmKDSMIYNGL2wlyyriu9MLCU8a9tGxmj47g== -----END AGE ENCRYPTED FILE----- - lastmodified: "2022-10-05T15:05:14Z" - mac: ENC[AES256_GCM,data:KBjuXHhmBJi3kQZiu5xAagXBTaMeuyUlv1sTwcaWb9IzE/4qTssuyF3sA31Jokjy8d6uEqbTrpb9eoPmDkH30mX9M3E7uorTPyVu3fEhtMCsn/yWDVNZeX9JicwFxdEoU127ox5qFSfn0u7UYxl0b58LU1YT9kH/ea2MKd4bV/k=,iv:LRiCgC/aA1BUtIev6eYdkPpzruAURuHNO5aoB8UGNbM=,tag:d2QEUNoORWoLkgWt0PXIfA==,type:str] + lastmodified: "2022-10-06T19:15:56Z" + mac: ENC[AES256_GCM,data:7gN0iYVMIeAlSF9c21RI5hEdXdQoCUYf3M487gVkMNWgCY256Cf0gmP5CyiusgHjQ/5IIJmRupNZvLX2/BDtSl4FPhJcIcTVWk/zspTaDfvzA5O7uwH+LTKeUYbCIvCqRsRbeNmFTSjTSiItCoc5VPo+oKjK4hZ5SML4D6YABBc=,iv:G55upHE7qQNG+/jYqp4ACyqWnT44/L90BCLF3OQ8UxU=,tag:no+fcHquWpmajVRkqzq7EQ==,type:str] pgp: [] encrypted_regex: ((?i)(pass|secret($|[^N])|key|token|^data$|^stringData)) version: 3.7.3 @@ -51,8 +51,8 @@ sops: Z1R3RFV5bER5Q0VBb3RnK0diNXpNcVkK3U21Y5GWvnmPA8hxi8Us7TkNGsCYAvlD QmY8mT6ApdiczqVo1DgFmKDSMIYNGL2wlyyriu9MLCU8a9tGxmj47g== -----END AGE ENCRYPTED FILE----- - lastmodified: "2022-10-05T15:05:14Z" - mac: 
ENC[AES256_GCM,data:KBjuXHhmBJi3kQZiu5xAagXBTaMeuyUlv1sTwcaWb9IzE/4qTssuyF3sA31Jokjy8d6uEqbTrpb9eoPmDkH30mX9M3E7uorTPyVu3fEhtMCsn/yWDVNZeX9JicwFxdEoU127ox5qFSfn0u7UYxl0b58LU1YT9kH/ea2MKd4bV/k=,iv:LRiCgC/aA1BUtIev6eYdkPpzruAURuHNO5aoB8UGNbM=,tag:d2QEUNoORWoLkgWt0PXIfA==,type:str] + lastmodified: "2022-10-06T19:15:56Z" + mac: ENC[AES256_GCM,data:7gN0iYVMIeAlSF9c21RI5hEdXdQoCUYf3M487gVkMNWgCY256Cf0gmP5CyiusgHjQ/5IIJmRupNZvLX2/BDtSl4FPhJcIcTVWk/zspTaDfvzA5O7uwH+LTKeUYbCIvCqRsRbeNmFTSjTSiItCoc5VPo+oKjK4hZ5SML4D6YABBc=,iv:G55upHE7qQNG+/jYqp4ACyqWnT44/L90BCLF3OQ8UxU=,tag:no+fcHquWpmajVRkqzq7EQ==,type:str] pgp: [] encrypted_regex: ((?i)(pass|secret($|[^N])|key|token|^data$|^stringData)) version: 3.7.3 From ca9b30afc5481d568f16b58c445fef365a97f108 Mon Sep 17 00:00:00 2001 From: snoopy82481 Date: Thu, 6 Oct 2022 14:39:27 -0600 Subject: [PATCH 15/18] feat(monitoring): :wrench: Prometheus add alertmanager-discord hook --- .../kube-prometheus-stack/helm-release.yaml | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/k8s/namespaces/clusters/base/monitoring/kube-prometheus-stack/helm-release.yaml b/k8s/namespaces/clusters/base/monitoring/kube-prometheus-stack/helm-release.yaml index 50bb3bc001..32a7e1c960 100644 --- a/k8s/namespaces/clusters/base/monitoring/kube-prometheus-stack/helm-release.yaml +++ b/k8s/namespaces/clusters/base/monitoring/kube-prometheus-stack/helm-release.yaml @@ -29,6 +29,60 @@ spec: recreate: true cleanupOnFail: true values: + alertmanager: + config: + global: + resolve_timeout: 5m + receivers: + - name: "null" + - name: discord + webhook_configs: + - url: http://alertmanager-discord:9094 + route: + group_by: ["alertname", "job"] + group_wait: 30s + group_interval: 5m + repeat_interval: 6h + receiver: "discord" + routes: + - receiver: "null" + matchers: + - alertname =~ "InfoInhibitor|Watchdog" + - receiver: "discord" + matchers: + - severity = "critical" + continue: true + inhibit_rules: + - source_matchers: + - severity = "critical" +
target_matchers: + - severity = "warning" + equal: ["alertname", "namespace"] + ingress: + enabled: true + pathType: Prefix + ingressClassName: "nginx" + annotations: + # cert-manager.io/cluster-issuer: ${CLUSTER_CERT} + cert-manager.io/cluster-issuer: "letsencrypt-staging" + hosts: + - &host "alert-manager.${SECRET_PUBLIC_DOMAIN}" + tls: + - secretName: "alert-manager-tls" + hosts: + - *host + # HA Alertmanager: three replicas, hard pod anti-affinity, and a volumeClaimTemplate requesting 1Gi of ceph-block storage. + alertmanagerSpec: + replicas: 3 + podAntiAffinity: hard + storage: + volumeClaimTemplate: + spec: + storageClassName: "ceph-block" + resources: + requests: + storage: 1Gi + kubeApiServer: enabled: true From 53b13688beae0225008fb6533c98a84d43b77fbd Mon Sep 17 00:00:00 2001 From: snoopy82481 Date: Thu, 6 Oct 2022 15:13:40 -0600 Subject: [PATCH 16/18] chore(monitoring): :wrench: Update configs --- .../helm/cloudnative-pg-charts.yaml | 9 ++++++ .../base/databases/postgres/cluster.yaml | 2 -- .../base/databases/postgres/helm-release.yaml | 2 +- .../base/monitoring/grafana/helm-release.yaml | 2 +- k8s/namespaces/overlays/base/monitoring.yaml | 32 +++++++++++++++++++ 5 files changed, 43 insertions(+), 4 deletions(-) create mode 100644 k8s/global/flux/repositories/helm/cloudnative-pg-charts.yaml diff --git a/k8s/global/flux/repositories/helm/cloudnative-pg-charts.yaml b/k8s/global/flux/repositories/helm/cloudnative-pg-charts.yaml new file mode 100644 index 0000000000..6cd8394ff2 --- /dev/null +++ b/k8s/global/flux/repositories/helm/cloudnative-pg-charts.yaml @@ -0,0 +1,9 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1beta2 +kind: HelmRepository +metadata: + name: cloudnative-pg + namespace: flux-system +spec: + interval: 1h + url: https://cloudnative-pg.github.io/charts diff --git a/k8s/namespaces/clusters/base/databases/postgres/cluster.yaml b/k8s/namespaces/clusters/base/databases/postgres/cluster.yaml index 3bc15113a6..17e4e87f0f 100644 --- a/k8s/namespaces/clusters/base/databases/postgres/cluster.yaml +++
b/k8s/namespaces/clusters/base/databases/postgres/cluster.yaml @@ -4,8 +4,6 @@ kind: Cluster metadata: name: postgres namespace: databases - annotations: - kyverno.io/ignore: "true" spec: instances: 3 primaryUpdateStrategy: unsupervised diff --git a/k8s/namespaces/clusters/base/databases/postgres/helm-release.yaml b/k8s/namespaces/clusters/base/databases/postgres/helm-release.yaml index 6df1dad927..9d5690b67a 100644 --- a/k8s/namespaces/clusters/base/databases/postgres/helm-release.yaml +++ b/k8s/namespaces/clusters/base/databases/postgres/helm-release.yaml @@ -12,7 +12,7 @@ spec: version: 0.15.0 sourceRef: kind: HelmRepository - name: cloudnative-pg-charts + name: cloudnative-pg namespace: flux-system test: enable: false diff --git a/k8s/namespaces/clusters/base/monitoring/grafana/helm-release.yaml b/k8s/namespaces/clusters/base/monitoring/grafana/helm-release.yaml index 26bfb82c6e..928537a772 100644 --- a/k8s/namespaces/clusters/base/monitoring/grafana/helm-release.yaml +++ b/k8s/namespaces/clusters/base/monitoring/grafana/helm-release.yaml @@ -29,7 +29,7 @@ spec: recreate: true cleanupOnFail: true values: - replicas: 1 + replicas: 3 deploymentStrategy: type: Recreate diff --git a/k8s/namespaces/overlays/base/monitoring.yaml b/k8s/namespaces/overlays/base/monitoring.yaml index 0fbc657c75..f2c6e3b919 100644 --- a/k8s/namespaces/overlays/base/monitoring.yaml +++ b/k8s/namespaces/overlays/base/monitoring.yaml @@ -15,6 +15,38 @@ spec: --- apiVersion: kustomize.toolkit.fluxcd.io/v1beta2 kind: Kustomization +metadata: + name: monitoring-alertmanager-discord + namespace: flux-system +spec: + interval: 5m + path: "./k8s/namespaces/clusters/base/monitoring/alertmanager-discord" + prune: true + wait: true + sourceRef: + kind: GitRepository + name: home-cluster + decryption: + provider: sops + secretRef: + name: sops-age + dependsOn: + - name: networking-cert-manager + namespace: flux-system + - name: monitoring-namespace + namespace: flux-system + postBuild: + substitute: 
{} + substituteFrom: + - kind: ConfigMap + name: cluster-config + - kind: ConfigMap + name: global-config + - kind: Secret + name: cluster-secrets +--- +apiVersion: kustomize.toolkit.fluxcd.io/v1beta2 +kind: Kustomization metadata: name: monitoring-kube-prometheus-stack namespace: flux-system From fede2ee47e498b04f3746aad651ddf2021ddf085 Mon Sep 17 00:00:00 2001 From: snoopy82481 Date: Thu, 6 Oct 2022 15:25:51 -0600 Subject: [PATCH 17/18] chore(flux): :wrench: Add cloudnative-pg chart --- k8s/global/flux/repositories/helm/kustomization.yaml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/k8s/global/flux/repositories/helm/kustomization.yaml b/k8s/global/flux/repositories/helm/kustomization.yaml index 0aad3ccc81..1973587cae 100644 --- a/k8s/global/flux/repositories/helm/kustomization.yaml +++ b/k8s/global/flux/repositories/helm/kustomization.yaml @@ -2,18 +2,19 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: + - bitnami-charts.yaml - cilium-charts.yaml + - cloudnative-pg-charts.yaml - external-dns-charts.yaml + - grafana-charts.yaml - ingress-nginx-charts.yaml - jetstack-charts.yaml - k8s-gateway-charts.yaml + - kubereboot-charts.yaml - metallb-charts.yaml - metrics-server-charts.yaml - prometheus-community-charts.yaml - rook-ceph-charts.yaml - - stakater-charts.yaml - sealed-secrets-charts.yaml - - bitnami-charts.yaml - - kubereboot-charts.yaml - - grafana-charts.yaml + - stakater-charts.yaml - vector-charts.yaml From a63eeb36035cbd93b1419b052c95e3372ddc60f0 Mon Sep 17 00:00:00 2001 From: snoopy82481 Date: Thu, 6 Oct 2022 15:35:31 -0600 Subject: [PATCH 18/18] chore(vscode): --- .vscode/settings.json | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 246bd92bac..32df076eb6 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -46,12 +46,20 @@ "cluster-betty", "cluster-talos", "cluster-global", + "flux", + "monitoring", 
+ "postgres", "cilium", "kured", "metrics-server", "reloader", "sealed-secrets", - "qbittorrent", + "alertmanager", + "grafana", + "kps", + "loki", + "thanos", + "vector", "cert-manager", "external-dns", "ingress-nginx", @@ -60,7 +68,6 @@ "origin-ca-issuer", "rook-ceph", "qbittorrent", - "flux", "bazarr", "overseer", "prowlarr", @@ -73,8 +80,6 @@ "sonarranime", "tautulli", "nzbget", - "static", - "blackbox", "vpn-gateway", "hajimari", "theme-park"