From 8fb9ab2213b2b7ba6a9710749065b107854ffbb0 Mon Sep 17 00:00:00 2001
From: Deepak Tiwari
Date: Wed, 18 Dec 2024 15:01:36 +0530
Subject: [PATCH] Updated the docker rules and fixed the test case for docker container status

---
Reviewer notes (free-form area; not part of the commit message):
 * rules/docker.yaml: the alert compares, per (certname, name), the increase of
   container_last_seen over a 30m window against 1200, skips containers whose
   name starts with "runner", and only applies to hosts that expose
   obmondo_monitoring{alert_id="monitor::system::docker::status"} > 0.
 * The alert description and the exp_annotations in tests/docker.yaml must stay
   character-for-character identical, otherwise the promtool test fails.
 * Line breaks and indentation were reconstructed from a flattened copy of this
   patch, and two strings were edited; the blob ids on the "index" lines below
   are therefore stale.

 .../prometheus-linuxaid/rules/docker.yaml     | 15 ++++++++++
 .../templates/prometheusRule-docker.yaml      |  9 ++++++
 .../prometheus-linuxaid/tests/docker.yaml     | 28 +++++++++++++++++++
 .../prometheus-linuxaid/values.yaml           |  1 +
 4 files changed, 53 insertions(+)
 create mode 100644 argocd-helm-charts/prometheus-linuxaid/rules/docker.yaml
 create mode 100644 argocd-helm-charts/prometheus-linuxaid/templates/prometheusRule-docker.yaml
 create mode 100644 argocd-helm-charts/prometheus-linuxaid/tests/docker.yaml

diff --git a/argocd-helm-charts/prometheus-linuxaid/rules/docker.yaml b/argocd-helm-charts/prometheus-linuxaid/rules/docker.yaml
new file mode 100644
index 00000000..bb3675ff
--- /dev/null
+++ b/argocd-helm-charts/prometheus-linuxaid/rules/docker.yaml
@@ -0,0 +1,15 @@
+groups:
+  - name: monitor::system::docker
+    rules:
+      - alert: monitor::system::docker::status
+        expr: |
+          (
+            sum by (certname, name) (increase(container_last_seen{name!~"^runner.*"}[30m])) < 1200
+          ) and on(certname) obmondo_monitoring{alert_id="monitor::system::docker::status"} > 0
+        for: 5m
+        labels:
+          alert_id: monitor::system::docker::status
+          severity: critical
+        annotations:
+          summary: "Docker container {{ $labels.name }} is down on host {{ $labels.certname }}"
+          description: "Docker container **{{ $labels.name }}** has been down for more than 10 minutes on server **{{ $labels.certname }}**"
diff --git a/argocd-helm-charts/prometheus-linuxaid/templates/prometheusRule-docker.yaml b/argocd-helm-charts/prometheus-linuxaid/templates/prometheusRule-docker.yaml
new file mode 100644
index 00000000..4b2d44fe
--- /dev/null
+++ b/argocd-helm-charts/prometheus-linuxaid/templates/prometheusRule-docker.yaml
@@ -0,0 +1,9 @@
+{{- if .Values.prometheusRule.docker }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: docker-rules
+  namespace: monitoring-{{ .Values.customerid }}
+spec:
+  {{- $.Files.Get "rules/docker.yaml" | nindent 4 }}
+{{- end }}
diff --git a/argocd-helm-charts/prometheus-linuxaid/tests/docker.yaml b/argocd-helm-charts/prometheus-linuxaid/tests/docker.yaml
new file mode 100644
index 00000000..0542fefa
--- /dev/null
+++ b/argocd-helm-charts/prometheus-linuxaid/tests/docker.yaml
@@ -0,0 +1,28 @@
+---
+evaluation_interval: 1m
+
+rule_files:
+  - ../rules/docker.yaml
+
+tests:
+  - interval: 1m
+    input_series:
+      - series: obmondo_monitoring{certname="bantha.enableit", alert_id="monitor::system::docker::status"}
+        values: '1x1800'
+      - series: container_last_seen{certname="bantha.enableit", name="packagesign-script-1"}
+        values: '1734500308+35x1800'
+      - series: container_last_seen{certname="bantha.enableit", name="jenkins"}
+        values: '1734500308+60x1800'
+
+    alert_rule_test:
+      - alertname: monitor::system::docker::status
+        eval_time: 30m
+        exp_alerts:
+          - exp_labels:
+              severity: critical
+              certname: bantha.enableit
+              name: packagesign-script-1
+              alert_id: monitor::system::docker::status
+            exp_annotations:
+              summary: 'Docker container packagesign-script-1 is down on host bantha.enableit'
+              description: 'Docker container **packagesign-script-1** has been down for more than 10 minutes on server **bantha.enableit**'
diff --git a/argocd-helm-charts/prometheus-linuxaid/values.yaml b/argocd-helm-charts/prometheus-linuxaid/values.yaml
index 194973c9..96af9924 100644
--- a/argocd-helm-charts/prometheus-linuxaid/values.yaml
+++ b/argocd-helm-charts/prometheus-linuxaid/values.yaml
@@ -32,6 +32,7 @@ prometheusRule:
   cpu: true
   dellhw: true
   disk: true
+  docker: true
   drbd: true
   elasticsearch: true
   file_size: true