Skip to content

Commit

Permalink
Updated the docker rules and fixed the test case for docker container…
Browse files Browse the repository at this point in the history
… status
  • Loading branch information
Deepak Tiwari committed Dec 18, 2024
1 parent 66d3c0a commit 8fb9ab2
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 0 deletions.
15 changes: 15 additions & 0 deletions argocd-helm-charts/prometheus-linuxaid/rules/docker.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Alerting rules for Docker container availability.
# This file is embedded into a PrometheusRule `spec:` by the Helm chart, so it
# must stay a bare `groups:` mapping (no `---` document marker).
groups:
  - name: monitor::system::docker
    rules:
      # Fires when a container's `container_last_seen` value advanced by less
      # than 1200 over the trailing 30m window, restricted to hosts where
      # obmondo_monitoring reports this alert_id with a value > 0.
      # Containers whose name starts with "runner" are excluded.
      # NOTE(review): container_last_seen looks like a Unix timestamp in
      # seconds (a continuously seen container would advance ~1800 per 30m);
      # confirm against the exporter before tuning the 1200 threshold.
      - alert: monitor::system::docker::status
        expr: |
          (
            sum by (certname, name) (increase(container_last_seen{name!~"^runner.*"}[30m])) < 1200
          ) and on(certname) obmondo_monitoring{alert_id="monitor::system::docker::status"} > 0
        for: 5m
        labels:
          alert_id: monitor::system::docker::status
          severity: critical
        annotations:
          # The rendered text of both annotations is asserted verbatim by
          # tests/docker.yaml; keep the wording in sync with that test.
          summary: "Docker container {{ $labels.name }} is down on host {{ $labels.certname }}"
          description: "Docker container **{{ $labels.name }}** is down since from 10min on server **{{ $labels.certname }}**"
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{{- /*
PrometheusRule that wraps the rule groups stored in rules/docker.yaml.
Rendered only when .Values.prometheusRule.docker is truthy.
*/ -}}
{{- if .Values.prometheusRule.docker }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: docker-rules
  namespace: monitoring-{{ .Values.customerid }}
spec:
  {{- /* The file's top-level `groups:` key becomes the content of `spec`; nindent re-indents every line beneath it. */}}
  {{- $.Files.Get "rules/docker.yaml" | nindent 4 }}
{{- end }}
28 changes: 28 additions & 0 deletions argocd-helm-charts/prometheus-linuxaid/tests/docker.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
---
# Rule unit test (promtool format) for the alert defined in ../rules/docker.yaml.
evaluation_interval: 1m

rule_files:
  - ../rules/docker.yaml

tests:
  - interval: 1m
    input_series:
      # Constant 1, so the rule's `obmondo_monitoring{...} > 0` gate holds for this host.
      - series: 'obmondo_monitoring{certname="bantha.enableit", alert_id="monitor::system::docker::status"}'
        values: '1x1800'
      # Advances by 35 per 1m step (about 1050 per 30m), below the rule's 1200 threshold: expected to alert.
      - series: 'container_last_seen{certname="bantha.enableit", name="packagesign-script-1"}'
        values: '1734500308+35x1800'
      # Advances by 60 per 1m step (about 1800 per 30m), above the threshold: no alert is expected for it.
      - series: 'container_last_seen{certname="bantha.enableit", name="jenkins"}'
        values: '1734500308+60x1800'

    alert_rule_test:
      - alertname: monitor::system::docker::status
        eval_time: 30m
        # Only packagesign-script-1 is listed, so any alert for jenkins would fail the test.
        exp_alerts:
          - exp_labels:
              severity: critical
              certname: bantha.enableit
              name: packagesign-script-1
              alert_id: monitor::system::docker::status
            # Must match the rule's rendered annotations character for character.
            exp_annotations:
              summary: 'Docker container packagesign-script-1 is down on host bantha.enableit'
              description: 'Docker container **packagesign-script-1** is down since from 10min on server **bantha.enableit**'
1 change: 1 addition & 0 deletions argocd-helm-charts/prometheus-linuxaid/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ prometheusRule:
cpu: true
dellhw: true
disk: true
docker: true
drbd: true
elasticsearch: true
file_size: true
Expand Down

0 comments on commit 8fb9ab2

Please sign in to comment.