From f2d3fe0fbc17910577040564226d9e243f7efa1f Mon Sep 17 00:00:00 2001
From: Mia Altieri <32723809+MiaAltieri@users.noreply.github.com>
Date: Wed, 11 Dec 2024 17:07:45 +0100
Subject: [PATCH] [DPE-6073] add requested alerts (#521)
Addressing #517 by adding the following requested alerts:
- Cluster is not writable
- Cluster will not be writable if I lose one more node
- Number of connections is close to max connections limit
along with a few others from the Percona alert rules
## testing
- Cluster is not writable
- Cluster will not be writable if I lose one more node - note this is
firing because it was deployed with a single replica, when the replica
set is scaled up it goes back to green
- Number of connections is close to max connections limit (80%)
---
.../prometheus/percona-mongodb-exporter.yml | 56 ++++++++++++++++++-
1 file changed, 55 insertions(+), 1 deletion(-)
diff --git a/src/alert_rules/prometheus/percona-mongodb-exporter.yml b/src/alert_rules/prometheus/percona-mongodb-exporter.yml
index fe0568dbf..1caaf5dd8 100644
--- a/src/alert_rules/prometheus/percona-mongodb-exporter.yml
+++ b/src/alert_rules/prometheus/percona-mongodb-exporter.yml
@@ -21,10 +21,64 @@ groups:
description: "MongoDB replica member is not healthy\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MongodbReplicationLag
- expr: 'mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"} > 10'
+ expr: '(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"}) / 1000 > 10'  # presumably optimeDate is in milliseconds, so / 1000 compares seconds against the 10s threshold - confirm against the exporter
for: 0m
labels:
severity: critical
annotations:
summary: MongoDB replication lag (instance {{ $labels.instance }})
description: "Mongodb replication lag is more than 10s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: MongodbReplicationHeadroom
+ expr: 'sum(avg(mongodb_mongod_replset_oplog_head_timestamp - mongodb_mongod_replset_oplog_tail_timestamp)) - (sum(avg(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"})) / 1000) <= 0'  # headroom = oplog window - replication lag; lag is scaled by / 1000 exactly as in MongodbReplicationLag so both terms share one unit (assumes oplog timestamps are in seconds - confirm)
+ for: 0m  # fire on the first evaluation that matches
+ labels:
+ severity: critical
+ annotations:
+ summary: MongoDB replication headroom (instance {{ $labels.instance }})
+ description: "MongoDB replication headroom is <= 0\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: MongodbNumberCursorsOpen
+ expr: 'mongodb_ss_metrics_cursor_open{csr_type="total"} > 10 * 1000'  # 10 * 1000 = the 10k open-cursor threshold quoted in the description
+ for: 2m  # condition must hold for 2 minutes before the alert fires
+ labels:
+ severity: warning
+ annotations:
+ summary: MongoDB number cursors open (instance {{ $labels.instance }})
+ description: "Too many cursors opened by MongoDB for clients (> 10k)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: MongodbCursorsTimeouts
+ expr: "increase(mongodb_ss_metrics_cursor_timedOut[1m]) > 100"  # more than 100 cursor timeouts counted over the trailing 1-minute window
+ for: 2m  # condition must hold for 2 minutes before the alert fires
+ labels:
+ severity: warning
+ annotations:
+ summary: MongoDB cursors timeouts (instance {{ $labels.instance }})
+ description: "Too many cursors are timing out\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: MongodbTooManyConnections
+ expr: 'sum by (instance) (mongodb_ss_connections{conn_type="current"}) / sum by (instance) (mongodb_ss_connections{conn_type=~"current|available"}) * 100 > 80'  # used / (used + still available) as a percentage; no rate() because the current-connections series is a gauge, and the denominator is limited to current+available so other conn_type series do not inflate it
+ for: 2m  # condition must hold for 2 minutes before the alert fires
+ labels:
+ severity: warning
+ annotations:
+ summary: MongoDB too many connections (instance {{ $labels.instance }})
+ description: "Too many connections (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: MongoDBNotWritable
+ expr: "sum(mongodb_mongod_replset_my_state == bool 1) == 0"  # "== bool 1" maps every node to 0/1 instead of filtering; a plain "== 1" filter returns an empty vector when no node is primary, so the sum would be absent (not 0) and the alert could never fire
+ for: 2m  # condition must hold for 2 minutes before the alert fires
+ labels:
+ severity: critical
+ annotations:
+ summary: MongoDB is not writable, no node is primary
+ description: "MongoDB is not writable, no node is primary"
+
+ - alert: MongoDBOneNodeLossAwayFromNotWitable  # NOTE(review): "Witable" looks like a typo for "Writable" - renaming changes the alert's identifier, so confirm nothing routes on it first
+ expr: "sum(mongodb_mongod_replset_my_state) == 1"  # NOTE(review): sums the raw state codes of all reporting nodes, so it equals 1 only when the codes add up to exactly 1 (presumably a lone PRIMARY) - confirm this matches "one node loss away" for multi-node replica sets
+ for: 2m  # condition must hold for 2 minutes before the alert fires
+ labels:
+ severity: warning
+ annotations:
+ summary: If MongoDB loses one more node it will not be writable
+ description: "If MongoDB loses one more node it will not be writable"