From f2d3fe0fbc17910577040564226d9e243f7efa1f Mon Sep 17 00:00:00 2001
From: Mia Altieri <32723809+MiaAltieri@users.noreply.github.com>
Date: Wed, 11 Dec 2024 17:07:45 +0100
Subject: [PATCH] [DPE-6073] add requested alerts (#521)

Addressing #517 by adding the following requested alerts:
- Cluster is not writable
- Cluster will not be writable if I lose one more node
- Number of connections is close to max connections limit

along with a few others from the Percona alert rules

## testing
- Cluster is not writable
Screenshot 2024-12-09 at 14 22 58

- Cluster will not be writable if I lose one more node - note this is firing because it was deployed with a single replica, when the replica set is scaled up it goes back to green
Screenshot 2024-12-09 at 14 22 04

- Number of connections is close to max connections limit (80%)
Screenshot 2024-12-09 at 14 23 32
---
 .../prometheus/percona-mongodb-exporter.yml   | 64 ++++++++++++++++++-
 1 file changed, 63 insertions(+), 1 deletion(-)

diff --git a/src/alert_rules/prometheus/percona-mongodb-exporter.yml b/src/alert_rules/prometheus/percona-mongodb-exporter.yml
index fe0568dbf..1caaf5dd8 100644
--- a/src/alert_rules/prometheus/percona-mongodb-exporter.yml
+++ b/src/alert_rules/prometheus/percona-mongodb-exporter.yml
@@ -21,10 +21,72 @@ groups:
           description: "MongoDB replica member is not healthy\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
       - alert: MongodbReplicationLag
-        expr: 'mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"} > 10'
+        expr: '(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"}) / 1000 > 10'
         for: 0m
         labels:
           severity: critical
         annotations:
           summary: MongoDB replication lag (instance {{ $labels.instance }})
           description: "Mongodb replication lag is more than 10s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: MongodbReplicationHeadroom
+        # NOTE(review): the lag rule above divides optimeDate deltas by 1000;
+        # confirm the oplog timestamps below are in the same unit as optimeDate.
+        expr: 'sum(avg(mongodb_mongod_replset_oplog_head_timestamp - mongodb_mongod_replset_oplog_tail_timestamp)) - sum(avg(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"})) <= 0'
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: MongoDB replication headroom (instance {{ $labels.instance }})
+          description: "MongoDB replication headroom is <= 0\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: MongodbNumberCursorsOpen
+        expr: 'mongodb_ss_metrics_cursor_open{csr_type="total"} > 10 * 1000'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: MongoDB number cursors open (instance {{ $labels.instance }})
+          description: "Too many cursors opened by MongoDB for clients (> 10k)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: MongodbCursorsTimeouts
+        expr: "increase(mongodb_ss_metrics_cursor_timedOut[1m]) > 100"
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: MongoDB cursors timeouts (instance {{ $labels.instance }})
+          description: "Too many cursors are timing out\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: MongodbTooManyConnections
+        # Share of the connection limit in use: current / (current + available).
+        # Both are instantaneous counts, so the ratio uses raw values, not rate().
+        expr: 'sum by (instance) (mongodb_ss_connections{conn_type="current"}) / sum by (instance) (mongodb_ss_connections{conn_type=~"current|available"}) * 100 > 80'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: MongoDB too many connections (instance {{ $labels.instance }})
+          description: "Too many connections (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+      - alert: MongoDBNotWritable
+        # "== bool 1" maps each member to 1 (PRIMARY) or 0, so the sum is the
+        # primary count; a plain "== 1" filter could never add up to 0.
+        expr: "sum(mongodb_mongod_replset_my_state == bool 1) == 0"
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: MongoDB is not writable, no node is primary
+          description: "MongoDB is not writable, no node is primary"
+
+      - alert: MongoDBOneNodeLossAwayFromNotWitable
+        # NOTE(review): this adds up raw state codes, so it matches only when they
+        # total 1 (e.g. a lone PRIMARY); confirm a quorum-based check is not needed.
+        expr: "sum(mongodb_mongod_replset_my_state) == 1"
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: If MongoDB loses one more node it will not be writable
+          description: "If MongoDB loses one more node it will not be writable"