-
Notifications
You must be signed in to change notification settings - Fork 103
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
krkn-lib prometheus client ready + kube_burner removed
removed certs config file restored
- Loading branch information
1 parent
8001d2f
commit 2575cf1
Showing
11 changed files
with
138 additions
and
206 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
# etcd | ||
|
||
- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[10m:]) > 0.01 | ||
description: 10 minutes avg. 99th etcd fsync latency on {{$labels.pod}} higher than 10ms. {{$value}}s | ||
severity: warning | ||
|
||
- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[10m:]) > 1 | ||
description: 10 minutes avg. 99th etcd fsync latency on {{$labels.pod}} higher than 1s. {{$value}}s | ||
severity: error | ||
|
||
- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m]))[10m:]) > 0.007 | ||
description: 10 minutes avg. 99th etcd commit latency on {{$labels.pod}} higher than 7ms. {{$value}}s | ||
severity: warning | ||
|
||
- expr: rate(etcd_server_leader_changes_seen_total[2m]) > 0 | ||
description: etcd leader changes observed | ||
severity: warning | ||
|
||
- expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95 | ||
description: etcd cluster database is running full. | ||
severity: critical | ||
|
||
- expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5 | ||
description: etcd database size in use is less than 50% of the actual allocated storage. | ||
severity: warning | ||
|
||
- expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 | ||
description: etcd cluster has high number of proposal failures. | ||
severity: warning | ||
|
||
- expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.15 | ||
description: etcd cluster member communication is slow. | ||
severity: warning | ||
|
||
- expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type)) > 0.15 | ||
description: etcd grpc requests are slow. | ||
severity: critical | ||
|
||
- expr: 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code) / sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) > 5 | ||
description: etcd cluster has high number of failed grpc requests. | ||
severity: critical | ||
|
||
- expr: etcd_server_has_leader{job=~".*etcd.*"} == 0 | ||
description: etcd cluster has no leader. | ||
severity: warning | ||
|
||
- expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2) | ||
description: etcd cluster has insufficient number of members. | ||
severity: warning | ||
|
||
- expr: max without (endpoint) ( sum without (instance) (up{job=~".*etcd.*"} == bool 0) or count without (To) ( sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01 )) > 0 | ||
description: etcd cluster members are down. | ||
severity: warning | ||
|
||
# API server | ||
- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"POST|PUT|DELETE|PATCH", subresource!~"log|exec|portforward|attach|proxy"}[2m])) by (le, resource, verb))[10m:]) > 1 | ||
description: 10 minutes avg. 99th mutating API call latency for {{$labels.verb}}/{{$labels.resource}} higher than 1 second. {{$value}}s | ||
severity: error | ||
|
||
- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="resource"}[2m])) by (le, resource, verb, scope))[5m:]) > 1 | ||
description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 1 second. {{$value}}s | ||
severity: error | ||
|
||
- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="namespace"}[2m])) by (le, resource, verb, scope))[5m:]) > 5 | ||
description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 5 seconds. {{$value}}s | ||
severity: error | ||
|
||
- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="cluster"}[2m])) by (le, resource, verb, scope))[5m:]) > 30 | ||
description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 30 seconds. {{$value}}s | ||
severity: error | ||
|
||
# Control plane pods | ||
|
||
- expr: up{job=~"crio|kubelet"} == 0 | ||
description: "{{$labels.node}}/{{$labels.job}} down" | ||
severity: warning | ||
|
||
- expr: up{job="ovnkube-node"} == 0 | ||
description: "{{$labels.instance}}/{{$labels.pod}} {{$labels.job}} down" | ||
severity: warning | ||
|
||
# Service sync latency | ||
- expr: histogram_quantile(0.99, sum(rate(kubeproxy_network_programming_duration_seconds_bucket[2m])) by (le)) > 10 | ||
description: 99th Kubeproxy network programming latency higher than 10 seconds. {{$value}}s | ||
severity: warning | ||
|
||
# Prometheus alerts | ||
- expr: ALERTS{severity="critical", alertstate="firing"} > 0 | ||
description: Critical prometheus alert. {{$labels.alertname}} | ||
severity: warning |
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from .client import * |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,52 +1,30 @@ | ||
import datetime | ||
import os.path | ||
import urllib3 | ||
import logging | ||
import prometheus_api_client | ||
import sys | ||
import kraken.invoke.command as runcommand | ||
|
||
import yaml | ||
from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus | ||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) | ||
def alerts(prom_cli: KrknPrometheus, start_time, end_time, alert_profile): | ||
|
||
# Initialize the client | ||
def initialize_prom_client(distribution, prometheus_url, prometheus_bearer_token): | ||
global prom_cli | ||
prometheus_url, prometheus_bearer_token = instance(distribution, prometheus_url, prometheus_bearer_token) | ||
if prometheus_url and prometheus_bearer_token: | ||
bearer = "Bearer " + prometheus_bearer_token | ||
headers = {"Authorization": bearer} | ||
try: | ||
prom_cli = prometheus_api_client.PrometheusConnect(url=prometheus_url, headers=headers, disable_ssl=True) | ||
except Exception as e: | ||
logging.error("Not able to initialize the client %s" % e) | ||
sys.exit(1) | ||
else: | ||
prom_cli = None | ||
|
||
if alert_profile is None or os.path.exists(alert_profile) is False: | ||
logging.error(f"{alert_profile} alert profile does not exist") | ||
sys.exit(1) | ||
|
||
# Process custom prometheus query | ||
def process_prom_query(query): | ||
if prom_cli: | ||
try: | ||
return prom_cli.custom_query(query=query, params=None) | ||
except Exception as e: | ||
logging.error("Failed to get the metrics: %s" % e) | ||
with open(alert_profile) as profile: | ||
profile_yaml = yaml.safe_load(profile) | ||
if not isinstance(profile_yaml, list): | ||
logging.error(f"{alert_profile} wrong file format, alert profile must be " | ||
f"a valid yaml file containing a list of items with 3 properties: " | ||
f"expr, description, severity" ) | ||
sys.exit(1) | ||
else: | ||
logging.info("Skipping the prometheus query as the prometheus client couldn't " "be initialized\n") | ||
|
||
# Get prometheus details | ||
def instance(distribution, prometheus_url, prometheus_bearer_token): | ||
if distribution == "openshift" and not prometheus_url: | ||
url = runcommand.invoke( | ||
r"""oc get routes -n openshift-monitoring -o=jsonpath='{.items[?(@.metadata.name=="prometheus-k8s")].spec.host}'""" # noqa | ||
) | ||
prometheus_url = "https://" + url | ||
if distribution == "openshift" and not prometheus_bearer_token: | ||
prometheus_bearer_token = runcommand.invoke( | ||
"oc create token -n openshift-monitoring prometheus-k8s --duration=12h " | ||
"|| oc -n openshift-monitoring sa get-token prometheus-k8s " | ||
"|| oc sa new-token -n openshift-monitoring prometheus-k8s" | ||
) | ||
return prometheus_url, prometheus_bearer_token | ||
|
||
for alert in profile_yaml: | ||
if list(alert.keys()).sort() != ["expr", "description", "severity"].sort(): | ||
logging.error(f"wrong alert {alert}, skipping") | ||
|
||
def scrape_metrics() | ||
prom_cli.process_alert(alert, | ||
datetime.datetime.fromtimestamp(start_time), | ||
datetime.datetime.fromtimestamp(end_time)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.