krkn-lib prometheus client ready + kube_burner removed
removed certs

config file restored
tsebastiani committed Nov 16, 2023
1 parent 8001d2f commit 2575cf1
Showing 11 changed files with 138 additions and 206 deletions.
90 changes: 90 additions & 0 deletions config/alerts.yaml
@@ -0,0 +1,90 @@
# etcd

- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[10m:]) > 0.01
description: 10 minutes avg. 99th etcd fsync latency on {{$labels.pod}} higher than 10ms. {{$value}}s
severity: warning

- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[10m:]) > 1
description: 10 minutes avg. 99th etcd fsync latency on {{$labels.pod}} higher than 1s. {{$value}}s
severity: error

- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m]))[10m:]) > 0.007
description: 10 minutes avg. 99th etcd commit latency on {{$labels.pod}} higher than 7ms. {{$value}}s
severity: warning

- expr: rate(etcd_server_leader_changes_seen_total[2m]) > 0
description: etcd leader changes observed
severity: warning

- expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95
description: etcd cluster database is running full.
severity: critical

- expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5
description: etcd database size in use is less than 50% of the actual allocated storage.
severity: warning

- expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
description: etcd cluster has high number of proposal failures.
severity: warning

- expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.15
description: etcd cluster member communication is slow.
severity: warning

- expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type)) > 0.15
description: etcd grpc requests are slow.
severity: critical

- expr: 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code) / sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) > 5
description: etcd cluster has high number of failed grpc requests.
severity: critical

- expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
description: etcd cluster has no leader.
severity: warning

- expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
description: etcd cluster has insufficient number of members.
severity: warning

- expr: max without (endpoint) ( sum without (instance) (up{job=~".*etcd.*"} == bool 0) or count without (To) ( sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01 )) > 0
description: etcd cluster members are down.
severity: warning

# API server
- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"POST|PUT|DELETE|PATCH", subresource!~"log|exec|portforward|attach|proxy"}[2m])) by (le, resource, verb))[10m:]) > 1
description: 10 minutes avg. 99th mutating API call latency for {{$labels.verb}}/{{$labels.resource}} higher than 1 second. {{$value}}s
severity: error

- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="resource"}[2m])) by (le, resource, verb, scope))[5m:]) > 1
description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 1 second. {{$value}}s
severity: error

- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="namespace"}[2m])) by (le, resource, verb, scope))[5m:]) > 5
description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 5 seconds. {{$value}}s
severity: error

- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="cluster"}[2m])) by (le, resource, verb, scope))[5m:]) > 30
description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 30 seconds. {{$value}}s
severity: error

# Control plane pods

- expr: up{job=~"crio|kubelet"} == 0
description: "{{$labels.node}}/{{$labels.job}} down"
severity: warning

- expr: up{job="ovnkube-node"} == 0
description: "{{$labels.instance}}/{{$labels.pod}} {{$labels.job}} down"
severity: warning

# Service sync latency
- expr: histogram_quantile(0.99, sum(rate(kubeproxy_network_programming_duration_seconds_bucket[2m])) by (le)) > 10
description: 99th Kubeproxy network programming latency higher than 10 seconds. {{$value}}s
severity: warning

# Prometheus alerts
- expr: ALERTS{severity="critical", alertstate="firing"} > 0
description: Critical prometheus alert. {{$labels.alertname}}
severity: warning
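
Each entry in this profile is a mapping with exactly three keys — expr (a PromQL expression), description (a message template that may reference {{$labels.*}} and {{$value}}), and severity — and the rewritten prometheus client below expects that shape. A minimal standalone sketch for sanity-checking a custom profile before pointing alert_profile at it (the file path and helper name here are illustrative, not part of this commit):

import sys

import yaml

# Keys every alert entry must carry, matching the schema read by kraken/prometheus/client.py.
REQUIRED_KEYS = {"expr", "description", "severity"}


def check_alert_profile(path: str) -> None:
    # Load the profile and fail fast if any entry deviates from the expected schema.
    with open(path) as profile:
        entries = yaml.safe_load(profile)
    if not isinstance(entries, list):
        sys.exit(f"{path}: alert profile must be a YAML list of alert entries")
    for index, entry in enumerate(entries):
        if not isinstance(entry, dict) or set(entry) != REQUIRED_KEYS:
            sys.exit(f"{path}: entry {index} must have exactly the keys {sorted(REQUIRED_KEYS)}")


if __name__ == "__main__":
    check_alert_profile("config/alerts.yaml")
    print("alert profile looks valid")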
File renamed without changes.
5 changes: 1 addition & 4 deletions config/config.yaml
@@ -59,7 +59,7 @@ performance_monitoring:
    prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
    uuid: # uuid for the run is generated by default if not set
    enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
-    alert_profile: config/alerts # Path or URL to alert profile with the prometheus queries
+    alert_profile: config/alerts.yaml # Path or URL to alert profile with the prometheus queries
    check_critical_alerts: False # When enabled will check prometheus for critical alerts firing post chaos
tunings:
    wait_duration: 60 # Duration to wait between each chaos scenario
@@ -87,9 +87,6 @@ telemetry:
        - "(\\w{3}\\s\\d{1,2}\\s\\d{2}:\\d{2}:\\d{2}\\.\\d+).+" # Sep 9 11:20:36.123425532
        - "kinit (\\d+/\\d+/\\d+\\s\\d{2}:\\d{2}:\\d{2})\\s+" # kinit 2023/09/15 11:20:36 log
        - "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+" # 2023-09-15T11:20:36.123425532Z log
-
-
-    logs_filter_patterns: [ "(\\w{3}\\s\\d{1,2}\\s\\d{2}:\\d{2}:\\d{2}\\.\\d+).+","kinit (\\d+/\\d+/\\d+\\s\\d{2}:\\d{2}:\\d{2})\\s+","(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+" ]
    oc_cli_path: /usr/bin/oc # optional, if not specified will be searched in $PATH
    events_backup: True # enables/disables cluster events collection

Empty file removed kraken/kube_burner/__init__.py
Empty file.
116 changes: 0 additions & 116 deletions kraken/kube_burner/client.py

This file was deleted.

1 change: 1 addition & 0 deletions kraken/prometheus/__init__.py
@@ -0,0 +1 @@
from .client import *
62 changes: 20 additions & 42 deletions kraken/prometheus/client.py
@@ -1,52 +1,30 @@
+import datetime
+import os.path
-import urllib3
import logging
-import prometheus_api_client
import sys
-import kraken.invoke.command as runcommand

+import yaml
+from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
-urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+def alerts(prom_cli: KrknPrometheus, start_time, end_time, alert_profile):

-# Initialize the client
-def initialize_prom_client(distribution, prometheus_url, prometheus_bearer_token):
-    global prom_cli
-    prometheus_url, prometheus_bearer_token = instance(distribution, prometheus_url, prometheus_bearer_token)
-    if prometheus_url and prometheus_bearer_token:
-        bearer = "Bearer " + prometheus_bearer_token
-        headers = {"Authorization": bearer}
-        try:
-            prom_cli = prometheus_api_client.PrometheusConnect(url=prometheus_url, headers=headers, disable_ssl=True)
-        except Exception as e:
-            logging.error("Not able to initialize the client %s" % e)
-            sys.exit(1)
-    else:
-        prom_cli = None

+    if alert_profile is None or os.path.exists(alert_profile) is False:
+        logging.error(f"{alert_profile} alert profile does not exist")
+        sys.exit(1)

-# Process custom prometheus query
-def process_prom_query(query):
-    if prom_cli:
-        try:
-            return prom_cli.custom_query(query=query, params=None)
-        except Exception as e:
-            logging.error("Failed to get the metrics: %s" % e)
+    with open(alert_profile) as profile:
+        profile_yaml = yaml.safe_load(profile)
+        if not isinstance(profile_yaml, list):
+            logging.error(f"{alert_profile} wrong file format, alert profile must be "
+                          f"a valid yaml file containing a list of items with 3 properties: "
+                          f"expr, description, severity" )
+            sys.exit(1)
-    else:
-        logging.info("Skipping the prometheus query as the prometheus client couldn't " "be initialized\n")

-# Get prometheus details
-def instance(distribution, prometheus_url, prometheus_bearer_token):
-    if distribution == "openshift" and not prometheus_url:
-        url = runcommand.invoke(
-            r"""oc get routes -n openshift-monitoring -o=jsonpath='{.items[?(@.metadata.name=="prometheus-k8s")].spec.host}'"""  # noqa
-        )
-        prometheus_url = "https://" + url
-    if distribution == "openshift" and not prometheus_bearer_token:
-        prometheus_bearer_token = runcommand.invoke(
-            "oc create token -n openshift-monitoring prometheus-k8s --duration=12h "
-            "|| oc -n openshift-monitoring sa get-token prometheus-k8s "
-            "|| oc sa new-token -n openshift-monitoring prometheus-k8s"
-        )
-    return prometheus_url, prometheus_bearer_token

+        for alert in profile_yaml:
+            if list(alert.keys()).sort() != ["expr", "description", "severity"].sort():
+                logging.error(f"wrong alert {alert}, skipping")

-def scrape_metrics()
+            prom_cli.process_alert(alert,
+                                   datetime.datetime.fromtimestamp(start_time),
+                                   datetime.datetime.fromtimestamp(end_time))
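
With this change the module no longer builds its own Prometheus connection; callers pass in an already-initialized krkn-lib KrknPrometheus client plus the start/end timestamps of the chaos run. A rough usage sketch follows — the KrknPrometheus constructor arguments (Prometheus URL plus bearer token) and the example endpoint are assumptions, so check krkn-lib for the exact signature:

import time

from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus

import kraken.prometheus as prometheus_plugin

# Assumed constructor arguments: Prometheus endpoint and bearer token (verify against krkn-lib).
prom_cli = KrknPrometheus("https://prometheus-k8s.example.com", "<bearer-token>")

start_time = time.time()
# ... chaos scenarios run here ...
end_time = time.time()

# Evaluates every expr in the alert profile over the run window via krkn-lib's process_alert.
prometheus_plugin.alerts(prom_cli, start_time, end_time, "config/alerts.yaml")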
2 changes: 1 addition & 1 deletion requirements.txt
@@ -19,7 +19,7 @@ ibm_cloud_sdk_core
ibm_vpc
itsdangerous==2.0.1
jinja2==3.0.3
-krkn-lib@git+https://github.com/redhat-chaos/krkn-lib.git@prometheus
+krkn-lib@git+https://github.com/redhat-chaos/krkn-lib.git@time_interval
kubernetes
lxml >= 4.3.0
oauth2client>=4.1.3