From 2575cf127f701a42b37c54e7ccf54001cb9ef2dd Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Thu, 16 Nov 2023 16:47:46 +0100 Subject: [PATCH] krkn-lib prometheus client ready + kube_burner removed removed certs config file restored --- config/alerts.yaml | 90 ++++++++++++++++++ config/{alerts => alerts_openshift.yaml} | 0 config/config.yaml | 5 +- kraken/kube_burner/__init__.py | 0 kraken/kube_burner/client.py | 116 ----------------------- kraken/prometheus/__init__.py | 1 + kraken/prometheus/client.py | 62 ++++-------- requirements.txt | 2 +- run_kraken.py | 50 ++++------ scenarios/arcaflow/cpu-hog/config.yaml | 6 +- scenarios/arcaflow/cpu-hog/input.yaml | 12 +-- 11 files changed, 138 insertions(+), 206 deletions(-) create mode 100644 config/alerts.yaml rename config/{alerts => alerts_openshift.yaml} (100%) delete mode 100644 kraken/kube_burner/__init__.py delete mode 100644 kraken/kube_burner/client.py diff --git a/config/alerts.yaml b/config/alerts.yaml new file mode 100644 index 000000000..7dfc912d5 --- /dev/null +++ b/config/alerts.yaml @@ -0,0 +1,90 @@ +# etcd + +- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[10m:]) > 0.01 + description: 10 minutes avg. 99th etcd fsync latency on {{$labels.pod}} higher than 10ms. {{$value}}s + severity: warning + +- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[10m:]) > 1 + description: 10 minutes avg. 99th etcd fsync latency on {{$labels.pod}} higher than 1s. {{$value}}s + severity: error + +- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m]))[10m:]) > 0.007 + description: 10 minutes avg. 99th etcd commit latency on {{$labels.pod}} higher than 7ms. 
{{$value}}s + severity: warning + +- expr: rate(etcd_server_leader_changes_seen_total[2m]) > 0 + description: etcd leader changes observed + severity: warning + +- expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95 + description: etcd cluster database is running full. + severity: critical + +- expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5 + description: etcd database size in use is less than 50% of the actual allocated storage. + severity: warning + +- expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 + description: etcd cluster has high number of proposal failures. + severity: warning + +- expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.15 + description: etcd cluster member communication is slow. + severity: warning + +- expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type)) > 0.15 + description: etcd grpc requests are slow. + severity: critical + +- expr: 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code) / sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) > 5 + description: etcd cluster has high number of failed grpc requests. + severity: critical + +- expr: etcd_server_has_leader{job=~".*etcd.*"} == 0 + description: etcd cluster has no leader. + severity: warning + +- expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2) + description: etcd cluster has insufficient number of members. 
+ severity: warning + +- expr: max without (endpoint) ( sum without (instance) (up{job=~".*etcd.*"} == bool 0) or count without (To) ( sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01 )) > 0 + description: etcd cluster members are down. + severity: warning + +# API server +- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"POST|PUT|DELETE|PATCH", subresource!~"log|exec|portforward|attach|proxy"}[2m])) by (le, resource, verb))[10m:]) > 1 + description: 10 minutes avg. 99th mutating API call latency for {{$labels.verb}}/{{$labels.resource}} higher than 1 second. {{$value}}s + severity: error + +- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="resource"}[2m])) by (le, resource, verb, scope))[5m:]) > 1 + description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 1 second. {{$value}}s + severity: error + +- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="namespace"}[2m])) by (le, resource, verb, scope))[5m:]) > 5 + description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 5 seconds. {{$value}}s + severity: error + +- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="cluster"}[2m])) by (le, resource, verb, scope))[5m:]) > 30 + description: 5 minutes avg. 
99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 30 seconds. {{$value}}s + severity: error + +# Control plane pods + +- expr: up{job=~"crio|kubelet"} == 0 + description: "{{$labels.node}}/{{$labels.job}} down" + severity: warning + +- expr: up{job="ovnkube-node"} == 0 + description: "{{$labels.instance}}/{{$labels.pod}} {{$labels.job}} down" + severity: warning + +# Service sync latency +- expr: histogram_quantile(0.99, sum(rate(kubeproxy_network_programming_duration_seconds_bucket[2m])) by (le)) > 10 + description: 99th Kubeproxy network programming latency higher than 10 seconds. {{$value}}s + severity: warning + +# Prometheus alerts +- expr: ALERTS{severity="critical", alertstate="firing"} > 0 + description: Critical prometheus alert. {{$labels.alertname}} + severity: warning diff --git a/config/alerts b/config/alerts_openshift.yaml similarity index 100% rename from config/alerts rename to config/alerts_openshift.yaml diff --git a/config/config.yaml b/config/config.yaml index 53406636f..e1c2dacf3 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -59,7 +59,7 @@ performance_monitoring: prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus. 
uuid: # uuid for the run is generated by default if not set enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error - alert_profile: config/alerts # Path or URL to alert profile with the prometheus queries + alert_profile: config/alerts.yaml # Path or URL to alert profile with the prometheus queries check_critical_alerts: False # When enabled will check prometheus for critical alerts firing post chaos tunings: wait_duration: 60 # Duration to wait between each chaos scenario @@ -87,9 +87,6 @@ telemetry: - "(\\w{3}\\s\\d{1,2}\\s\\d{2}:\\d{2}:\\d{2}\\.\\d+).+" # Sep 9 11:20:36.123425532 - "kinit (\\d+/\\d+/\\d+\\s\\d{2}:\\d{2}:\\d{2})\\s+" # kinit 2023/09/15 11:20:36 log - "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+" # 2023-09-15T11:20:36.123425532Z log - - - logs_filter_patterns: [ "(\\w{3}\\s\\d{1,2}\\s\\d{2}:\\d{2}:\\d{2}\\.\\d+).+","kinit (\\d+/\\d+/\\d+\\s\\d{2}:\\d{2}:\\d{2})\\s+","(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+" ] oc_cli_path: /usr/bin/oc # optional, if not specified will be search in $PATH events_backup: True # enables/disables cluster events collection diff --git a/kraken/kube_burner/__init__.py b/kraken/kube_burner/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/kraken/kube_burner/client.py b/kraken/kube_burner/client.py deleted file mode 100644 index 2529a34aa..000000000 --- a/kraken/kube_burner/client.py +++ /dev/null @@ -1,116 +0,0 @@ -import subprocess -import logging -import urllib.request -import shutil -import sys -import requests -import tempfile -import kraken.prometheus.client as prometheus -from urllib.parse import urlparse - - -def setup(url): - """ - Downloads and unpacks kube-burner binary - """ - - filename = "kube_burner.tar" - try: - logging.info("Fetching kube-burner binary") - urllib.request.urlretrieve(url, filename) - except Exception as e: - logging.error("Failed to download kube-burner binary located at %s" 
% url, e) - sys.exit(1) - try: - logging.info("Unpacking kube-burner tar ball") - shutil.unpack_archive(filename) - except Exception as e: - logging.error("Failed to unpack the kube-burner binary tarball: %s" % e) - sys.exit(1) - - -def scrape_metrics( - distribution, uuid, prometheus_url, prometheus_bearer_token, start_time, end_time, config_path, metrics_profile -): - """ - Scrapes metrics defined in the profile from Prometheus and indexes them into Elasticsearch - """ - - if not prometheus_url: - if distribution == "openshift": - logging.info("Looks like prometheus_url is not defined, trying to use the default instance on the cluster") - prometheus_url, prometheus_bearer_token = prometheus.instance( - distribution, prometheus_url, prometheus_bearer_token - ) - else: - logging.error("Looks like prometheus url is not defined, exiting") - sys.exit(1) - command = ( - "./kube-burner index --uuid " - + str(uuid) - + " -u " - + str(prometheus_url) - + " -t " - + str(prometheus_bearer_token) - + " -m " - + str(metrics_profile) - + " --start " - + str(start_time) - + " --end " - + str(end_time) - + " -c " - + str(config_path) - ) - try: - logging.info("Running kube-burner to capture the metrics: %s" % command) - logging.info("UUID for the run: %s" % uuid) - subprocess.run(command, shell=True, universal_newlines=True) - except Exception as e: - logging.error("Failed to run kube-burner, error: %s" % (e)) - sys.exit(1) - - -def alerts(distribution, prometheus_url, prometheus_bearer_token, start_time, end_time, alert_profile): - """ - Scrapes metrics defined in the profile from Prometheus and alerts based on the severity defined - """ - - is_url = urlparse(alert_profile) - if is_url.scheme and is_url.netloc: - response = requests.get(alert_profile) - temp_alerts = tempfile.NamedTemporaryFile() - temp_alerts.write(response.content) - temp_alerts.flush() - alert_profile = temp_alerts.name - - if not prometheus_url: - if distribution == "openshift": - logging.info("Looks like 
prometheus_url is not defined, trying to use the default instance on the cluster") - prometheus_url, prometheus_bearer_token = prometheus.instance( - distribution, prometheus_url, prometheus_bearer_token - ) - else: - logging.error("Looks like prometheus url is not defined, exiting") - sys.exit(1) - command = ( - "./kube-burner check-alerts " - + " -u " - + str(prometheus_url) - + " -t " - + str(prometheus_bearer_token) - + " -a " - + str(alert_profile) - + " --start " - + str(start_time) - + " --end " - + str(end_time) - ) - try: - logging.info("Running kube-burner to capture the metrics: %s" % command) - output = subprocess.run(command, shell=True, universal_newlines=True) - if output.returncode != 0: - logging.error("command exited with a non-zero rc, please check the logs for errors or critical alerts") - sys.exit(output.returncode) - except Exception as e: - logging.error("Failed to run kube-burner, error: %s" % (e)) - sys.exit(1) diff --git a/kraken/prometheus/__init__.py b/kraken/prometheus/__init__.py index e69de29bb..4c51a3216 100644 --- a/kraken/prometheus/__init__.py +++ b/kraken/prometheus/__init__.py @@ -0,0 +1 @@ +from .client import * \ No newline at end of file diff --git a/kraken/prometheus/client.py b/kraken/prometheus/client.py index 13ab7a737..5ebab3493 100644 --- a/kraken/prometheus/client.py +++ b/kraken/prometheus/client.py @@ -1,52 +1,30 @@ +import datetime +import os.path import urllib3 import logging -import prometheus_api_client import sys -import kraken.invoke.command as runcommand +import yaml +from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) +def alerts(prom_cli: KrknPrometheus, start_time, end_time, alert_profile): -# Initialize the client -def initialize_prom_client(distribution, prometheus_url, prometheus_bearer_token): - global prom_cli - prometheus_url, prometheus_bearer_token = instance(distribution, prometheus_url, prometheus_bearer_token) - if 
prometheus_url and prometheus_bearer_token: - bearer = "Bearer " + prometheus_bearer_token - headers = {"Authorization": bearer} - try: - prom_cli = prometheus_api_client.PrometheusConnect(url=prometheus_url, headers=headers, disable_ssl=True) - except Exception as e: - logging.error("Not able to initialize the client %s" % e) - sys.exit(1) - else: - prom_cli = None - + if alert_profile is None or os.path.exists(alert_profile) is False: + logging.error(f"{alert_profile} alert profile does not exist") + sys.exit(1) -# Process custom prometheus query -def process_prom_query(query): - if prom_cli: - try: - return prom_cli.custom_query(query=query, params=None) - except Exception as e: - logging.error("Failed to get the metrics: %s" % e) + with open(alert_profile) as profile: + profile_yaml = yaml.safe_load(profile) + if not isinstance(profile_yaml, list): + logging.error(f"{alert_profile} wrong file format, alert profile must be " + f"a valid yaml file containing a list of items with 3 properties: " + f"expr, description, severity" ) sys.exit(1) - else: - logging.info("Skipping the prometheus query as the prometheus client couldn't " "be initialized\n") - -# Get prometheus details -def instance(distribution, prometheus_url, prometheus_bearer_token): - if distribution == "openshift" and not prometheus_url: - url = runcommand.invoke( - r"""oc get routes -n openshift-monitoring -o=jsonpath='{.items[?(@.metadata.name=="prometheus-k8s")].spec.host}'""" # noqa - ) - prometheus_url = "https://" + url - if distribution == "openshift" and not prometheus_bearer_token: - prometheus_bearer_token = runcommand.invoke( - "oc create token -n openshift-monitoring prometheus-k8s --duration=12h " - "|| oc -n openshift-monitoring sa get-token prometheus-k8s " - "|| oc sa new-token -n openshift-monitoring prometheus-k8s" - ) - return prometheus_url, prometheus_bearer_token + for alert in profile_yaml: + if list(alert.keys()).sort() != ["expr", "description", "severity"].sort(): + 
logging.error(f"wrong alert {alert}, skipping") -def scrape_metrics() \ No newline at end of file + prom_cli.process_alert(alert, + datetime.datetime.fromtimestamp(start_time), + datetime.datetime.fromtimestamp(end_time)) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 319e349da..effa72b1c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,7 +19,7 @@ ibm_cloud_sdk_core ibm_vpc itsdangerous==2.0.1 jinja2==3.0.3 -krkn-lib@git+https://github.com/redhat-chaos/krkn-lib.git@prometheus +krkn-lib@git+https://github.com/redhat-chaos/krkn-lib.git@time_interval kubernetes lxml >= 4.3.0 oauth2client>=4.1.3 diff --git a/run_kraken.py b/run_kraken.py index 9eb33c1cc..7dcd27164 100644 --- a/run_kraken.py +++ b/run_kraken.py @@ -16,15 +16,14 @@ import kraken.shut_down.common_shut_down_func as shut_down import kraken.node_actions.run as nodeaction import kraken.managedcluster_scenarios.run as managedcluster_scenarios -import kraken.kube_burner.client as kube_burner import kraken.zone_outage.actions as zone_outages import kraken.application_outage.actions as application_outage import kraken.pvc.pvc_scenario as pvc_scenario import kraken.network_chaos.actions as network_chaos import kraken.arcaflow_plugin as arcaflow_plugin +import kraken.prometheus as prometheus_plugin import server as server from kraken import plugins - from krkn_lib.k8s import KrknKubernetes from krkn_lib.ocp import KrknOpenshift from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes @@ -147,10 +146,7 @@ def main(cfg): except: kubecli.initialize_clients(None) - # KrknTelemetry init - telemetry_k8s = KrknTelemetryKubernetes(safe_logger, kubecli) - telemetry_ocp = KrknTelemetryOpenshift(safe_logger, ocpcli) - prometheus = KrknPrometheus(prometheus_url, prometheus_bearer_token) + # find node kraken might be running on kubecli.find_kraken_node() @@ -179,11 +175,20 @@ def main(cfg): cv = "" if config["kraken"]["distribution"] == "openshift": cv = 
ocpcli.get_clusterversion_string() + if prometheus_url is None: + connection_data = ocpcli.get_prometheus_api_connection_data() + prometheus_url = connection_data.endpoint + prometheus_bearer_token = connection_data.token if cv != "": logging.info(cv) else: logging.info("Cluster version CRD not detected, skipping") + # KrknTelemetry init + telemetry_k8s = KrknTelemetryKubernetes(safe_logger, kubecli) + telemetry_ocp = KrknTelemetryOpenshift(safe_logger, ocpcli) + prometheus = KrknPrometheus(prometheus_url, prometheus_bearer_token) + logging.info("Server URL: %s" % kubecli.get_host()) # Deploy performance dashboards @@ -375,10 +380,10 @@ def main(cfg): # if platform is openshift will be collected # Cloud platform and network plugins metadata # through OCP specific APIs - if config["kraken"]["distribution"] == "openshift": - telemetry_ocp.collect_cluster_metadata(chaos_telemetry) - else: - telemetry_k8s.collect_cluster_metadata(chaos_telemetry) + # if config["kraken"]["distribution"] == "openshift": + # telemetry_ocp.collect_cluster_metadata(chaos_telemetry) + # else: + # telemetry_k8s.collect_cluster_metadata(chaos_telemetry) decoded_chaos_run_telemetry = ChaosRunTelemetry(json.loads(chaos_telemetry.to_json())) logging.info(f"Telemetry data:\n{decoded_chaos_run_telemetry.to_json()}") @@ -402,34 +407,13 @@ def main(cfg): else: logging.info("telemetry collection disabled, skipping.") - # Capture the end time - - - # Capture metrics for the run - if capture_metrics: - if config - logging.info("Capturing metrics") - kube_burner.setup(kube_burner_url) - kube_burner.scrape_metrics( - distribution, - run_uuid, - prometheus_url, - prometheus_bearer_token, - start_time, - end_time, - config_path, - metrics_profile, - ) # Check for the alerts specified if enable_alerts: logging.info("Alerts checking is enabled") - kube_burner.setup(kube_burner_url) if alert_profile: - kube_burner.alerts( - distribution, - prometheus_url, - prometheus_bearer_token, + prometheus_plugin.alerts( 
+ prometheus, start_time, end_time, alert_profile, diff --git a/scenarios/arcaflow/cpu-hog/config.yaml b/scenarios/arcaflow/cpu-hog/config.yaml index a03beb4c5..e6bcce968 100644 --- a/scenarios/arcaflow/cpu-hog/config.yaml +++ b/scenarios/arcaflow/cpu-hog/config.yaml @@ -1,6 +1,10 @@ --- deployer: - connection: {} + connection: + cacert: '' + cert: '' + host: https://api.tsebasti-lab.aws.rhperfscale.org:6443 + key: '' type: kubernetes log: level: debug diff --git a/scenarios/arcaflow/cpu-hog/input.yaml b/scenarios/arcaflow/cpu-hog/input.yaml index 3bcbece9f..2e3591721 100644 --- a/scenarios/arcaflow/cpu-hog/input.yaml +++ b/scenarios/arcaflow/cpu-hog/input.yaml @@ -2,13 +2,7 @@ input_list: - cpu_count: 1 cpu_load_percentage: 80 cpu_method: all - duration: 30s - node_selector: {} - # node selector example - # node_selector: - # kubernetes.io/hostname: master - kubeconfig: "" + duration: 1s + kubeconfig: '' namespace: default - -# duplicate this section to run simultaneous stressors in the same run - + node_selector: {}