From 2575cf127f701a42b37c54e7ccf54001cb9ef2dd Mon Sep 17 00:00:00 2001
From: Tullio Sebastiani <tsebastiani@users.noreply.github.com>
Date: Thu, 16 Nov 2023 16:47:46 +0100
Subject: [PATCH] krkn-lib prometheus client ready + kube_burner removed

removed certs

config file restored
---
 config/alerts.yaml                       |  90 ++++++++++++++++++
 config/{alerts => alerts_openshift.yaml} |   0
 config/config.yaml                       |   5 +-
 kraken/kube_burner/__init__.py           |   0
 kraken/kube_burner/client.py             | 116 -----------------------
 kraken/prometheus/__init__.py            |   1 +
 kraken/prometheus/client.py              |  62 ++++--------
 requirements.txt                         |   2 +-
 run_kraken.py                            |  50 ++++------
 scenarios/arcaflow/cpu-hog/config.yaml   |   6 +-
 scenarios/arcaflow/cpu-hog/input.yaml    |  12 +--
 11 files changed, 138 insertions(+), 206 deletions(-)
 create mode 100644 config/alerts.yaml
 rename config/{alerts => alerts_openshift.yaml} (100%)
 delete mode 100644 kraken/kube_burner/__init__.py
 delete mode 100644 kraken/kube_burner/client.py

diff --git a/config/alerts.yaml b/config/alerts.yaml
new file mode 100644
index 000000000..7dfc912d5
--- /dev/null
+++ b/config/alerts.yaml
@@ -0,0 +1,90 @@
+# etcd
+
+- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[10m:]) > 0.01
+  description: 10 minutes avg. 99th etcd fsync latency on {{$labels.pod}} higher than 10ms. {{$value}}s
+  severity: warning
+
+- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[10m:]) > 1
+  description: 10 minutes avg. 99th etcd fsync latency on {{$labels.pod}} higher than 1s. {{$value}}s
+  severity: error
+
+- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m]))[10m:]) > 0.007
+  description: 10 minutes avg. 99th etcd commit latency on {{$labels.pod}} higher than 7ms. {{$value}}s
+  severity: warning
+
+- expr: rate(etcd_server_leader_changes_seen_total[2m]) > 0
+  description: etcd leader changes observed
+  severity: warning
+
+- expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95
+  description: etcd cluster database is running full. 
+  severity: critical
+  
+- expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5
+  description: etcd database size in use is less than 50% of the actual allocated storage.
+  severity: warning  
+
+- expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
+  description: etcd cluster has high number of proposal failures.
+  severity: warning 
+
+- expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.15
+  description: etcd cluster member communication is slow.
+  severity: warning 
+
+- expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type)) > 0.15
+  description: etcd grpc requests are slow.
+  severity: critical
+      
+- expr: 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code) / sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) > 5
+  description: etcd cluster has high number of failed grpc requests.
+  severity: critical      
+      
+- expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
+  description: etcd cluster has no leader.
+  severity: warning    
+    
+- expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
+  description: etcd cluster has insufficient number of members.
+  severity: warning      
+
+- expr: max without (endpoint) ( sum without (instance) (up{job=~".*etcd.*"} == bool 0) or count without (To) ( sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01 )) > 0
+  description: etcd cluster members are down.
+  severity: warning  
+
+# API server
+- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"POST|PUT|DELETE|PATCH", subresource!~"log|exec|portforward|attach|proxy"}[2m])) by (le, resource, verb))[10m:]) > 1
+  description: 10 minutes avg. 99th mutating API call latency for {{$labels.verb}}/{{$labels.resource}} higher than 1 second. {{$value}}s
+  severity: error
+
+- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="resource"}[2m])) by (le, resource, verb, scope))[5m:]) > 1
+  description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 1 second. {{$value}}s
+  severity: error
+
+- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="namespace"}[2m])) by (le, resource, verb, scope))[5m:]) > 5
+  description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 5 seconds. {{$value}}s
+  severity: error
+
+- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="cluster"}[2m])) by (le, resource, verb, scope))[5m:]) > 30
+  description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 30 seconds. {{$value}}s
+  severity: error
+
+# Control plane pods
+
+- expr: up{job=~"crio|kubelet"} == 0
+  description: "{{$labels.node}}/{{$labels.job}} down"
+  severity: warning
+
+- expr: up{job="ovnkube-node"} == 0
+  description: "{{$labels.instance}}/{{$labels.pod}} {{$labels.job}} down"
+  severity: warning
+
+# Service sync latency
+- expr: histogram_quantile(0.99, sum(rate(kubeproxy_network_programming_duration_seconds_bucket[2m])) by (le)) > 10
+  description: 99th Kubeproxy network programming latency higher than 10 seconds. {{$value}}s 
+  severity: warning
+
+# Prometheus alerts
+- expr: ALERTS{severity="critical", alertstate="firing"} > 0
+  description: Critical prometheus alert. {{$labels.alertname}}
+  severity: warning
diff --git a/config/alerts b/config/alerts_openshift.yaml
similarity index 100%
rename from config/alerts
rename to config/alerts_openshift.yaml
diff --git a/config/config.yaml b/config/config.yaml
index 53406636f..e1c2dacf3 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -59,7 +59,7 @@ performance_monitoring:
     prometheus_bearer_token:                              # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
     uuid:                                                 # uuid for the run is generated by default if not set
     enable_alerts: False                                  # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
-    alert_profile: config/alerts                          # Path or URL to alert profile with the prometheus queries
+    alert_profile: config/alerts.yaml                          # Path or URL to alert profile with the prometheus queries
     check_critical_alerts: False                          # When enabled will check prometheus for critical alerts firing post chaos
 tunings:
     wait_duration: 60                                      # Duration to wait between each chaos scenario
@@ -87,9 +87,6 @@ telemetry:
      - "(\\w{3}\\s\\d{1,2}\\s\\d{2}:\\d{2}:\\d{2}\\.\\d+).+"         # Sep 9 11:20:36.123425532
      - "kinit (\\d+/\\d+/\\d+\\s\\d{2}:\\d{2}:\\d{2})\\s+"          # kinit 2023/09/15 11:20:36 log
      - "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+"      # 2023-09-15T11:20:36.123425532Z log
-
-
-    logs_filter_patterns: [ "(\\w{3}\\s\\d{1,2}\\s\\d{2}:\\d{2}:\\d{2}\\.\\d+).+","kinit (\\d+/\\d+/\\d+\\s\\d{2}:\\d{2}:\\d{2})\\s+","(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+" ]
     oc_cli_path: /usr/bin/oc                                # optional, if not specified will be search in $PATH
     events_backup: True                                     # enables/disables cluster events collection
 
diff --git a/kraken/kube_burner/__init__.py b/kraken/kube_burner/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/kraken/kube_burner/client.py b/kraken/kube_burner/client.py
deleted file mode 100644
index 2529a34aa..000000000
--- a/kraken/kube_burner/client.py
+++ /dev/null
@@ -1,116 +0,0 @@
-import subprocess
-import logging
-import urllib.request
-import shutil
-import sys
-import requests
-import tempfile
-import kraken.prometheus.client as prometheus
-from urllib.parse import urlparse
-
-
-def setup(url):
-    """
-    Downloads and unpacks kube-burner binary
-    """
-
-    filename = "kube_burner.tar"
-    try:
-        logging.info("Fetching kube-burner binary")
-        urllib.request.urlretrieve(url, filename)
-    except Exception as e:
-        logging.error("Failed to download kube-burner binary located at %s" % url, e)
-        sys.exit(1)
-    try:
-        logging.info("Unpacking kube-burner tar ball")
-        shutil.unpack_archive(filename)
-    except Exception as e:
-        logging.error("Failed to unpack the kube-burner binary tarball: %s" % e)
-        sys.exit(1)
-
-
-def scrape_metrics(
-    distribution, uuid, prometheus_url, prometheus_bearer_token, start_time, end_time, config_path, metrics_profile
-):
-    """
-    Scrapes metrics defined in the profile from Prometheus and indexes them into Elasticsearch
-    """
-
-    if not prometheus_url:
-        if distribution == "openshift":
-            logging.info("Looks like prometheus_url is not defined, trying to use the default instance on the cluster")
-            prometheus_url, prometheus_bearer_token = prometheus.instance(
-                distribution, prometheus_url, prometheus_bearer_token
-            )
-        else:
-            logging.error("Looks like prometheus url is not defined, exiting")
-            sys.exit(1)
-    command = (
-        "./kube-burner index --uuid "
-        + str(uuid)
-        + " -u "
-        + str(prometheus_url)
-        + " -t "
-        + str(prometheus_bearer_token)
-        + " -m "
-        + str(metrics_profile)
-        + " --start "
-        + str(start_time)
-        + " --end "
-        + str(end_time)
-        + " -c "
-        + str(config_path)
-    )
-    try:
-        logging.info("Running kube-burner to capture the metrics: %s" % command)
-        logging.info("UUID for the run: %s" % uuid)
-        subprocess.run(command, shell=True, universal_newlines=True)
-    except Exception as e:
-        logging.error("Failed to run kube-burner, error: %s" % (e))
-        sys.exit(1)
-
-
-def alerts(distribution, prometheus_url, prometheus_bearer_token, start_time, end_time, alert_profile):
-    """
-    Scrapes metrics defined in the profile from Prometheus and alerts based on the severity defined
-    """
-
-    is_url = urlparse(alert_profile)
-    if is_url.scheme and is_url.netloc:
-        response = requests.get(alert_profile)
-        temp_alerts = tempfile.NamedTemporaryFile()
-        temp_alerts.write(response.content)
-        temp_alerts.flush()
-        alert_profile = temp_alerts.name
-
-    if not prometheus_url:
-        if distribution == "openshift":
-            logging.info("Looks like prometheus_url is not defined, trying to use the default instance on the cluster")
-            prometheus_url, prometheus_bearer_token = prometheus.instance(
-                distribution, prometheus_url, prometheus_bearer_token
-            )
-        else:
-            logging.error("Looks like prometheus url is not defined, exiting")
-            sys.exit(1)
-    command = (
-        "./kube-burner check-alerts "
-        + " -u "
-        + str(prometheus_url)
-        + " -t "
-        + str(prometheus_bearer_token)
-        + " -a "
-        + str(alert_profile)
-        + " --start "
-        + str(start_time)
-        + " --end "
-        + str(end_time)
-    )
-    try:
-        logging.info("Running kube-burner to capture the metrics: %s" % command)
-        output = subprocess.run(command, shell=True, universal_newlines=True)
-        if output.returncode != 0:
-            logging.error("command exited with a non-zero rc, please check the logs for errors or critical alerts")
-            sys.exit(output.returncode)
-    except Exception as e:
-        logging.error("Failed to run kube-burner, error: %s" % (e))
-        sys.exit(1)
diff --git a/kraken/prometheus/__init__.py b/kraken/prometheus/__init__.py
index e69de29bb..4c51a3216 100644
--- a/kraken/prometheus/__init__.py
+++ b/kraken/prometheus/__init__.py
@@ -0,0 +1 @@
+from .client import *
\ No newline at end of file
diff --git a/kraken/prometheus/client.py b/kraken/prometheus/client.py
index 13ab7a737..5ebab3493 100644
--- a/kraken/prometheus/client.py
+++ b/kraken/prometheus/client.py
@@ -1,52 +1,30 @@
+import datetime
+import os.path
 import urllib3
 import logging
-import prometheus_api_client
 import sys
-import kraken.invoke.command as runcommand
 
+import yaml
+from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+def alerts(prom_cli: KrknPrometheus, start_time, end_time, alert_profile):
+    """Pass each valid alert of the YAML alert_profile to prom_cli.process_alert for the start_time..end_time epoch window."""
 
-# Initialize the client
-def initialize_prom_client(distribution, prometheus_url, prometheus_bearer_token):
-    global prom_cli
-    prometheus_url, prometheus_bearer_token = instance(distribution, prometheus_url, prometheus_bearer_token)
-    if prometheus_url and prometheus_bearer_token:
-        bearer = "Bearer " + prometheus_bearer_token
-        headers = {"Authorization": bearer}
-        try:
-            prom_cli = prometheus_api_client.PrometheusConnect(url=prometheus_url, headers=headers, disable_ssl=True)
-        except Exception as e:
-            logging.error("Not able to initialize the client %s" % e)
-            sys.exit(1)
-    else:
-        prom_cli = None
-
+    if alert_profile is None or os.path.exists(alert_profile) is False:
+        logging.error(f"{alert_profile} alert profile does not exist")
+        sys.exit(1)  # a missing profile is fatal: there is nothing to evaluate
 
-# Process custom prometheus query
-def process_prom_query(query):
-    if prom_cli:
-        try:
-            return prom_cli.custom_query(query=query, params=None)
-        except Exception as e:
-            logging.error("Failed to get the metrics: %s" % e)
+    with open(alert_profile) as profile:
+        profile_yaml = yaml.safe_load(profile)
+        if not isinstance(profile_yaml, list):
+            logging.error(f"{alert_profile} wrong file format, alert profile must be a valid yaml file "
+                          f"containing a list of items with 3 properties: expr, description, severity")
             sys.exit(1)
-    else:
-        logging.info("Skipping the prometheus query as the prometheus client couldn't " "be initialized\n")
-
-# Get prometheus details
-def instance(distribution, prometheus_url, prometheus_bearer_token):
-    if distribution == "openshift" and not prometheus_url:
-        url = runcommand.invoke(
-            r"""oc get routes -n openshift-monitoring -o=jsonpath='{.items[?(@.metadata.name=="prometheus-k8s")].spec.host}'"""  # noqa
-        )
-        prometheus_url = "https://" + url
-    if distribution == "openshift" and not prometheus_bearer_token:
-        prometheus_bearer_token = runcommand.invoke(
-            "oc create token -n openshift-monitoring prometheus-k8s  --duration=12h "
-            "|| oc -n openshift-monitoring sa get-token prometheus-k8s "
-            "|| oc sa new-token -n openshift-monitoring prometheus-k8s"
-        )
-    return prometheus_url, prometheus_bearer_token
 
+        for alert in profile_yaml:
+            if not isinstance(alert, dict) or set(alert) != {"expr", "description", "severity"}:
+                logging.error(f"wrong alert {alert}, skipping")
+                continue  # malformed entries are reported and never passed to process_alert
 
-def scrape_metrics()
\ No newline at end of file
+            prom_cli.process_alert(alert, datetime.datetime.fromtimestamp(start_time),
+                                   datetime.datetime.fromtimestamp(end_time))
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 319e349da..effa72b1c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,7 +19,7 @@ ibm_cloud_sdk_core
 ibm_vpc
 itsdangerous==2.0.1
 jinja2==3.0.3
-krkn-lib@git+https://github.com/redhat-chaos/krkn-lib.git@prometheus
+krkn-lib@git+https://github.com/redhat-chaos/krkn-lib.git@time_interval
 kubernetes
 lxml >= 4.3.0
 oauth2client>=4.1.3
diff --git a/run_kraken.py b/run_kraken.py
index 9eb33c1cc..7dcd27164 100644
--- a/run_kraken.py
+++ b/run_kraken.py
@@ -16,15 +16,14 @@
 import kraken.shut_down.common_shut_down_func as shut_down
 import kraken.node_actions.run as nodeaction
 import kraken.managedcluster_scenarios.run as managedcluster_scenarios
-import kraken.kube_burner.client as kube_burner
 import kraken.zone_outage.actions as zone_outages
 import kraken.application_outage.actions as application_outage
 import kraken.pvc.pvc_scenario as pvc_scenario
 import kraken.network_chaos.actions as network_chaos
 import kraken.arcaflow_plugin as arcaflow_plugin
+import kraken.prometheus as prometheus_plugin
 import server as server
 from kraken import plugins
-
 from krkn_lib.k8s import KrknKubernetes
 from krkn_lib.ocp import KrknOpenshift
 from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
@@ -147,10 +146,7 @@ def main(cfg):
         except:
             kubecli.initialize_clients(None)
 
-        # KrknTelemetry init
-        telemetry_k8s = KrknTelemetryKubernetes(safe_logger, kubecli)
-        telemetry_ocp = KrknTelemetryOpenshift(safe_logger, ocpcli)
-        prometheus = KrknPrometheus(prometheus_url, prometheus_bearer_token)
+
 
         # find node kraken might be running on
         kubecli.find_kraken_node()
@@ -179,11 +175,20 @@ def main(cfg):
         cv = ""
         if config["kraken"]["distribution"] == "openshift":
             cv = ocpcli.get_clusterversion_string()
+            if not prometheus_url:  # covers both an unset (None) and an empty-string URL
+                connection_data = ocpcli.get_prometheus_api_connection_data()
+                prometheus_url = connection_data.endpoint
+                prometheus_bearer_token = connection_data.token
         if cv != "":
             logging.info(cv)
         else:
             logging.info("Cluster version CRD not detected, skipping")
 
+        # KrknTelemetry init
+        telemetry_k8s = KrknTelemetryKubernetes(safe_logger, kubecli)
+        telemetry_ocp = KrknTelemetryOpenshift(safe_logger, ocpcli)
+        prometheus = KrknPrometheus(prometheus_url, prometheus_bearer_token)
+
         logging.info("Server URL: %s" % kubecli.get_host())
 
         # Deploy performance dashboards
@@ -375,10 +380,10 @@ def main(cfg):
         # if platform is openshift will be collected
         # Cloud platform and network plugins metadata
         # through OCP specific APIs
-        if config["kraken"]["distribution"] == "openshift":
-            telemetry_ocp.collect_cluster_metadata(chaos_telemetry)
-        else:
-            telemetry_k8s.collect_cluster_metadata(chaos_telemetry)
+        if config["kraken"]["distribution"] == "openshift":
+            telemetry_ocp.collect_cluster_metadata(chaos_telemetry)
+        else:
+            telemetry_k8s.collect_cluster_metadata(chaos_telemetry)
 
         decoded_chaos_run_telemetry = ChaosRunTelemetry(json.loads(chaos_telemetry.to_json()))
         logging.info(f"Telemetry data:\n{decoded_chaos_run_telemetry.to_json()}")
@@ -402,34 +407,13 @@ def main(cfg):
         else:
             logging.info("telemetry collection disabled, skipping.")
 
-        # Capture the end time
-
-
-        # Capture metrics for the run
-        if capture_metrics:
-            if config
-            logging.info("Capturing metrics")
-            kube_burner.setup(kube_burner_url)
-            kube_burner.scrape_metrics(
-                distribution,
-                run_uuid,
-                prometheus_url,
-                prometheus_bearer_token,
-                start_time,
-                end_time,
-                config_path,
-                metrics_profile,
-            )
 
         # Check for the alerts specified
         if enable_alerts:
             logging.info("Alerts checking is enabled")
-            kube_burner.setup(kube_burner_url)
             if alert_profile:
-                kube_burner.alerts(
-                    distribution,
-                    prometheus_url,
-                    prometheus_bearer_token,
+                prometheus_plugin.alerts(
+                    prometheus,
                     start_time,
                     end_time,
                     alert_profile,
diff --git a/scenarios/arcaflow/cpu-hog/config.yaml b/scenarios/arcaflow/cpu-hog/config.yaml
index a03beb4c5..e6bcce968 100644
--- a/scenarios/arcaflow/cpu-hog/config.yaml
+++ b/scenarios/arcaflow/cpu-hog/config.yaml
@@ -1,6 +1,10 @@
 ---
 deployer:
-  connection: {}
+  connection:
+    cacert: ''
+    cert: ''
+    host: ''  # keep environment-specific API endpoints out of the shared scenario config
+    key: ''
   type: kubernetes
 log:
   level: debug
diff --git a/scenarios/arcaflow/cpu-hog/input.yaml b/scenarios/arcaflow/cpu-hog/input.yaml
index 3bcbece9f..2e3591721 100644
--- a/scenarios/arcaflow/cpu-hog/input.yaml
+++ b/scenarios/arcaflow/cpu-hog/input.yaml
@@ -2,13 +2,7 @@ input_list:
 - cpu_count: 1
   cpu_load_percentage: 80
   cpu_method: all
-  duration: 30s
-  node_selector: {}
-  # node selector example
-  # node_selector:
-  #   kubernetes.io/hostname: master
-  kubeconfig: ""
+  duration: 1s
+  kubeconfig: ''
   namespace: default
-
-# duplicate this section to run simultaneous stressors in the same run
-
+  node_selector: {}