Accept list of namespaces in chaos recommender
Signed-off-by: jtydlack <[email protected]>
jtydlack authored and chaitanyaenr committed Apr 10, 2024
1 parent 54af2fc commit 804d7cb
Showing 5 changed files with 137 additions and 73 deletions.
2 changes: 1 addition & 1 deletion config/recommender_config.yaml
@@ -1,5 +1,5 @@
 application: openshift-etcd
-namespace: openshift-etcd
+namespaces: openshift-etcd
 labels: app=openshift-etcd
 kubeconfig: ~/.kube/config.yaml
 prometheus_endpoint: <Prometheus_Endpoint>
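
For reference, the renamed `namespaces` key can now carry more than one namespace. A minimal sketch (the second namespace and the endpoint value are illustrative, not part of this commit); per the `re.split` added in `read_configuration` below, entries may be separated by commas, spaces, or both:

application: openshift-etcd
namespaces: openshift-etcd, openshift-monitoring
labels: app=openshift-etcd
kubeconfig: ~/.kube/config.yaml
prometheus_endpoint: https://prometheus.example.com
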
56 changes: 39 additions & 17 deletions kraken/chaos_recommender/analysis.py
@@ -19,6 +19,7 @@ def load_telemetry_data(file_path):

 def calculate_zscores(data):
     zscores = pd.DataFrame()
+    zscores["Namespace"] = data["namespace"]
     zscores["Service"] = data["service"]
     zscores["CPU"] = (data["CPU"] - data["CPU"].mean()) / data["CPU"].std()
     zscores["Memory"] = (data["MEM"] - data["MEM"].mean()) / data["MEM"].std()
@@ -46,28 +47,49 @@ def get_services_above_heatmap_threshold(dataframe, cpu_threshold, mem_threshold
     return cpu_services, mem_services


-def analysis(file_path, chaos_tests_config, threshold, heatmap_cpu_threshold, heatmap_mem_threshold):
+def analysis(file_path, namespaces, chaos_tests_config, threshold,
+             heatmap_cpu_threshold, heatmap_mem_threshold):
     # Load the telemetry data from file
-    logging.info("Fetching the Telemetry data")
+    logging.info("Fetching the Telemetry data...")
     data = load_telemetry_data(file_path)

     # Calculate Z-scores for CPU, Memory, and Network columns
     zscores = calculate_zscores(data)

-    # Identify outliers
-    logging.info("Identifying outliers")
-    outliers_cpu, outliers_memory, outliers_network = identify_outliers(zscores, threshold)
-    cpu_services, mem_services = get_services_above_heatmap_threshold(data, heatmap_cpu_threshold, heatmap_mem_threshold)
-
-    analysis_data = analysis_json(outliers_cpu, outliers_memory,
-                                  outliers_network, cpu_services,
-                                  mem_services, chaos_tests_config)
-
-    if not cpu_services:
-        logging.info("There are no services that are using significant CPU compared to their assigned limits (infinite in case no limits are set).")
-    if not mem_services:
-        logging.info("There are no services that are using significant MEMORY compared to their assigned limits (infinite in case no limits are set).")
-    time.sleep(2)
+    # Dict for saving analysis data -- key is the namespace
+    analysis_data = {}
+
+    # Identify outliers for each namespace
+    for namespace in namespaces:
+
+        logging.info(f"Identifying outliers for namespace {namespace}...")
+
+        namespace_zscores = zscores.loc[zscores["Namespace"] == namespace]
+        namespace_data = data.loc[data["namespace"] == namespace]
+        outliers_cpu, outliers_memory, outliers_network = identify_outliers(
+            namespace_zscores, threshold)
+        cpu_services, mem_services = get_services_above_heatmap_threshold(
+            namespace_data, heatmap_cpu_threshold, heatmap_mem_threshold)
+
+        analysis_data[namespace] = analysis_json(outliers_cpu, outliers_memory,
+                                                 outliers_network,
+                                                 cpu_services, mem_services,
+                                                 chaos_tests_config)
+
+        if cpu_services:
+            logging.info(f"These services use significant CPU compared to "
+                         f"their assigned limits: {cpu_services}")
+        else:
+            logging.info("There are no services that are using significant "
+                         "CPU compared to their assigned limits "
+                         "(infinite in case no limits are set).")
+        if mem_services:
+            logging.info(f"These services use significant MEMORY compared to "
+                         f"their assigned limits: {mem_services}")
+        else:
+            logging.info("There are no services that are using significant "
+                         "MEMORY compared to their assigned limits "
+                         "(infinite in case no limits are set).")
+        time.sleep(2)

     logging.info("Please check data in utilisation.txt for further analysis")

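
With this change analysis() returns a dict keyed by namespace instead of a single result. Assuming analysis_json keeps packing its three sections in order (the indexes consumed by json_namespace later in this commit), each entry looks roughly like this illustrative sketch:

analysis_data = {
    "openshift-etcd": [
        profiling,         # [0] CPU/memory/network outlier profiling
        heatmap_analysis,  # [1] services above the CPU/MEM heatmap thresholds
        recommendations,   # [2] suggested chaos tests from chaos_tests_config
    ],
}
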
82 changes: 50 additions & 32 deletions kraken/chaos_recommender/prometheus.py
@@ -26,19 +26,26 @@ def convert_data(data, service):
     return result.get(service, '100000000000') # for those pods whose limits are not defined they can take as much resources, there assigning a very high value


-def save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_result, network_data, filename):
-    df_cpu = convert_data_to_dataframe(cpu_data, "CPU")
-    merged_df = pd.DataFrame(columns=['service','CPU','CPU_LIMITS','MEM','MEM_LIMITS','NETWORK'])
-    services = df_cpu.service.unique()
-    logging.info(services)
-
-    for s in services:
-
-        new_row_df = pd.DataFrame( {"service": s, "CPU" : convert_data(cpu_data, s),
-                    "CPU_LIMITS" : convert_data(cpu_limits_result, s),
-                    "MEM" : convert_data(mem_data, s), "MEM_LIMITS" : convert_data(mem_limits_result, s),
-                    "NETWORK" : convert_data(network_data, s)}, index=[0])
-        merged_df = pd.concat([merged_df, new_row_df], ignore_index=True)
+def save_utilization_to_file(utilization, filename):
+    merged_df = pd.DataFrame(columns=['namespace', 'service', 'CPU', 'CPU_LIMITS', 'MEM', 'MEM_LIMITS', 'NETWORK'])
+    for namespace in utilization:
+        # Loading utilization_data[] for namespace
+        # indexes -- 0 CPU, 1 CPU limits, 2 mem, 3 mem limits, 4 network
+        utilization_data = utilization[namespace]
+        df_cpu = convert_data_to_dataframe(utilization_data[0], "CPU")
+        services = df_cpu.service.unique()
+        logging.info(f"Services for namespace {namespace}: {services}")
+
+        for s in services:
+
+            new_row_df = pd.DataFrame({
+                "namespace": namespace, "service": s,
+                "CPU": convert_data(utilization_data[0], s),
+                "CPU_LIMITS": convert_data(utilization_data[1], s),
+                "MEM": convert_data(utilization_data[2], s),
+                "MEM_LIMITS": convert_data(utilization_data[3], s),
+                "NETWORK": convert_data(utilization_data[4], s)}, index=[0])
+            merged_df = pd.concat([merged_df, new_row_df], ignore_index=True)

     # Convert columns to string
     merged_df['CPU'] = merged_df['CPU'].astype(str)
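
The utilisation file (written below with to_csv(..., sep='\t')) therefore gains a leading namespace column. An illustrative excerpt with fabricated values; 100000000000 is the sentinel convert_data() substitutes when a pod defines no limit:

namespace        service  CPU    CPU_LIMITS    MEM        MEM_LIMITS    NETWORK
openshift-etcd   etcd     210.5  100000000000  512000000  100000000000  80402.0
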
@@ -57,38 +64,49 @@ def save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_r
     merged_df.to_csv(filename, sep='\t', index=False)


-def fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, namespace, scrape_duration):
+def fetch_utilization_from_prometheus(prometheus_endpoint, auth_token,
+                                      namespaces, scrape_duration):
     urllib3.disable_warnings()
-    prometheus = PrometheusConnect(url=prometheus_endpoint, headers={'Authorization':'Bearer {}'.format(auth_token)}, disable_ssl=True)
+    prometheus = PrometheusConnect(url=prometheus_endpoint, headers={
+        'Authorization':'Bearer {}'.format(auth_token)}, disable_ssl=True)

-    # Fetch CPU utilization
-    logging.info("Fetching utilization")
-    cpu_query = 'sum (rate (container_cpu_usage_seconds_total{image!="", namespace="%s"}[%s])) by (pod) *1000' % (namespace,scrape_duration)
-    cpu_result = prometheus.custom_query(cpu_query)
+    # Dicts for saving utilisation and queries -- key is namespace
+    utilization = {}
+    queries = {}

-    cpu_limits_query = '(sum by (pod) (kube_pod_container_resource_limits{resource="cpu", namespace="%s"}))*1000' %(namespace)
-    cpu_limits_result = prometheus.custom_query(cpu_limits_query)
+    logging.info("Fetching utilization...")
+    for namespace in namespaces:

-    mem_query = 'sum by (pod) (avg_over_time(container_memory_usage_bytes{image!="", namespace="%s"}[%s]))' % (namespace, scrape_duration)
-    mem_result = prometheus.custom_query(mem_query)
+        # Fetch CPU utilization
+        cpu_query = 'sum (rate (container_cpu_usage_seconds_total{image!="", namespace="%s"}[%s])) by (pod) *1000' % (namespace,scrape_duration)
+        cpu_result = prometheus.custom_query(cpu_query)

-    mem_limits_query = 'sum by (pod) (kube_pod_container_resource_limits{resource="memory", namespace="%s"}) ' %(namespace)
-    mem_limits_result = prometheus.custom_query(mem_limits_query)
+        cpu_limits_query = '(sum by (pod) (kube_pod_container_resource_limits{resource="cpu", namespace="%s"}))*1000' %(namespace)
+        cpu_limits_result = prometheus.custom_query(cpu_limits_query)

-    network_query = 'sum by (pod) ((avg_over_time(container_network_transmit_bytes_total{namespace="%s"}[%s])) + \
-                    (avg_over_time(container_network_receive_bytes_total{namespace="%s"}[%s])))' % (namespace, scrape_duration, namespace, scrape_duration)
-    network_result = prometheus.custom_query(network_query)
+        mem_query = 'sum by (pod) (avg_over_time(container_memory_usage_bytes{image!="", namespace="%s"}[%s]))' % (namespace, scrape_duration)
+        mem_result = prometheus.custom_query(mem_query)

-    save_utilization_to_file(cpu_result, cpu_limits_result, mem_result, mem_limits_result, network_result, saved_metrics_path)
-    queries = json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query)
+        mem_limits_query = 'sum by (pod) (kube_pod_container_resource_limits{resource="memory", namespace="%s"}) ' %(namespace)
+        mem_limits_result = prometheus.custom_query(mem_limits_query)
+
+        network_query = 'sum by (pod) ((avg_over_time(container_network_transmit_bytes_total{namespace="%s"}[%s])) + \
+                        (avg_over_time(container_network_receive_bytes_total{namespace="%s"}[%s])))' % (namespace, scrape_duration, namespace, scrape_duration)
+        network_result = prometheus.custom_query(network_query)
+
+        utilization[namespace] = [cpu_result, cpu_limits_result, mem_result, mem_limits_result, network_result]
+        queries[namespace] = json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query, network_query)
+
+    save_utilization_to_file(utilization, saved_metrics_path)
     return saved_metrics_path, queries


-def json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query):
+def json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query, network_query):
     queries = {
         "cpu_query": cpu_query,
         "cpu_limit_query": cpu_limits_query,
         "memory_query": mem_query,
-        "memory_limit_query": mem_limits_query
+        "memory_limit_query": mem_limits_query,
+        "network_query": network_query
     }
     return queries
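
fetch_utilization_from_prometheus now returns queries keyed by namespace, each value being the dict built by json_queries. A shape sketch (abridged, namespaces illustrative):

queries = {
    "openshift-etcd": {
        "cpu_query": "...",
        "cpu_limit_query": "...",
        "memory_query": "...",
        "memory_limit_query": "...",
        "network_query": "...",
    },
    "openshift-monitoring": {...},
}
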
6 changes: 3 additions & 3 deletions utils/chaos_recommender/README.md
@@ -32,7 +32,7 @@ To run the recommender with a config file specify the config file path with the
 You can customize the default values by editing the `krkn/config/recommender_config.yaml` file. The configuration file contains the following options:
 - `application`: Specify the application name.
-- `namespace`: Specify the namespace name. If you want to profile
+- `namespaces`: Specify the namespace names (separated by comma or space). If you want to profile
 - `labels`: Specify the labels (not used).
 - `kubeconfig`: Specify the location of the kubeconfig file (not used).
 - `prometheus_endpoint`: Specify the prometheus endpoint (must).
@@ -65,8 +65,8 @@ You can also provide the input values through command-line arguments launching t
 -o, --options         Evaluate command line options
 -a APPLICATION, --application APPLICATION
                       Kubernetes application name
--n NAMESPACE, --namespace NAMESPACE
-                      Kubernetes application namespace
+-n NAMESPACES, --namespaces NAMESPACES
+                      Kubernetes application namespaces separated by space
 -l LABELS, --labels LABELS
                       Kubernetes application labels
 -p PROMETHEUS_ENDPOINT, --prometheus-endpoint PROMETHEUS_ENDPOINT
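
As a usage illustration of the flags above (endpoint and token values are placeholders), profiling two namespaces from the command line might look like:

python utils/chaos_recommender/chaos_recommender.py -o \
    -n openshift-etcd openshift-monitoring \
    -p https://prometheus.example.com -t <AUTH_TOKEN>
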
64 changes: 44 additions & 20 deletions utils/chaos_recommender/chaos_recommender.py
@@ -2,6 +2,7 @@
 import json
 import logging
 import os.path
+import re
 import sys
 import time
 import yaml
@@ -23,7 +24,7 @@ def parse_arguments(parser):
     # command line options
     parser.add_argument("-c", "--config-file", action="store", help="Config file path")
     parser.add_argument("-o", "--options", action="store_true", help="Evaluate command line options")
-    parser.add_argument("-n", "--namespace", action="store", default="", help="Kubernetes application namespace")
+    parser.add_argument("-n", "--namespaces", action="store", default="", nargs="+", help="Kubernetes application namespaces separated by space")
     parser.add_argument("-p", "--prometheus-endpoint", action="store", default="", help="Prometheus endpoint URI")
     parser.add_argument("-k", "--kubeconfig", action="store", default=kube_config.KUBE_CONFIG_DEFAULT_LOCATION, help="Kubeconfig path")
     parser.add_argument("-t", "--token", action="store", default="", help="Kubernetes authentication token")
@@ -57,7 +58,8 @@ def read_configuration(config_file_path):
         config = yaml.safe_load(config_file)

     log_level = config.get("log level", "INFO")
-    namespace = config.get("namespace")
+    namespaces = config.get("namespaces")
+    namespaces = re.split(r",+\s+|,+|\s+", namespaces)
     kubeconfig = get_yaml_item_value(config, "kubeconfig", kube_config.KUBE_CONFIG_DEFAULT_LOCATION)

     prometheus_endpoint = config.get("prometheus_endpoint")
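
Reviewer note: the alternation in that regex tries comma-plus-whitespace first, then bare commas, then bare whitespace, so mixed separators all tokenize the same way. A standalone check:

import re

raw = "openshift-etcd,openshift-monitoring, openshift-apiserver openshift-dns"
print(re.split(r",+\s+|,+|\s+", raw))
# ['openshift-etcd', 'openshift-monitoring', 'openshift-apiserver', 'openshift-dns']
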
@@ -72,9 +74,9 @@
     else:
         output_path = False
     chaos_tests = config.get("chaos_tests", {})
-    return (namespace, kubeconfig, prometheus_endpoint, auth_token, scrape_duration,
-            chaos_tests, log_level, threshold, heatmap_cpu_threshold,
-            heatmap_mem_threshold, output_path)
+    return (namespaces, kubeconfig, prometheus_endpoint, auth_token,
+            scrape_duration, chaos_tests, log_level, threshold,
+            heatmap_cpu_threshold, heatmap_mem_threshold, output_path)


 def prompt_input(prompt, default_value):
@@ -84,21 +86,18 @@ def prompt_input(prompt, default_value):
     return default_value


-def make_json_output(inputs, queries, analysis_data, output_path):
+def make_json_output(inputs, namespace_data, output_path):
     time_str = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())

     data = {
         "inputs": inputs,
-        "queries": queries,
-        "profiling": analysis_data[0],
-        "heatmap_analysis": analysis_data[1],
-        "recommendations": analysis_data[2]
+        "analysis_outputs": namespace_data
     }

     logging.info(f"Summary\n{json.dumps(data, indent=4)}")

     if output_path is not False:
-        file = f"recommender_{inputs['namespace']}_{time_str}.json"
+        file = f"recommender_{time_str}.json"
         path = f"{os.path.expanduser(output_path)}/{file}"

         with open(path, "w") as json_output:
@@ -107,9 +106,11 @@ def make_json_output(inputs, queries, analysis_data, output_path):
logging.info(f"Recommendation output saved in {file}.")


def json_inputs(namespace, kubeconfig, prometheus_endpoint, scrape_duration, chaos_tests, threshold, heatmap_cpu_threshold, heatmap_mem_threshold):
def json_inputs(namespaces, kubeconfig, prometheus_endpoint, scrape_duration,
chaos_tests, threshold, heatmap_cpu_threshold,
heatmap_mem_threshold):
inputs = {
"namespace": namespace,
"namespaces": namespaces,
"kubeconfig": kubeconfig,
"prometheus_endpoint": prometheus_endpoint,
"scrape_duration": scrape_duration,
@@ -121,6 +122,17 @@ def json_inputs(namespace, kubeconfig, prometheus_endpoint, scrape_duration, cha
     return inputs


+def json_namespace(namespace, queries, analysis_data):
+    data = {
+        "namespace": namespace,
+        "queries": queries,
+        "profiling": analysis_data[0],
+        "heatmap_analysis": analysis_data[1],
+        "recommendations": analysis_data[2]
+    }
+    return data
+
+
 def main():
     parser = argparse.ArgumentParser(description="Krkn Chaos Recommender Command-Line tool")
     args = parse_arguments(parser)
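
Taken together with make_json_output above, the saved report now nests one json_namespace block per namespace under "analysis_outputs". An abridged, illustrative shape (inputs also carries kubeconfig, chaos_tests, and the thresholds, per json_inputs):

{
    "inputs": {
        "namespaces": ["openshift-etcd", "openshift-monitoring"],
        "prometheus_endpoint": "..."
    },
    "analysis_outputs": [
        {
            "namespace": "openshift-etcd",
            "queries": {"cpu_query": "...", "network_query": "..."},
            "profiling": "...",
            "heatmap_analysis": "...",
            "recommendations": "..."
        }
    ]
}
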
@@ -132,7 +144,7 @@ def main():

     if args.config_file is not None:
         (
-            namespace,
+            namespaces,
             kubeconfig,
             prometheus_endpoint,
             auth_token,
@@ -146,7 +158,7 @@
         ) = read_configuration(args.config_file)

     if args.options:
-        namespace = args.namespace
+        namespaces = args.namespaces
         kubeconfig = args.kubeconfig
         auth_token = args.token
         scrape_duration = args.scrape_duration
@@ -172,14 +184,26 @@
     if not os.path.exists(os.path.expanduser(output_path)):
         logging.error(f"Folder {output_path} for output not found.")
         sys.exit(1)

     logging.info("Loading inputs...")
-    inputs = json_inputs(namespace, kubeconfig, prometheus_endpoint, scrape_duration, chaos_tests, threshold, heatmap_cpu_threshold, heatmap_mem_threshold)
-    logging.info("Starting Analysis ...")
+    inputs = json_inputs(namespaces, kubeconfig, prometheus_endpoint,
+                         scrape_duration, chaos_tests, threshold,
+                         heatmap_cpu_threshold, heatmap_mem_threshold)
+    namespaces_data = []
+
+    logging.info("Starting Analysis...")
+
+    file_path, queries = prometheus.fetch_utilization_from_prometheus(
+        prometheus_endpoint, auth_token, namespaces, scrape_duration)

-    file_path, queries = prometheus.fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, namespace, scrape_duration)
-    analysis_data = analysis(file_path, chaos_tests, threshold, heatmap_cpu_threshold, heatmap_mem_threshold)
+    analysis_data = analysis(file_path, namespaces, chaos_tests, threshold,
+                             heatmap_cpu_threshold, heatmap_mem_threshold)

-    make_json_output(inputs, queries, analysis_data, output_path)
+    for namespace in namespaces:
+        namespace_data = json_namespace(namespace, queries[namespace],
+                                        analysis_data[namespace])
+        namespaces_data.append(namespace_data)
+    make_json_output(inputs, namespaces_data, output_path)


 if __name__ == "__main__":
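
For completeness, the whole flow above can also be driven from a config file via the -c flag defined in parse_arguments (path illustrative):

python utils/chaos_recommender/chaos_recommender.py -c config/recommender_config.yaml
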
