Accept list of namespaces in chaos recommender
Signed-off-by: jtydlack <[email protected]>
jtydlack authored and chaitanyaenr committed Apr 10, 2024
1 parent 54af2fc commit 804d7cb
Showing 5 changed files with 137 additions and 73 deletions.
2 changes: 1 addition & 1 deletion config/recommender_config.yaml
@@ -1,5 +1,5 @@
 application: openshift-etcd
-namespace: openshift-etcd
+namespaces: openshift-etcd
 labels: app=openshift-etcd
 kubeconfig: ~/.kube/config.yaml
 prometheus_endpoint: <Prometheus_Endpoint>
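
For reference, the renamed `namespaces` key can now carry more than one namespace. A minimal sketch (the second namespace and the endpoint value are illustrative, not part of this commit); per the `re.split` added in `read_configuration` below, entries may be separated by commas, spaces, or both:

application: openshift-etcd
namespaces: openshift-etcd, openshift-monitoring
labels: app=openshift-etcd
kubeconfig: ~/.kube/config.yaml
prometheus_endpoint: https://prometheus.example.com
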
56 changes: 39 additions & 17 deletions kraken/chaos_recommender/analysis.py
@@ -19,6 +19,7 @@ def load_telemetry_data(file_path):

 def calculate_zscores(data):
     zscores = pd.DataFrame()
+    zscores["Namespace"] = data["namespace"]
     zscores["Service"] = data["service"]
     zscores["CPU"] = (data["CPU"] - data["CPU"].mean()) / data["CPU"].std()
     zscores["Memory"] = (data["MEM"] - data["MEM"].mean()) / data["MEM"].std()
@@ -46,28 +47,49 @@ def get_services_above_heatmap_threshold(dataframe, cpu_threshold, mem_threshold
     return cpu_services, mem_services


-def analysis(file_path, chaos_tests_config, threshold, heatmap_cpu_threshold, heatmap_mem_threshold):
+def analysis(file_path, namespaces, chaos_tests_config, threshold,
+             heatmap_cpu_threshold, heatmap_mem_threshold):
     # Load the telemetry data from file
-    logging.info("Fetching the Telemetry data")
+    logging.info("Fetching the Telemetry data...")
     data = load_telemetry_data(file_path)

     # Calculate Z-scores for CPU, Memory, and Network columns
     zscores = calculate_zscores(data)

-    # Identify outliers
-    logging.info("Identifying outliers")
-    outliers_cpu, outliers_memory, outliers_network = identify_outliers(zscores, threshold)
-    cpu_services, mem_services = get_services_above_heatmap_threshold(data, heatmap_cpu_threshold, heatmap_mem_threshold)
-
-    analysis_data = analysis_json(outliers_cpu, outliers_memory,
-                                  outliers_network, cpu_services,
-                                  mem_services, chaos_tests_config)
-
-    if not cpu_services:
-        logging.info("There are no services that are using significant CPU compared to their assigned limits (infinite in case no limits are set).")
-    if not mem_services:
-        logging.info("There are no services that are using significant MEMORY compared to their assigned limits (infinite in case no limits are set).")
-    time.sleep(2)
+    # Dict for saving analysis data -- key is the namespace
+    analysis_data = {}
+
+    # Identify outliers for each namespace
+    for namespace in namespaces:
+
+        logging.info(f"Identifying outliers for namespace {namespace}...")
+
+        namespace_zscores = zscores.loc[zscores["Namespace"] == namespace]
+        namespace_data = data.loc[data["namespace"] == namespace]
+        outliers_cpu, outliers_memory, outliers_network = identify_outliers(
+            namespace_zscores, threshold)
+        cpu_services, mem_services = get_services_above_heatmap_threshold(
+            namespace_data, heatmap_cpu_threshold, heatmap_mem_threshold)
+
+        analysis_data[namespace] = analysis_json(outliers_cpu, outliers_memory,
+                                                 outliers_network,
+                                                 cpu_services, mem_services,
+                                                 chaos_tests_config)
+
+        if cpu_services:
+            logging.info(f"These services use significant CPU compared to "
+                         f"their assigned limits: {cpu_services}")
+        else:
+            logging.info("There are no services that are using significant "
+                         "CPU compared to their assigned limits "
+                         "(infinite in case no limits are set).")
+        if mem_services:
+            logging.info(f"These services use significant MEMORY compared to "
+                         f"their assigned limits: {mem_services}")
+        else:
+            logging.info("There are no services that are using significant "
+                         "MEMORY compared to their assigned limits "
+                         "(infinite in case no limits are set).")
+        time.sleep(2)

     logging.info("Please check data in utilisation.txt for further analysis")

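
With this change analysis() returns a dict keyed by namespace instead of a single result. Assuming analysis_json keeps packing its three sections in order (the indexes consumed by json_namespace later in this commit), each entry looks roughly like this illustrative sketch:

analysis_data = {
    "openshift-etcd": [
        profiling,         # [0] CPU/memory/network outlier profiling
        heatmap_analysis,  # [1] services above the CPU/MEM heatmap thresholds
        recommendations,   # [2] suggested chaos tests from chaos_tests_config
    ],
}
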
82 changes: 50 additions & 32 deletions kraken/chaos_recommender/prometheus.py
@@ -26,19 +26,26 @@ def convert_data(data, service):
     return result.get(service, '100000000000') # for those pods whose limits are not defined they can take as much resources, there assigning a very high value


-def save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_result, network_data, filename):
-    df_cpu = convert_data_to_dataframe(cpu_data, "CPU")
-    merged_df = pd.DataFrame(columns=['service','CPU','CPU_LIMITS','MEM','MEM_LIMITS','NETWORK'])
-    services = df_cpu.service.unique()
-    logging.info(services)
-
-    for s in services:
-
-        new_row_df = pd.DataFrame( {"service": s, "CPU" : convert_data(cpu_data, s),
-                    "CPU_LIMITS" : convert_data(cpu_limits_result, s),
-                    "MEM" : convert_data(mem_data, s), "MEM_LIMITS" : convert_data(mem_limits_result, s),
-                    "NETWORK" : convert_data(network_data, s)}, index=[0])
-        merged_df = pd.concat([merged_df, new_row_df], ignore_index=True)
+def save_utilization_to_file(utilization, filename):
+    merged_df = pd.DataFrame(columns=['namespace', 'service', 'CPU', 'CPU_LIMITS', 'MEM', 'MEM_LIMITS', 'NETWORK'])
+    for namespace in utilization:
+        # Loading utilization_data[] for namespace
+        # indexes -- 0 CPU, 1 CPU limits, 2 mem, 3 mem limits, 4 network
+        utilization_data = utilization[namespace]
+        df_cpu = convert_data_to_dataframe(utilization_data[0], "CPU")
+        services = df_cpu.service.unique()
+        logging.info(f"Services for namespace {namespace}: {services}")
+
+        for s in services:
+
+            new_row_df = pd.DataFrame({
+                "namespace": namespace, "service": s,
+                "CPU": convert_data(utilization_data[0], s),
+                "CPU_LIMITS": convert_data(utilization_data[1], s),
+                "MEM": convert_data(utilization_data[2], s),
+                "MEM_LIMITS": convert_data(utilization_data[3], s),
+                "NETWORK": convert_data(utilization_data[4], s)}, index=[0])
+            merged_df = pd.concat([merged_df, new_row_df], ignore_index=True)

     # Convert columns to string
     merged_df['CPU'] = merged_df['CPU'].astype(str)
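
The utilisation file (written below with to_csv(..., sep='\t')) therefore gains a leading namespace column. An illustrative excerpt with fabricated values; 100000000000 is the sentinel convert_data() substitutes when a pod defines no limit:

namespace        service  CPU    CPU_LIMITS    MEM        MEM_LIMITS    NETWORK
openshift-etcd   etcd     210.5  100000000000  512000000  100000000000  80402.0
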
@@ -57,38 +64,49 @@ def save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_r
     merged_df.to_csv(filename, sep='\t', index=False)


-def fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, namespace, scrape_duration):
+def fetch_utilization_from_prometheus(prometheus_endpoint, auth_token,
+                                      namespaces, scrape_duration):
     urllib3.disable_warnings()
-    prometheus = PrometheusConnect(url=prometheus_endpoint, headers={'Authorization':'Bearer {}'.format(auth_token)}, disable_ssl=True)
+    prometheus = PrometheusConnect(url=prometheus_endpoint, headers={
+        'Authorization':'Bearer {}'.format(auth_token)}, disable_ssl=True)

-    # Fetch CPU utilization
-    logging.info("Fetching utilization")
-    cpu_query = 'sum (rate (container_cpu_usage_seconds_total{image!="", namespace="%s"}[%s])) by (pod) *1000' % (namespace,scrape_duration)
-    cpu_result = prometheus.custom_query(cpu_query)
+    # Dicts for saving utilisation and queries -- key is namespace
+    utilization = {}
+    queries = {}

-    cpu_limits_query = '(sum by (pod) (kube_pod_container_resource_limits{resource="cpu", namespace="%s"}))*1000' %(namespace)
-    cpu_limits_result = prometheus.custom_query(cpu_limits_query)
+    logging.info("Fetching utilization...")
+    for namespace in namespaces:

-    mem_query = 'sum by (pod) (avg_over_time(container_memory_usage_bytes{image!="", namespace="%s"}[%s]))' % (namespace, scrape_duration)
-    mem_result = prometheus.custom_query(mem_query)
+        # Fetch CPU utilization
+        cpu_query = 'sum (rate (container_cpu_usage_seconds_total{image!="", namespace="%s"}[%s])) by (pod) *1000' % (namespace,scrape_duration)
+        cpu_result = prometheus.custom_query(cpu_query)

-    mem_limits_query = 'sum by (pod) (kube_pod_container_resource_limits{resource="memory", namespace="%s"}) ' %(namespace)
-    mem_limits_result = prometheus.custom_query(mem_limits_query)
+        cpu_limits_query = '(sum by (pod) (kube_pod_container_resource_limits{resource="cpu", namespace="%s"}))*1000' %(namespace)
+        cpu_limits_result = prometheus.custom_query(cpu_limits_query)

-    network_query = 'sum by (pod) ((avg_over_time(container_network_transmit_bytes_total{namespace="%s"}[%s])) + \
-                    (avg_over_time(container_network_receive_bytes_total{namespace="%s"}[%s])))' % (namespace, scrape_duration, namespace, scrape_duration)
-    network_result = prometheus.custom_query(network_query)
+        mem_query = 'sum by (pod) (avg_over_time(container_memory_usage_bytes{image!="", namespace="%s"}[%s]))' % (namespace, scrape_duration)
+        mem_result = prometheus.custom_query(mem_query)

-    save_utilization_to_file(cpu_result, cpu_limits_result, mem_result, mem_limits_result, network_result, saved_metrics_path)
-    queries = json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query)
+        mem_limits_query = 'sum by (pod) (kube_pod_container_resource_limits{resource="memory", namespace="%s"}) ' %(namespace)
+        mem_limits_result = prometheus.custom_query(mem_limits_query)
+
+        network_query = 'sum by (pod) ((avg_over_time(container_network_transmit_bytes_total{namespace="%s"}[%s])) + \
+                        (avg_over_time(container_network_receive_bytes_total{namespace="%s"}[%s])))' % (namespace, scrape_duration, namespace, scrape_duration)
+        network_result = prometheus.custom_query(network_query)
+
+        utilization[namespace] = [cpu_result, cpu_limits_result, mem_result, mem_limits_result, network_result]
+        queries[namespace] = json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query, network_query)
+
+    save_utilization_to_file(utilization, saved_metrics_path)
     return saved_metrics_path, queries


-def json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query):
+def json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query, network_query):
     queries = {
         "cpu_query": cpu_query,
         "cpu_limit_query": cpu_limits_query,
         "memory_query": mem_query,
-        "memory_limit_query": mem_limits_query
+        "memory_limit_query": mem_limits_query,
+        "network_query": network_query
     }
     return queries
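
fetch_utilization_from_prometheus now returns queries keyed by namespace, each value being the dict built by json_queries. A shape sketch (abridged, namespaces illustrative):

queries = {
    "openshift-etcd": {
        "cpu_query": "...",
        "cpu_limit_query": "...",
        "memory_query": "...",
        "memory_limit_query": "...",
        "network_query": "...",
    },
    "openshift-monitoring": {...},
}
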
6 changes: 3 additions & 3 deletions utils/chaos_recommender/README.md
@@ -32,7 +32,7 @@ To run the recommender with a config file specify the config file path with the
 You can customize the default values by editing the `krkn/config/recommender_config.yaml` file. The configuration file contains the following options:
 - `application`: Specify the application name.
-- `namespace`: Specify the namespace name. If you want to profile
+- `namespaces`: Specify the namespace names (separated by comma or space). If you want to profile
 - `labels`: Specify the labels (not used).
 - `kubeconfig`: Specify the location of the kubeconfig file (not used).
 - `prometheus_endpoint`: Specify the prometheus endpoint (must).
@@ -65,8 +65,8 @@ You can also provide the input values through command-line arguments launching t
 -o, --options         Evaluate command line options
 -a APPLICATION, --application APPLICATION
                       Kubernetes application name
--n NAMESPACE, --namespace NAMESPACE
-                      Kubernetes application namespace
+-n NAMESPACES, --namespaces NAMESPACES
+                      Kubernetes application namespaces separated by space
 -l LABELS, --labels LABELS
                       Kubernetes application labels
 -p PROMETHEUS_ENDPOINT, --prometheus-endpoint PROMETHEUS_ENDPOINT
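
As a usage illustration of the flags above (endpoint and token values are placeholders), profiling two namespaces from the command line might look like:

python utils/chaos_recommender/chaos_recommender.py -o \
    -n openshift-etcd openshift-monitoring \
    -p https://prometheus.example.com -t <AUTH_TOKEN>
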
64 changes: 44 additions & 20 deletions utils/chaos_recommender/chaos_recommender.py
@@ -2,6 +2,7 @@
 import json
 import logging
 import os.path
+import re
 import sys
 import time
 import yaml
@@ -23,7 +24,7 @@ def parse_arguments(parser):
     # command line options
     parser.add_argument("-c", "--config-file", action="store", help="Config file path")
     parser.add_argument("-o", "--options", action="store_true", help="Evaluate command line options")
-    parser.add_argument("-n", "--namespace", action="store", default="", help="Kubernetes application namespace")
+    parser.add_argument("-n", "--namespaces", action="store", default="", nargs="+", help="Kubernetes application namespaces separated by space")
     parser.add_argument("-p", "--prometheus-endpoint", action="store", default="", help="Prometheus endpoint URI")
     parser.add_argument("-k", "--kubeconfig", action="store", default=kube_config.KUBE_CONFIG_DEFAULT_LOCATION, help="Kubeconfig path")
     parser.add_argument("-t", "--token", action="store", default="", help="Kubernetes authentication token")
@@ -57,7 +58,8 @@ def read_configuration(config_file_path):
         config = yaml.safe_load(config_file)

     log_level = config.get("log level", "INFO")
-    namespace = config.get("namespace")
+    namespaces = config.get("namespaces")
+    namespaces = re.split(r",+\s+|,+|\s+", namespaces)
     kubeconfig = get_yaml_item_value(config, "kubeconfig", kube_config.KUBE_CONFIG_DEFAULT_LOCATION)

     prometheus_endpoint = config.get("prometheus_endpoint")
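
Reviewer note: the alternation in that regex tries comma-plus-whitespace first, then bare commas, then bare whitespace, so mixed separators all tokenize the same way. A standalone check:

import re

raw = "openshift-etcd,openshift-monitoring, openshift-apiserver openshift-dns"
print(re.split(r",+\s+|,+|\s+", raw))
# ['openshift-etcd', 'openshift-monitoring', 'openshift-apiserver', 'openshift-dns']
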
@@ -72,9 +74,9 @@
     else:
         output_path = False
     chaos_tests = config.get("chaos_tests", {})
-    return (namespace, kubeconfig, prometheus_endpoint, auth_token, scrape_duration,
-            chaos_tests, log_level, threshold, heatmap_cpu_threshold,
-            heatmap_mem_threshold, output_path)
+    return (namespaces, kubeconfig, prometheus_endpoint, auth_token,
+            scrape_duration, chaos_tests, log_level, threshold,
+            heatmap_cpu_threshold, heatmap_mem_threshold, output_path)


 def prompt_input(prompt, default_value):
@@ -84,21 +86,18 @@ def prompt_input(prompt, default_value):
     return default_value


-def make_json_output(inputs, queries, analysis_data, output_path):
+def make_json_output(inputs, namespace_data, output_path):
     time_str = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())

     data = {
         "inputs": inputs,
-        "queries": queries,
-        "profiling": analysis_data[0],
-        "heatmap_analysis": analysis_data[1],
-        "recommendations": analysis_data[2]
+        "analysis_outputs": namespace_data
     }

     logging.info(f"Summary\n{json.dumps(data, indent=4)}")

     if output_path is not False:
-        file = f"recommender_{inputs['namespace']}_{time_str}.json"
+        file = f"recommender_{time_str}.json"
         path = f"{os.path.expanduser(output_path)}/{file}"

         with open(path, "w") as json_output:
@@ -107,9 +106,11 @@ def make_json_output(inputs, queries, analysis_data, output_path):
logging.info(f"Recommendation output saved in {file}.")


def json_inputs(namespace, kubeconfig, prometheus_endpoint, scrape_duration, chaos_tests, threshold, heatmap_cpu_threshold, heatmap_mem_threshold):
def json_inputs(namespaces, kubeconfig, prometheus_endpoint, scrape_duration,
chaos_tests, threshold, heatmap_cpu_threshold,
heatmap_mem_threshold):
inputs = {
"namespace": namespace,
"namespaces": namespaces,
"kubeconfig": kubeconfig,
"prometheus_endpoint": prometheus_endpoint,
"scrape_duration": scrape_duration,
@@ -121,6 +122,17 @@ def json_inputs(namespace, kubeconfig, prometheus_endpoint, scrape_duration, cha
     return inputs


+def json_namespace(namespace, queries, analysis_data):
+    data = {
+        "namespace": namespace,
+        "queries": queries,
+        "profiling": analysis_data[0],
+        "heatmap_analysis": analysis_data[1],
+        "recommendations": analysis_data[2]
+    }
+    return data
+
+
 def main():
     parser = argparse.ArgumentParser(description="Krkn Chaos Recommender Command-Line tool")
     args = parse_arguments(parser)
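
Taken together with make_json_output above, the saved report now nests one json_namespace block per namespace under "analysis_outputs". An abridged, illustrative shape (inputs also carries kubeconfig, chaos_tests, and the thresholds, per json_inputs):

{
    "inputs": {
        "namespaces": ["openshift-etcd", "openshift-monitoring"],
        "prometheus_endpoint": "..."
    },
    "analysis_outputs": [
        {
            "namespace": "openshift-etcd",
            "queries": {"cpu_query": "...", "network_query": "..."},
            "profiling": "...",
            "heatmap_analysis": "...",
            "recommendations": "..."
        }
    ]
}
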
@@ -132,7 +144,7 @@ def main():

     if args.config_file is not None:
         (
-            namespace,
+            namespaces,
             kubeconfig,
             prometheus_endpoint,
             auth_token,
@@ -146,7 +158,7 @@
         ) = read_configuration(args.config_file)

     if args.options:
-        namespace = args.namespace
+        namespaces = args.namespaces
         kubeconfig = args.kubeconfig
         auth_token = args.token
         scrape_duration = args.scrape_duration
@@ -172,14 +184,26 @@
     if not os.path.exists(os.path.expanduser(output_path)):
         logging.error(f"Folder {output_path} for output not found.")
         sys.exit(1)

     logging.info("Loading inputs...")
-    inputs = json_inputs(namespace, kubeconfig, prometheus_endpoint, scrape_duration, chaos_tests, threshold, heatmap_cpu_threshold, heatmap_mem_threshold)
-    logging.info("Starting Analysis ...")
+    inputs = json_inputs(namespaces, kubeconfig, prometheus_endpoint,
+                         scrape_duration, chaos_tests, threshold,
+                         heatmap_cpu_threshold, heatmap_mem_threshold)
+    namespaces_data = []
+
+    logging.info("Starting Analysis...")
+
+    file_path, queries = prometheus.fetch_utilization_from_prometheus(
+        prometheus_endpoint, auth_token, namespaces, scrape_duration)

-    file_path, queries = prometheus.fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, namespace, scrape_duration)
-    analysis_data = analysis(file_path, chaos_tests, threshold, heatmap_cpu_threshold, heatmap_mem_threshold)
+    analysis_data = analysis(file_path, namespaces, chaos_tests, threshold,
+                             heatmap_cpu_threshold, heatmap_mem_threshold)

-    make_json_output(inputs, queries, analysis_data, output_path)
+    for namespace in namespaces:
+        namespace_data = json_namespace(namespace, queries[namespace],
+                                        analysis_data[namespace])
+        namespaces_data.append(namespace_data)
+    make_json_output(inputs, namespaces_data, output_path)


 if __name__ == "__main__":
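
For completeness, the whole flow above can also be driven from a config file via the -c flag defined in parse_arguments (path illustrative):

python utils/chaos_recommender/chaos_recommender.py -c config/recommender_config.yaml
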
