From 7a966a71d0d2824a8157af26bb5cbeb3775bfd11 Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Tue, 31 Oct 2023 19:31:33 +0100 Subject: [PATCH] krkn integration of telemetry events collection (#523) * function package refactoring in krkn-lib * cluster events collection flag * krkn-lib version bump requirements * dockerfile bump --- config/config.yaml | 1 + containers/Dockerfile | 2 +- containers/Dockerfile-ppc64le | 2 +- kraken/application_outage/actions.py | 5 +++-- kraken/network_chaos/actions.py | 4 ++-- kraken/node_actions/run.py | 2 +- kraken/plugins/__init__.py | 4 ++-- kraken/pod_scenarios/setup.py | 4 ++-- kraken/pvc/pvc_scenario.py | 4 ++-- .../common_service_disruption_functions.py | 4 ++-- kraken/shut_down/common_shut_down_func.py | 3 ++- kraken/time_actions/common_time_functions.py | 5 +++-- kraken/zone_outage/actions.py | 4 ++-- requirements.txt | 2 +- run_kraken.py | 1 + 15 files changed, 26 insertions(+), 21 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index da5868b9c..e26779f8a 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -88,4 +88,5 @@ telemetry: - "kinit (\\d+/\\d+/\\d+\\s\\d{2}:\\d{2}:\\d{2})\\s+" # kinit 2023/09/15 11:20:36 log - "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+" # 2023-09-15T11:20:36.123425532Z log oc_cli_path: /usr/bin/oc # optional, if not specified will be search in $PATH + events_backup: True # enables/disables cluster events collection diff --git a/containers/Dockerfile b/containers/Dockerfile index a561ef7bd..401f76021 100644 --- a/containers/Dockerfile +++ b/containers/Dockerfile @@ -14,7 +14,7 @@ COPY --from=azure-cli /usr/local/bin/az /usr/bin/az # Install dependencies RUN yum install -y git python39 python3-pip jq gettext wget && \ python3.9 -m pip install -U pip && \ - git clone https://github.com/redhat-chaos/krkn.git --branch v1.4.7 /root/kraken && \ + git clone https://github.com/redhat-chaos/krkn.git --branch v1.5.0 /root/kraken && \ mkdir -p /root/.kube && cd /root/kraken && \ pip3.9 install -r requirements.txt && \ pip3.9 install virtualenv && \ diff --git a/containers/Dockerfile-ppc64le b/containers/Dockerfile-ppc64le index ddfef8b69..c630ec35a 100644 --- a/containers/Dockerfile-ppc64le +++ b/containers/Dockerfile-ppc64le @@ -14,7 +14,7 @@ COPY --from=azure-cli /usr/local/bin/az /usr/bin/az # Install dependencies RUN yum install -y git python39 python3-pip jq gettext wget && \ python3.9 -m pip install -U pip && \ - git clone https://github.com/redhat-chaos/krkn.git --branch v1.4.7 /root/kraken && \ + git clone https://github.com/redhat-chaos/krkn.git --branch v1.5.0 /root/kraken && \ mkdir -p /root/.kube && cd /root/kraken && \ pip3.9 install -r requirements.txt && \ pip3.9 install virtualenv && \ diff --git a/kraken/application_outage/actions.py b/kraken/application_outage/actions.py index 5a8f44762..dce725089 100644 --- a/kraken/application_outage/actions.py +++ b/kraken/application_outage/actions.py @@ -6,7 +6,8 @@ import kraken.invoke.command as runcommand from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes from krkn_lib.models.telemetry import ScenarioTelemetry -from krkn_lib.utils.functions import get_yaml_item_value +from krkn_lib.utils.functions import get_yaml_item_value, log_exception + # Reads the scenario config, applies and deletes a network policy to # block the traffic for the specified duration @@ -76,7 +77,7 @@ def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetryKubernete except Exception as e : scenario_telemetry.exitStatus = 1 failed_scenarios.append(app_outage_config) - telemetry.log_exception(app_outage_config) + log_exception(app_outage_config) else: scenario_telemetry.exitStatus = 0 scenario_telemetry.endTimeStamp = time.time() diff --git a/kraken/network_chaos/actions.py b/kraken/network_chaos/actions.py index 069dd0fbd..080be67c1 100644 --- a/kraken/network_chaos/actions.py +++ b/kraken/network_chaos/actions.py @@ -9,7 +9,7 @@ from krkn_lib.k8s import KrknKubernetes from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes from krkn_lib.models.telemetry import ScenarioTelemetry -from krkn_lib.utils.functions import get_yaml_item_value +from krkn_lib.utils.functions import get_yaml_item_value, log_exception # krkn_lib @@ -116,7 +116,7 @@ def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetr except (RuntimeError, Exception): scenario_telemetry.exitStatus = 1 failed_scenarios.append(net_config) - telemetry.log_exception(net_config) + log_exception(net_config) else: scenario_telemetry.exitStatus = 0 scenario_telemetries.append(scenario_telemetry) diff --git a/kraken/node_actions/run.py b/kraken/node_actions/run.py index ad3821336..1eff6a142 100644 --- a/kraken/node_actions/run.py +++ b/kraken/node_actions/run.py @@ -78,7 +78,7 @@ def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetr except (RuntimeError, Exception) as e: scenario_telemetry.exitStatus = 1 failed_scenarios.append(node_scenario_config) - telemetry.log_exception(node_scenario_config) + log_exception(node_scenario_config) else: scenario_telemetry.exitStatus = 0 diff --git a/kraken/plugins/__init__.py b/kraken/plugins/__init__.py index 8f455966e..dce41382e 100644 --- a/kraken/plugins/__init__.py +++ b/kraken/plugins/__init__.py @@ -15,7 +15,7 @@ from kraken.plugins.pod_network_outage.pod_network_outage_plugin import pod_egress_shaping from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes from krkn_lib.models.telemetry import ScenarioTelemetry - +from krkn_lib.utils.functions import log_exception @dataclasses.dataclass @@ -241,7 +241,7 @@ def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_p except Exception as e: scenario_telemetry.exitStatus = 1 failed_post_scenarios.append(scenario) - telemetry.log_exception(scenario) + log_exception(scenario) else: scenario_telemetry.exitStatus = 0 logging.info("Waiting for the specified duration: %s" % (wait_duration)) diff --git a/kraken/pod_scenarios/setup.py b/kraken/pod_scenarios/setup.py index f1ca1edc1..80fc00b97 100644 --- a/kraken/pod_scenarios/setup.py +++ b/kraken/pod_scenarios/setup.py @@ -10,7 +10,7 @@ from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes from krkn_lib.models.telemetry import ScenarioTelemetry from arcaflow_plugin_sdk import serialization -from krkn_lib.utils.functions import get_yaml_item_value +from krkn_lib.utils.functions import get_yaml_item_value, log_exception # Run pod based scenarios @@ -118,7 +118,7 @@ def container_run(kubeconfig_path, cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time) except (RuntimeError, Exception): failed_scenarios.append(container_scenario_config[0]) - telemetry.log_exception(container_scenario_config[0]) + log_exception(container_scenario_config[0]) scenario_telemetry.exitStatus = 1 # removed_exit # sys.exit(1) diff --git a/kraken/pvc/pvc_scenario.py b/kraken/pvc/pvc_scenario.py index ee92b6823..4031f4d11 100644 --- a/kraken/pvc/pvc_scenario.py +++ b/kraken/pvc/pvc_scenario.py @@ -7,7 +7,7 @@ from krkn_lib.k8s import KrknKubernetes from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes from krkn_lib.models.telemetry import ScenarioTelemetry -from krkn_lib.utils.functions import get_yaml_item_value +from krkn_lib.utils.functions import get_yaml_item_value, log_exception # krkn_lib @@ -316,7 +316,7 @@ def run(scenarios_list, config, kubecli: KrknKubernetes, telemetry: KrknTelemetr except (RuntimeError, Exception): scenario_telemetry.exitStatus = 1 failed_scenarios.append(app_config) - telemetry.log_exception(app_config) + log_exception(app_config) else: scenario_telemetry.exitStatus = 0 scenario_telemetries.append(scenario_telemetry) diff --git a/kraken/service_disruption/common_service_disruption_functions.py b/kraken/service_disruption/common_service_disruption_functions.py index 845229827..23436d2c5 100644 --- a/kraken/service_disruption/common_service_disruption_functions.py +++ b/kraken/service_disruption/common_service_disruption_functions.py @@ -7,7 +7,7 @@ from krkn_lib.k8s import KrknKubernetes from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes from krkn_lib.models.telemetry import ScenarioTelemetry -from krkn_lib.utils.functions import get_yaml_item_value +from krkn_lib.utils.functions import get_yaml_item_value, log_exception def delete_objects(kubecli, namespace): @@ -251,7 +251,7 @@ def run( except (Exception, RuntimeError): scenario_telemetry.exitStatus = 1 failed_scenarios.append(scenario_config[0]) - telemetry.log_exception(scenario_config[0]) + log_exception(scenario_config[0]) else: scenario_telemetry.exitStatus = 0 scenario_telemetry.endTimeStamp = time.time() diff --git a/kraken/shut_down/common_shut_down_func.py b/kraken/shut_down/common_shut_down_func.py index d63df3e98..7af446d4c 100644 --- a/kraken/shut_down/common_shut_down_func.py +++ b/kraken/shut_down/common_shut_down_func.py @@ -12,6 +12,7 @@ from krkn_lib.k8s import KrknKubernetes from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes from krkn_lib.models.telemetry import ScenarioTelemetry +from krkn_lib.utils.functions import log_exception def multiprocess_nodes(cloud_object_function, nodes): try: @@ -165,7 +166,7 @@ def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetr ) except (RuntimeError, Exception): - telemetry.log_exception(shut_down_config[0]) + log_exception(shut_down_config[0]) failed_scenarios.append(shut_down_config[0]) scenario_telemetry.exitStatus = 1 else: diff --git a/kraken/time_actions/common_time_functions.py b/kraken/time_actions/common_time_functions.py index 480894c0e..aae24d2d5 100644 --- a/kraken/time_actions/common_time_functions.py +++ b/kraken/time_actions/common_time_functions.py @@ -9,7 +9,8 @@ from krkn_lib.k8s import KrknKubernetes from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes from krkn_lib.models.telemetry import ScenarioTelemetry -from krkn_lib.utils.functions import get_yaml_item_value +from krkn_lib.utils.functions import get_yaml_item_value, log_exception + # krkn_lib def pod_exec(pod_name, command, namespace, container_name, kubecli:KrknKubernetes): @@ -339,7 +340,7 @@ def run(scenarios_list, config, wait_duration, kubecli:KrknKubernetes, telemetry ) except (RuntimeError, Exception): scenario_telemetry.exitStatus = 1 - telemetry.log_exception(time_scenario_config) + log_exception(time_scenario_config) failed_scenarios.append(time_scenario_config) else: scenario_telemetry.exitStatus = 0 diff --git a/kraken/zone_outage/actions.py b/kraken/zone_outage/actions.py index afbd194a1..aea9077ea 100644 --- a/kraken/zone_outage/actions.py +++ b/kraken/zone_outage/actions.py @@ -5,7 +5,7 @@ from ..cerberus import setup as cerberus from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes from krkn_lib.models.telemetry import ScenarioTelemetry - +from krkn_lib.utils.functions import log_exception def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetryKubernetes) -> (list[str], list[ScenarioTelemetry]) : """ @@ -112,7 +112,7 @@ def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetryKubernete except (RuntimeError, Exception): scenario_telemetry.exitStatus = 1 failed_scenarios.append(zone_outage_config) - telemetry.log_exception(zone_outage_config) + log_exception(zone_outage_config) else: scenario_telemetry.exitStatus = 0 scenario_telemetry.endTimeStamp = time.time() diff --git a/requirements.txt b/requirements.txt index f91b0509f..40d05606f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,7 +19,7 @@ ibm_cloud_sdk_core ibm_vpc itsdangerous==2.0.1 jinja2==3.0.3 -krkn-lib>=1.4.0 +krkn-lib>=1.4.1 kubernetes lxml >= 4.3.0 oauth2client>=4.1.3 diff --git a/run_kraken.py b/run_kraken.py index 6bff7a738..cde8e13eb 100644 --- a/run_kraken.py +++ b/run_kraken.py @@ -440,6 +440,7 @@ def main(cfg): logging.info(f"telemetry upload log: {safe_logger.log_file_name}") try: telemetry_k8s.send_telemetry(config["telemetry"], telemetry_request_id, chaos_telemetry) + telemetry_k8s.put_cluster_events(telemetry_request_id, config["telemetry"], start_time, end_time) # prometheus data collection is available only on Openshift if config["telemetry"]["prometheus_backup"] and config["kraken"]["distribution"] == "openshift": safe_logger.info("archives download started:")