Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

krkn integration of telemetry events collection #523

Merged
merged 4 commits into from
Oct 31, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,4 +88,5 @@ telemetry:
- "kinit (\\d+/\\d+/\\d+\\s\\d{2}:\\d{2}:\\d{2})\\s+" # kinit 2023/09/15 11:20:36 log
- "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+" # 2023-09-15T11:20:36.123425532Z log
oc_cli_path: /usr/bin/oc # optional, if not specified will be searched for in $PATH
events_backup: True # enables/disables cluster events collection

5 changes: 3 additions & 2 deletions kraken/application_outage/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
import kraken.invoke.command as runcommand
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.utils.functions import get_yaml_item_value
from krkn_lib.utils.functions import get_yaml_item_value, log_exception


# Reads the scenario config, applies and deletes a network policy to
# block the traffic for the specified duration
Expand Down Expand Up @@ -76,7 +77,7 @@ def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetryKubernete
except Exception as e :
scenario_telemetry.exitStatus = 1
failed_scenarios.append(app_outage_config)
telemetry.log_exception(app_outage_config)
log_exception(app_outage_config)
else:
scenario_telemetry.exitStatus = 0
scenario_telemetry.endTimeStamp = time.time()
Expand Down
4 changes: 2 additions & 2 deletions kraken/network_chaos/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.utils.functions import get_yaml_item_value
from krkn_lib.utils.functions import get_yaml_item_value, log_exception


# krkn_lib
Expand Down Expand Up @@ -116,7 +116,7 @@ def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetr
except (RuntimeError, Exception):
scenario_telemetry.exitStatus = 1
failed_scenarios.append(net_config)
telemetry.log_exception(net_config)
log_exception(net_config)
else:
scenario_telemetry.exitStatus = 0
scenario_telemetries.append(scenario_telemetry)
Expand Down
2 changes: 1 addition & 1 deletion kraken/node_actions/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetr
except (RuntimeError, Exception) as e:
scenario_telemetry.exitStatus = 1
failed_scenarios.append(node_scenario_config)
telemetry.log_exception(node_scenario_config)
log_exception(node_scenario_config)
else:
scenario_telemetry.exitStatus = 0

Expand Down
4 changes: 2 additions & 2 deletions kraken/plugins/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from kraken.plugins.pod_network_outage.pod_network_outage_plugin import pod_egress_shaping
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
from krkn_lib.models.telemetry import ScenarioTelemetry

from krkn_lib.utils.functions import log_exception


@dataclasses.dataclass
Expand Down Expand Up @@ -241,7 +241,7 @@ def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_p
except Exception as e:
scenario_telemetry.exitStatus = 1
failed_post_scenarios.append(scenario)
telemetry.log_exception(scenario)
log_exception(scenario)
else:
scenario_telemetry.exitStatus = 0
logging.info("Waiting for the specified duration: %s" % (wait_duration))
Expand Down
4 changes: 2 additions & 2 deletions kraken/pod_scenarios/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
from krkn_lib.models.telemetry import ScenarioTelemetry
from arcaflow_plugin_sdk import serialization
from krkn_lib.utils.functions import get_yaml_item_value
from krkn_lib.utils.functions import get_yaml_item_value, log_exception


# Run pod based scenarios
Expand Down Expand Up @@ -118,7 +118,7 @@ def container_run(kubeconfig_path,
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
except (RuntimeError, Exception):
failed_scenarios.append(container_scenario_config[0])
telemetry.log_exception(container_scenario_config[0])
log_exception(container_scenario_config[0])
scenario_telemetry.exitStatus = 1
# removed_exit
# sys.exit(1)
Expand Down
4 changes: 2 additions & 2 deletions kraken/pvc/pvc_scenario.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.utils.functions import get_yaml_item_value
from krkn_lib.utils.functions import get_yaml_item_value, log_exception


# krkn_lib
Expand Down Expand Up @@ -316,7 +316,7 @@ def run(scenarios_list, config, kubecli: KrknKubernetes, telemetry: KrknTelemetr
except (RuntimeError, Exception):
scenario_telemetry.exitStatus = 1
failed_scenarios.append(app_config)
telemetry.log_exception(app_config)
log_exception(app_config)
else:
scenario_telemetry.exitStatus = 0
scenario_telemetries.append(scenario_telemetry)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.utils.functions import get_yaml_item_value
from krkn_lib.utils.functions import get_yaml_item_value, log_exception


def delete_objects(kubecli, namespace):
Expand Down Expand Up @@ -251,7 +251,7 @@ def run(
except (Exception, RuntimeError):
scenario_telemetry.exitStatus = 1
failed_scenarios.append(scenario_config[0])
telemetry.log_exception(scenario_config[0])
log_exception(scenario_config[0])
else:
scenario_telemetry.exitStatus = 0
scenario_telemetry.endTimeStamp = time.time()
Expand Down
3 changes: 2 additions & 1 deletion kraken/shut_down/common_shut_down_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.utils.functions import log_exception

def multiprocess_nodes(cloud_object_function, nodes):
try:
Expand Down Expand Up @@ -165,7 +166,7 @@ def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetr
)

except (RuntimeError, Exception):
telemetry.log_exception(shut_down_config[0])
log_exception(shut_down_config[0])
failed_scenarios.append(shut_down_config[0])
scenario_telemetry.exitStatus = 1
else:
Expand Down
5 changes: 3 additions & 2 deletions kraken/time_actions/common_time_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.utils.functions import get_yaml_item_value
from krkn_lib.utils.functions import get_yaml_item_value, log_exception


# krkn_lib
def pod_exec(pod_name, command, namespace, container_name, kubecli:KrknKubernetes):
Expand Down Expand Up @@ -339,7 +340,7 @@ def run(scenarios_list, config, wait_duration, kubecli:KrknKubernetes, telemetry
)
except (RuntimeError, Exception):
scenario_telemetry.exitStatus = 1
telemetry.log_exception(time_scenario_config)
log_exception(time_scenario_config)
failed_scenarios.append(time_scenario_config)
else:
scenario_telemetry.exitStatus = 0
Expand Down
4 changes: 2 additions & 2 deletions kraken/zone_outage/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from ..cerberus import setup as cerberus
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
from krkn_lib.models.telemetry import ScenarioTelemetry

from krkn_lib.utils.functions import log_exception

def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetryKubernetes) -> (list[str], list[ScenarioTelemetry]) :
"""
Expand Down Expand Up @@ -112,7 +112,7 @@ def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetryKubernete
except (RuntimeError, Exception):
scenario_telemetry.exitStatus = 1
failed_scenarios.append(zone_outage_config)
telemetry.log_exception(zone_outage_config)
log_exception(zone_outage_config)
else:
scenario_telemetry.exitStatus = 0
scenario_telemetry.endTimeStamp = time.time()
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ ibm_cloud_sdk_core
ibm_vpc
itsdangerous==2.0.1
jinja2==3.0.3
krkn-lib>=1.4.0
krkn-lib@git+https://github.com/redhat-chaos/krkn-lib.git@events_collection
kubernetes
lxml >= 4.3.0
oauth2client>=4.1.3
Expand Down
1 change: 1 addition & 0 deletions run_kraken.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,6 +440,7 @@ def main(cfg):
logging.info(f"telemetry upload log: {safe_logger.log_file_name}")
try:
telemetry_k8s.send_telemetry(config["telemetry"], telemetry_request_id, chaos_telemetry)
telemetry_k8s.put_cluster_events(telemetry_request_id, config["telemetry"], start_time, end_time)
# Prometheus data collection is available only on OpenShift
if config["telemetry"]["prometheus_backup"] and config["kraken"]["distribution"] == "openshift":
safe_logger.info("archives download started:")
Expand Down