Skip to content

Commit

Permalink
krkn integration of telemetry events collection (#523)
Browse files Browse the repository at this point in the history
* function package refactoring in krkn-lib

* cluster events collection flag

* krkn-lib version bump

requirements

* dockerfile bump
  • Loading branch information
tsebastiani authored Oct 31, 2023
1 parent 43d891a commit 7a966a7
Show file tree
Hide file tree
Showing 15 changed files with 26 additions and 21 deletions.
1 change: 1 addition & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,4 +88,5 @@ telemetry:
- "kinit (\\d+/\\d+/\\d+\\s\\d{2}:\\d{2}:\\d{2})\\s+" # kinit 2023/09/15 11:20:36 log
- "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+" # 2023-09-15T11:20:36.123425532Z log
oc_cli_path: /usr/bin/oc # optional, if not specified it will be searched for in $PATH
events_backup: True # enables/disables cluster events collection

2 changes: 1 addition & 1 deletion containers/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ COPY --from=azure-cli /usr/local/bin/az /usr/bin/az
# Install dependencies
RUN yum install -y git python39 python3-pip jq gettext wget && \
python3.9 -m pip install -U pip && \
git clone https://github.com/redhat-chaos/krkn.git --branch v1.4.7 /root/kraken && \
git clone https://github.com/redhat-chaos/krkn.git --branch v1.5.0 /root/kraken && \
mkdir -p /root/.kube && cd /root/kraken && \
pip3.9 install -r requirements.txt && \
pip3.9 install virtualenv && \
Expand Down
2 changes: 1 addition & 1 deletion containers/Dockerfile-ppc64le
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ COPY --from=azure-cli /usr/local/bin/az /usr/bin/az
# Install dependencies
RUN yum install -y git python39 python3-pip jq gettext wget && \
python3.9 -m pip install -U pip && \
git clone https://github.com/redhat-chaos/krkn.git --branch v1.4.7 /root/kraken && \
git clone https://github.com/redhat-chaos/krkn.git --branch v1.5.0 /root/kraken && \
mkdir -p /root/.kube && cd /root/kraken && \
pip3.9 install -r requirements.txt && \
pip3.9 install virtualenv && \
Expand Down
5 changes: 3 additions & 2 deletions kraken/application_outage/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
import kraken.invoke.command as runcommand
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.utils.functions import get_yaml_item_value
from krkn_lib.utils.functions import get_yaml_item_value, log_exception


# Reads the scenario config, applies and deletes a network policy to
# block the traffic for the specified duration
Expand Down Expand Up @@ -76,7 +77,7 @@ def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetryKubernete
except Exception as e :
scenario_telemetry.exitStatus = 1
failed_scenarios.append(app_outage_config)
telemetry.log_exception(app_outage_config)
log_exception(app_outage_config)
else:
scenario_telemetry.exitStatus = 0
scenario_telemetry.endTimeStamp = time.time()
Expand Down
4 changes: 2 additions & 2 deletions kraken/network_chaos/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.utils.functions import get_yaml_item_value
from krkn_lib.utils.functions import get_yaml_item_value, log_exception


# krkn_lib
Expand Down Expand Up @@ -116,7 +116,7 @@ def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetr
except (RuntimeError, Exception):
scenario_telemetry.exitStatus = 1
failed_scenarios.append(net_config)
telemetry.log_exception(net_config)
log_exception(net_config)
else:
scenario_telemetry.exitStatus = 0
scenario_telemetries.append(scenario_telemetry)
Expand Down
2 changes: 1 addition & 1 deletion kraken/node_actions/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetr
except (RuntimeError, Exception) as e:
scenario_telemetry.exitStatus = 1
failed_scenarios.append(node_scenario_config)
telemetry.log_exception(node_scenario_config)
log_exception(node_scenario_config)
else:
scenario_telemetry.exitStatus = 0

Expand Down
4 changes: 2 additions & 2 deletions kraken/plugins/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from kraken.plugins.pod_network_outage.pod_network_outage_plugin import pod_egress_shaping
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
from krkn_lib.models.telemetry import ScenarioTelemetry

from krkn_lib.utils.functions import log_exception


@dataclasses.dataclass
Expand Down Expand Up @@ -241,7 +241,7 @@ def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_p
except Exception as e:
scenario_telemetry.exitStatus = 1
failed_post_scenarios.append(scenario)
telemetry.log_exception(scenario)
log_exception(scenario)
else:
scenario_telemetry.exitStatus = 0
logging.info("Waiting for the specified duration: %s" % (wait_duration))
Expand Down
4 changes: 2 additions & 2 deletions kraken/pod_scenarios/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
from krkn_lib.models.telemetry import ScenarioTelemetry
from arcaflow_plugin_sdk import serialization
from krkn_lib.utils.functions import get_yaml_item_value
from krkn_lib.utils.functions import get_yaml_item_value, log_exception


# Run pod based scenarios
Expand Down Expand Up @@ -118,7 +118,7 @@ def container_run(kubeconfig_path,
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
except (RuntimeError, Exception):
failed_scenarios.append(container_scenario_config[0])
telemetry.log_exception(container_scenario_config[0])
log_exception(container_scenario_config[0])
scenario_telemetry.exitStatus = 1
# removed_exit
# sys.exit(1)
Expand Down
4 changes: 2 additions & 2 deletions kraken/pvc/pvc_scenario.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.utils.functions import get_yaml_item_value
from krkn_lib.utils.functions import get_yaml_item_value, log_exception


# krkn_lib
Expand Down Expand Up @@ -316,7 +316,7 @@ def run(scenarios_list, config, kubecli: KrknKubernetes, telemetry: KrknTelemetr
except (RuntimeError, Exception):
scenario_telemetry.exitStatus = 1
failed_scenarios.append(app_config)
telemetry.log_exception(app_config)
log_exception(app_config)
else:
scenario_telemetry.exitStatus = 0
scenario_telemetries.append(scenario_telemetry)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.utils.functions import get_yaml_item_value
from krkn_lib.utils.functions import get_yaml_item_value, log_exception


def delete_objects(kubecli, namespace):
Expand Down Expand Up @@ -251,7 +251,7 @@ def run(
except (Exception, RuntimeError):
scenario_telemetry.exitStatus = 1
failed_scenarios.append(scenario_config[0])
telemetry.log_exception(scenario_config[0])
log_exception(scenario_config[0])
else:
scenario_telemetry.exitStatus = 0
scenario_telemetry.endTimeStamp = time.time()
Expand Down
3 changes: 2 additions & 1 deletion kraken/shut_down/common_shut_down_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.utils.functions import log_exception

def multiprocess_nodes(cloud_object_function, nodes):
try:
Expand Down Expand Up @@ -165,7 +166,7 @@ def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetr
)

except (RuntimeError, Exception):
telemetry.log_exception(shut_down_config[0])
log_exception(shut_down_config[0])
failed_scenarios.append(shut_down_config[0])
scenario_telemetry.exitStatus = 1
else:
Expand Down
5 changes: 3 additions & 2 deletions kraken/time_actions/common_time_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.utils.functions import get_yaml_item_value
from krkn_lib.utils.functions import get_yaml_item_value, log_exception


# krkn_lib
def pod_exec(pod_name, command, namespace, container_name, kubecli:KrknKubernetes):
Expand Down Expand Up @@ -339,7 +340,7 @@ def run(scenarios_list, config, wait_duration, kubecli:KrknKubernetes, telemetry
)
except (RuntimeError, Exception):
scenario_telemetry.exitStatus = 1
telemetry.log_exception(time_scenario_config)
log_exception(time_scenario_config)
failed_scenarios.append(time_scenario_config)
else:
scenario_telemetry.exitStatus = 0
Expand Down
4 changes: 2 additions & 2 deletions kraken/zone_outage/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from ..cerberus import setup as cerberus
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
from krkn_lib.models.telemetry import ScenarioTelemetry

from krkn_lib.utils.functions import log_exception

def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetryKubernetes) -> (list[str], list[ScenarioTelemetry]) :
"""
Expand Down Expand Up @@ -112,7 +112,7 @@ def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetryKubernete
except (RuntimeError, Exception):
scenario_telemetry.exitStatus = 1
failed_scenarios.append(zone_outage_config)
telemetry.log_exception(zone_outage_config)
log_exception(zone_outage_config)
else:
scenario_telemetry.exitStatus = 0
scenario_telemetry.endTimeStamp = time.time()
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ ibm_cloud_sdk_core
ibm_vpc
itsdangerous==2.0.1
jinja2==3.0.3
krkn-lib>=1.4.0
krkn-lib>=1.4.1
kubernetes
lxml >= 4.3.0
oauth2client>=4.1.3
Expand Down
1 change: 1 addition & 0 deletions run_kraken.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,6 +440,7 @@ def main(cfg):
logging.info(f"telemetry upload log: {safe_logger.log_file_name}")
try:
telemetry_k8s.send_telemetry(config["telemetry"], telemetry_request_id, chaos_telemetry)
telemetry_k8s.put_cluster_events(telemetry_request_id, config["telemetry"], start_time, end_time)
# prometheus data collection is available only on Openshift
if config["telemetry"]["prometheus_backup"] and config["kraken"]["distribution"] == "openshift":
safe_logger.info("archives download started:")
Expand Down

0 comments on commit 7a966a7

Please sign in to comment.