From 9b83dbcf044bc8e7b7b2464672e886b87f6a64c7 Mon Sep 17 00:00:00 2001 From: Jared O'Connell <46976761+jaredoconnell@users.noreply.github.com> Date: Fri, 2 Jul 2021 17:31:40 -0400 Subject: [PATCH] Baremetal Node Support (#74) * Support for baremtal node scenarious * Finished baremetal support * Added documentation for baremetal * Clarify limitations of implementation in documentation * Add baremetal support to new run.py file * Allow use on newer machines Some older machines require lanplus instead of lan * Setup to allow per-device user, pass, and bmc address Also set min version for a dependency * Fix linting issues * More linting issue fixes * More linter issues * Account for linter standard non-conformity * Added baremetal warning Co-authored-by: jaredoconnell --- docs/node_scenarios.md | 28 +++ kraken/node_actions/bm_node_scenarios.py | 178 +++++++++++++++++++ kraken/node_actions/common_node_functions.py | 2 +- kraken/node_actions/run.py | 5 + requirements.txt | 4 +- setup.cfg | 2 +- 6 files changed, 216 insertions(+), 3 deletions(-) create mode 100644 kraken/node_actions/bm_node_scenarios.py diff --git a/docs/node_scenarios.md b/docs/node_scenarios.md index a524b215..1ca074bb 100644 --- a/docs/node_scenarios.md +++ b/docs/node_scenarios.md @@ -20,6 +20,18 @@ Following node chaos scenarios are supported: How to set up AWS cli to run node scenarios is defined [here](cloud_setup.md#aws) +#### Baremetal +**NOTE**: Baremetal requires setting the IPMI user and password to power on, off, and reboot nodes, using the config options `bmc_user` and `bmc_password`. They can either be set in the root of the entry in the scenarios config, or it can be set per machine. + +If no per-machine addresses are specified, kraken attempts to use the BMC value in the BareMetalHost object. To list them, you can do 'oc get bmh -o wide --all-namespaces'. If the BMC values are blank, you must specify them per-machine using the config option 'bmc_addr' as specified below. 
+ +For per-machine settings, add a "bmc_info" section to the entry in the scenarios config. Inside there, add a configuration section using the node name. In that, add per-machine settings. Valid settings are 'bmc_user', 'bmc_password', and 'bmc_addr'. +For examples, see the example node scenario or the example below. + +**NOTE**: Baremetal requires oc (openshift client) be installed on the machine running Kraken. + +**NOTE**: Baremetal machines are fragile. Some node actions can occasionally corrupt the filesystem if the machine does not shut down properly, and sometimes the kubelet does not start properly. + #### GCP How to set up GCP cli to run node scenarios is defined [here](cloud_setup.md#gcp) @@ -80,4 +92,20 @@ node_scenarios: - named ssh_private_key: /root/.ssh/id_rsa # ssh key to access the helper node cloud_type: openstack + - actions: + - node_stop_start_scenario + node_name: + label_selector: node-role.kubernetes.io/worker + instance_kill_count: 1 + timeout: 120 + cloud_type: bm + bmc_user: defaultuser # For baremetal (bm) cloud type. The default IPMI username. Optional if specified for all machines. + bmc_password: defaultpass # For baremetal (bm) cloud type. The default IPMI password. Optional if specified for all machines. + bmc_info: # This section is here to specify baremetal per-machine info, so it is optional if there is no per-machine info. + node-1: # The node name for the baremetal machine + bmc_addr: mgmt-machine1.example.com # Optional. For baremetal nodes with the IPMI BMC address missing from 'oc get bmh' + node-2: + bmc_addr: mgmt-machine2.example.com + bmc_user: user # The baremetal IPMI user. Overrides the default IPMI user specified above. Optional if the default is set. + bmc_password: pass # The baremetal IPMI password. Overrides the default IPMI password specified above. Optional if the default is set. 
``` diff --git a/kraken/node_actions/bm_node_scenarios.py b/kraken/node_actions/bm_node_scenarios.py new file mode 100644 index 00000000..074f0b3e --- /dev/null +++ b/kraken/node_actions/bm_node_scenarios.py @@ -0,0 +1,178 @@ +import kraken.node_actions.common_node_functions as nodeaction +from kraken.node_actions.abstract_node_scenarios import abstract_node_scenarios +import logging +import openshift as oc +import pyipmi +import pyipmi.interfaces +import sys +import time +import traceback + + +class BM: + def __init__(self, bm_info, user, passwd): + self.user = user + self.passwd = passwd + self.bm_info = bm_info + + def get_node_object(self, node_name): + with oc.project("openshift-machine-api"): + return oc.selector("node/" + node_name).object() + + # Get the ipmi or other BMC address of the baremetal node + def get_bmc_addr(self, node_name): + # Addresses in the config get higher priority. + if self.bm_info is not None and node_name in self.bm_info and "bmc_addr" in self.bm_info[node_name]: + return self.bm_info[node_name]["bmc_addr"] + + # Get the bmc addr from the BareMetalHost object. + with oc.project("openshift-machine-api"): + logging.info("Getting node with name: %s" % (node_name)) + node = self.get_node_object(node_name) + provider_id = node.model.spec.providerID + startOfUid = provider_id.rfind("/") # The / before the uid + startOfName = provider_id.rfind("/", 0, startOfUid) + 1 + bmh_name = provider_id[startOfName:startOfUid] + bmh_resource_name = "baremetalhost.metal3.io/" + bmh_name + bmh_object = oc.selector(bmh_resource_name).object() + if len(bmh_object.model.spec.bmc.addr) == 0: + logging.error( + 'BMC addr empty for node "%s". 
Either fix the BMH object,' + " or specify the address in the scenario config" % node_name + ) + sys.exit(1) + return bmh_object.model.spec.bmc.address + + def get_ipmi_connection(self, bmc_addr, node_name): + type_position = bmc_addr.find("://") + if type_position == -1: + host = bmc_addr + else: + host = bmc_addr[type_position + 3 :] + port_position = host.find(":") + if port_position == -1: + port = 623 + else: + port = int(host[port_position + 1 :]) + host = host[0:port_position] + + # Determine correct username and password + # If specified, uses device-specific user/pass. Else uses the global one. + if self.bm_info is not None and node_name in self.bm_info: + user = self.bm_info[node_name].get("bmc_user", self.user) + passwd = self.bm_info[node_name].get("bmc_password", self.passwd) + else: + user = self.user + passwd = self.passwd + if user is None or passwd is None: + logging.error( + "Missing IPMI BMI user and/or password for baremetal cloud. " + "Please specify either a global or per-machine user and pass" + ) + sys.exit(1) + + # Establish connection + interface = pyipmi.interfaces.create_interface("ipmitool", interface_type="lanplus") + + connection = pyipmi.create_connection(interface) + + connection.target = pyipmi.Target(ipmb_address=0x20) + connection.session.set_session_type_rmcp(host, port) + connection.session.set_auth_type_user(user, passwd) + connection.session.establish() + return connection + + # Start the node instance + def start_instances(self, bmc_addr, node_name): + self.get_ipmi_connection(bmc_addr, node_name).chassis_control_power_up() + + # Stop the node instance + def stop_instances(self, bmc_addr, node_name): + self.get_ipmi_connection(bmc_addr, node_name).chassis_control_power_down() + + # Reboot the node instance + def reboot_instances(self, bmc_addr, node_name): + self.get_ipmi_connection(bmc_addr, node_name).chassis_control_power_cycle() + + # Wait until the node instance is running + def wait_until_running(self, bmc_addr, 
node_name): + while not self.get_ipmi_connection(bmc_addr, node_name).get_chassis_status().power_on: + time.sleep(1) + + # Wait until the node instance is stopped + def wait_until_stopped(self, bmc_addr, node_name): + while self.get_ipmi_connection(bmc_addr, node_name).get_chassis_status().power_on: + time.sleep(1) + + +class bm_node_scenarios(abstract_node_scenarios): + def __init__(self, bm_info, user, passwd): + self.bm = BM(bm_info, user, passwd) + + # Node scenario to start the node + def node_start_scenario(self, instance_kill_count, node, timeout): + for _ in range(instance_kill_count): + try: + logging.info("Starting node_start_scenario injection") + bmc_addr = self.bm.get_bmc_addr(node) + logging.info("Starting the node %s with bmc address: %s " % (node, bmc_addr)) + self.bm.start_instances(bmc_addr, node) + self.bm.wait_until_running(bmc_addr, node) + nodeaction.wait_for_ready_status(node, timeout) + logging.info("Node with bmc address: %s is in running state" % (bmc_addr)) + logging.info("node_start_scenario has been successfully injected!") + except Exception as e: + logging.error( + "Failed to start node instance. Encountered following " + "exception: %s. Test Failed. Most errors are caused by " + "an incorrect ipmi address or login" % (e) + ) + logging.error("node_start_scenario injection failed!") + sys.exit(1) + + # Node scenario to stop the node + def node_stop_scenario(self, instance_kill_count, node, timeout): + for _ in range(instance_kill_count): + try: + logging.info("Starting node_stop_scenario injection") + bmc_addr = self.bm.get_bmc_addr(node) + logging.info("Stopping the node %s with bmc address: %s " % (node, bmc_addr)) + self.bm.stop_instances(bmc_addr, node) + self.bm.wait_until_stopped(bmc_addr, node) + logging.info("Node with bmc address: %s is in stopped state" % (bmc_addr)) + nodeaction.wait_for_unknown_status(node, timeout) + except Exception as e: + logging.error( + "Failed to stop node instance. 
Encountered following exception: %s. " + "Test Failed. Most errors are caused by " + "an incorrect ipmi address or login" % (e) + ) + logging.error("node_stop_scenario injection failed!") + sys.exit(1) + + # Node scenario to terminate the node + def node_termination_scenario(self, instance_kill_count, node, timeout): + logging.info("Node termination scenario is not supported on baremetal") + + # Node scenario to reboot the node + def node_reboot_scenario(self, instance_kill_count, node, timeout): + for _ in range(instance_kill_count): + try: + logging.info("Starting node_reboot_scenario injection") + bmc_addr = self.bm.get_bmc_addr(node) + logging.info("BMC Addr: %s" % (bmc_addr)) + logging.info("Rebooting the node %s with bmc address: %s " % (node, bmc_addr)) + self.bm.reboot_instances(bmc_addr, node) + nodeaction.wait_for_unknown_status(node, timeout) + nodeaction.wait_for_ready_status(node, timeout) + logging.info("Node with bmc address: %s has been rebooted" % (bmc_addr)) + logging.info("node_reboot_scenario has been successfuly injected!") + except Exception as e: + logging.error( + "Failed to reboot node instance. Encountered following exception:" + " %s. Test Failed. 
Most errors are caused by " + "an incorrect ipmi address or login" % (e) + ) + traceback.print_exc() + logging.error("node_reboot_scenario injection failed!") + sys.exit(1) diff --git a/kraken/node_actions/common_node_functions.py b/kraken/node_actions/common_node_functions.py index 42da1be0..bf997faf 100644 --- a/kraken/node_actions/common_node_functions.py +++ b/kraken/node_actions/common_node_functions.py @@ -34,7 +34,7 @@ def wait_for_unknown_status(node, timeout): for _ in range(timeout): if kubecli.get_node_status(node) == "Unknown": break - time.sleep(1) + time.sleep(3) if kubecli.get_node_status(node) != "Unknown": raise Exception("Node condition status isn't Unknown") diff --git a/kraken/node_actions/run.py b/kraken/node_actions/run.py index 27cd2592..b606588e 100644 --- a/kraken/node_actions/run.py +++ b/kraken/node_actions/run.py @@ -7,6 +7,7 @@ from kraken.node_actions.az_node_scenarios import azure_node_scenarios from kraken.node_actions.gcp_node_scenarios import gcp_node_scenarios from kraken.node_actions.openstack_node_scenarios import openstack_node_scenarios +from kraken.node_actions.bm_node_scenarios import bm_node_scenarios import kraken.node_actions.common_node_functions as common_node_functions import kraken.cerberus.setup as cerberus @@ -28,6 +29,10 @@ def get_node_scenario_object(node_scenario): return openstack_node_scenarios() elif node_scenario["cloud_type"] == "azure" or node_scenario["cloud_type"] == "az": return azure_node_scenarios() + elif node_scenario["cloud_type"] == "bm": + return bm_node_scenarios( + node_scenario.get("bmc_info"), node_scenario.get("bmc_user", None), node_scenario.get("bmc_password", None) + ) else: logging.error( "Cloud type " + node_scenario["cloud_type"] + " is not currently supported; " diff --git a/requirements.txt b/requirements.txt index 38224a06..a97d8f55 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ datetime pyfiglet -PyYAML +PyYAML>=5.1 
git+https://github.com/powerfulseal/powerfulseal.git requests boto3 @@ -14,3 +14,5 @@ python-openstackclient gitpython paramiko setuptools +openshift-client +python-ipmi diff --git a/setup.cfg b/setup.cfg index 3d36afb2..ec317d68 100644 --- a/setup.cfg +++ b/setup.cfg @@ -39,4 +39,4 @@ universal = 1 [flake8] # Ignore specified error codes -extend-ignore = W503 +extend-ignore = W503, E203