Skip to content

Commit

Permalink
Baremetal Node Support (krkn-chaos#74)
Browse files Browse the repository at this point in the history
* Support for baremetal node scenarios

* Finished baremetal support

* Added documentation for baremetal

* Clarify limitations of implementation in documentation

* Add baremetal support to new run.py file

* Allow use on newer machines

Some older machines require lanplus instead of lan

* Setup to allow per-device user, pass, and bmc address

Also set min version for a dependency

* Fix linting issues

* More linting issue fixes

* More linter issues

* Account for linter standard non-conformity

* Added baremetal warning

Co-authored-by: jaredoconnell <[email protected]>
  • Loading branch information
jaredoconnell and jaredoconnell authored Jul 2, 2021
1 parent 0afcd22 commit 9b83dbc
Show file tree
Hide file tree
Showing 6 changed files with 216 additions and 3 deletions.
28 changes: 28 additions & 0 deletions docs/node_scenarios.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,18 @@ Following node chaos scenarios are supported:

How to set up AWS cli to run node scenarios is defined [here](cloud_setup.md#aws)

#### Baremetal
**NOTE**: Baremetal requires setting the IPMI user and password to power on, off, and reboot nodes, using the config options `bmc_user` and `bmc_password`. They can either be set in the root of the entry in the scenarios config, or they can be set per machine.

If no per-machine addresses are specified, kraken attempts to use the BMC value in the BareMetalHost object. To list them, you can do 'oc get bmh -o wide --all-namespaces'. If the BMC values are blank, you must specify them per-machine using the config option 'bmc_addr' as specified below.

For per-machine settings, add a "bmc_info" section to the entry in the scenarios config. Inside there, add a configuration section using the node name. In that, add per-machine settings. Valid settings are 'bmc_user', 'bmc_password', and 'bmc_addr'.
For examples, see the example node scenario or the example below.

**NOTE**: Baremetal requires oc (openshift client) be installed on the machine running Kraken.

**NOTE**: Baremetal machines are fragile. Some node actions can occasionally corrupt the filesystem if the node does not shut down properly, and sometimes the kubelet does not start properly.

#### GCP
How to set up GCP cli to run node scenarios is defined [here](cloud_setup.md#gcp)

Expand Down Expand Up @@ -80,4 +92,20 @@ node_scenarios:
- named
ssh_private_key: /root/.ssh/id_rsa # ssh key to access the helper node
cloud_type: openstack
- actions:
- node_stop_start_scenario
node_name:
label_selector: node-role.kubernetes.io/worker
instance_kill_count: 1
timeout: 120
cloud_type: bm
bmc_user: defaultuser # For baremetal (bm) cloud type. The default IPMI username. Optional if specified for all machines.
bmc_password: defaultpass # For baremetal (bm) cloud type. The default IPMI password. Optional if specified for all machines.
bmc_info: # This section is here to specify baremetal per-machine info, so it is optional if there is no per-machine info.
node-1: # The node name for the baremetal machine
bmc_addr: mgmt-machine1.example.com # Optional. For baremetal nodes with the IPMI BMC address missing from 'oc get bmh'
node-2:
bmc_addr: mgmt-machine2.example.com
bmc_user: user # The baremetal IPMI user. Overrides the default IPMI user specified above. Optional if the default is set.
bmc_password: pass # The baremetal IPMI password. Overrides the default IPMI password specified above. Optional if the default is set.
```
178 changes: 178 additions & 0 deletions kraken/node_actions/bm_node_scenarios.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
import kraken.node_actions.common_node_functions as nodeaction
from kraken.node_actions.abstract_node_scenarios import abstract_node_scenarios
import logging
import openshift as oc
import pyipmi
import pyipmi.interfaces
import sys
import time
import traceback


class BM:
    """Locate baremetal nodes' BMC addresses and power-control them over IPMI.

    Credentials and addresses given per node in ``bm_info`` take priority over
    the defaults passed to the constructor and over the address stored in the
    BareMetalHost object.
    """

    def __init__(self, bm_info, user, passwd):
        """Store the default IPMI credentials and the per-node overrides.

        Args:
            bm_info: Mapping of node name to a dict of per-node settings
                ("bmc_addr", "bmc_user", "bmc_password"), or None.
            user: Default IPMI user name, or None if only set per node.
            passwd: Default IPMI password, or None if only set per node.
        """
        self.user = user
        self.passwd = passwd
        self.bm_info = bm_info

    def _node_settings(self, node_name):
        """Return the per-node settings dict for ``node_name`` ({} if none)."""
        if not self.bm_info:
            return {}
        # An entry that is present but empty in the config loads as None.
        return self.bm_info.get(node_name) or {}

    def get_node_object(self, node_name):
        """Return the API object for ``node/<node_name>`` via the oc client."""
        with oc.project("openshift-machine-api"):
            return oc.selector("node/" + node_name).object()

    # Get the ipmi or other BMC address of the baremetal node
    def get_bmc_addr(self, node_name):
        """Return the BMC address to use for ``node_name``.

        The address from the scenario config wins when present; otherwise it
        is read from the node's BareMetalHost object. Exits the process with
        status 1 when the BareMetalHost has no BMC address.
        """
        # Addresses in the config get higher priority.
        settings = self._node_settings(node_name)
        if "bmc_addr" in settings:
            return settings["bmc_addr"]

        # Get the bmc addr from the BareMetalHost object.
        with oc.project("openshift-machine-api"):
            logging.info("Getting node with name: %s" % (node_name))
            node = self.get_node_object(node_name)
            provider_id = node.model.spec.providerID
            # The BareMetalHost name is the path segment right before the
            # trailing uid segment of the provider id.
            uid_start = provider_id.rfind("/")  # The / before the uid
            name_start = provider_id.rfind("/", 0, uid_start) + 1
            bmh_name = provider_id[name_start:uid_start]
            bmh_object = oc.selector("baremetalhost.metal3.io/" + bmh_name).object()
            # Validate the very same field that is returned.
            bmc_address = bmh_object.model.spec.bmc.address
            if not bmc_address:
                logging.error(
                    'BMC addr empty for node "%s". Either fix the BMH object,'
                    " or specify the address in the scenario config" % node_name
                )
                sys.exit(1)
            return bmc_address

    def get_ipmi_connection(self, bmc_addr, node_name):
        """Build and establish an IPMI (lanplus) connection to ``bmc_addr``.

        Args:
            bmc_addr: "[scheme://]host[:port]"; the port defaults to 623.
            node_name: Node whose per-node credentials, if any, are used.

        Returns:
            The established pyipmi connection.

        Exits the process with status 1 when no user or password is available.
        """
        # Drop an optional "<scheme>://" prefix, then split off an optional port.
        _scheme, scheme_sep, remainder = bmc_addr.partition("://")
        host = remainder if scheme_sep else bmc_addr
        host, port_sep, port_text = host.partition(":")
        port = int(port_text) if port_sep else 623

        # Determine correct username and password
        # If specified, uses device-specific user/pass. Else uses the global one.
        settings = self._node_settings(node_name)
        user = settings.get("bmc_user", self.user)
        passwd = settings.get("bmc_password", self.passwd)
        if user is None or passwd is None:
            logging.error(
                "Missing IPMI BMC user and/or password for baremetal cloud. "
                "Please specify either a global or per-machine user and pass"
            )
            sys.exit(1)

        # Establish connection
        interface = pyipmi.interfaces.create_interface("ipmitool", interface_type="lanplus")

        connection = pyipmi.create_connection(interface)

        connection.target = pyipmi.Target(ipmb_address=0x20)
        connection.session.set_session_type_rmcp(host, port)
        connection.session.set_auth_type_user(user, passwd)
        connection.session.establish()
        return connection

    # Start the node instance
    def start_instances(self, bmc_addr, node_name):
        """Send a chassis power-up command to the node's BMC."""
        self.get_ipmi_connection(bmc_addr, node_name).chassis_control_power_up()

    # Stop the node instance
    def stop_instances(self, bmc_addr, node_name):
        """Send a chassis power-down command to the node's BMC."""
        self.get_ipmi_connection(bmc_addr, node_name).chassis_control_power_down()

    # Reboot the node instance
    def reboot_instances(self, bmc_addr, node_name):
        """Send a chassis power-cycle command to the node's BMC."""
        self.get_ipmi_connection(bmc_addr, node_name).chassis_control_power_cycle()

    # Wait until the node instance is running
    def wait_until_running(self, bmc_addr, node_name):
        """Poll the chassis status once per second until power is on.

        A new connection is built for every poll and there is no upper bound
        on the number of polls.
        """
        while not self.get_ipmi_connection(bmc_addr, node_name).get_chassis_status().power_on:
            time.sleep(1)

    # Wait until the node instance is stopped
    def wait_until_stopped(self, bmc_addr, node_name):
        """Poll the chassis status once per second until power is off.

        A new connection is built for every poll and there is no upper bound
        on the number of polls.
        """
        while self.get_ipmi_connection(bmc_addr, node_name).get_chassis_status().power_on:
            time.sleep(1)


class bm_node_scenarios(abstract_node_scenarios):
    """Node chaos scenarios for baremetal nodes, driven through IPMI.

    Each scenario repeats its action ``instance_kill_count`` times and exits
    the process with status 1 on the first failure.
    """

    def __init__(self, bm_info, user, passwd):
        """Create the BM helper from per-node info and default credentials."""
        self.bm = BM(bm_info, user, passwd)

    # Node scenario to start the node
    def node_start_scenario(self, instance_kill_count, node, timeout):
        """Power the node on, then wait for it to report a ready status.

        Args:
            instance_kill_count: Number of times to repeat the action.
            node: Name of the node to start.
            timeout: Passed to the ready-status wait.
        """
        for _ in range(instance_kill_count):
            try:
                logging.info("Starting node_start_scenario injection")
                bmc_addr = self.bm.get_bmc_addr(node)
                logging.info("Starting the node %s with bmc address: %s " % (node, bmc_addr))
                self.bm.start_instances(bmc_addr, node)
                self.bm.wait_until_running(bmc_addr, node)
                nodeaction.wait_for_ready_status(node, timeout)
                logging.info("Node with bmc address: %s is in running state" % (bmc_addr))
                logging.info("node_start_scenario has been successfully injected!")
            except Exception as e:
                logging.error(
                    "Failed to start node instance. Encountered following "
                    "exception: %s. Test Failed. Most errors are caused by "
                    "an incorrect ipmi address or login" % (e)
                )
                logging.error("node_start_scenario injection failed!")
                sys.exit(1)

    # Node scenario to stop the node
    def node_stop_scenario(self, instance_kill_count, node, timeout):
        """Power the node off, then wait for it to report an unknown status.

        Args:
            instance_kill_count: Number of times to repeat the action.
            node: Name of the node to stop.
            timeout: Passed to the unknown-status wait.
        """
        for _ in range(instance_kill_count):
            try:
                logging.info("Starting node_stop_scenario injection")
                bmc_addr = self.bm.get_bmc_addr(node)
                logging.info("Stopping the node %s with bmc address: %s " % (node, bmc_addr))
                self.bm.stop_instances(bmc_addr, node)
                self.bm.wait_until_stopped(bmc_addr, node)
                logging.info("Node with bmc address: %s is in stopped state" % (bmc_addr))
                nodeaction.wait_for_unknown_status(node, timeout)
                # Report success the same way the start and reboot scenarios do.
                logging.info("node_stop_scenario has been successfully injected!")
            except Exception as e:
                logging.error(
                    "Failed to stop node instance. Encountered following exception: %s. "
                    "Test Failed. Most errors are caused by "
                    "an incorrect ipmi address or login" % (e)
                )
                logging.error("node_stop_scenario injection failed!")
                sys.exit(1)

    # Node scenario to terminate the node
    def node_termination_scenario(self, instance_kill_count, node, timeout):
        """Log that termination is unsupported on baremetal; do nothing else."""
        logging.info("Node termination scenario is not supported on baremetal")

    # Node scenario to reboot the node
    def node_reboot_scenario(self, instance_kill_count, node, timeout):
        """Power-cycle the node, then wait for unknown and then ready status.

        Args:
            instance_kill_count: Number of times to repeat the action.
            node: Name of the node to reboot.
            timeout: Passed to both status waits.
        """
        for _ in range(instance_kill_count):
            try:
                logging.info("Starting node_reboot_scenario injection")
                bmc_addr = self.bm.get_bmc_addr(node)
                logging.info("BMC Addr: %s" % (bmc_addr))
                logging.info("Rebooting the node %s with bmc address: %s " % (node, bmc_addr))
                self.bm.reboot_instances(bmc_addr, node)
                nodeaction.wait_for_unknown_status(node, timeout)
                nodeaction.wait_for_ready_status(node, timeout)
                logging.info("Node with bmc address: %s has been rebooted" % (bmc_addr))
                logging.info("node_reboot_scenario has been successfully injected!")
            except Exception as e:
                logging.error(
                    "Failed to reboot node instance. Encountered following exception:"
                    " %s. Test Failed. Most errors are caused by "
                    "an incorrect ipmi address or login" % (e)
                )
                traceback.print_exc()
                logging.error("node_reboot_scenario injection failed!")
                sys.exit(1)
2 changes: 1 addition & 1 deletion kraken/node_actions/common_node_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def wait_for_unknown_status(node, timeout):
for _ in range(timeout):
if kubecli.get_node_status(node) == "Unknown":
break
time.sleep(1)
time.sleep(3)
if kubecli.get_node_status(node) != "Unknown":
raise Exception("Node condition status isn't Unknown")

Expand Down
5 changes: 5 additions & 0 deletions kraken/node_actions/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from kraken.node_actions.az_node_scenarios import azure_node_scenarios
from kraken.node_actions.gcp_node_scenarios import gcp_node_scenarios
from kraken.node_actions.openstack_node_scenarios import openstack_node_scenarios
from kraken.node_actions.bm_node_scenarios import bm_node_scenarios
import kraken.node_actions.common_node_functions as common_node_functions
import kraken.cerberus.setup as cerberus

Expand All @@ -28,6 +29,10 @@ def get_node_scenario_object(node_scenario):
return openstack_node_scenarios()
elif node_scenario["cloud_type"] == "azure" or node_scenario["cloud_type"] == "az":
return azure_node_scenarios()
elif node_scenario["cloud_type"] == "bm":
return bm_node_scenarios(
node_scenario.get("bmc_info"), node_scenario.get("bmc_user", None), node_scenario.get("bmc_password", None)
)
else:
logging.error(
"Cloud type " + node_scenario["cloud_type"] + " is not currently supported; "
Expand Down
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
datetime
pyfiglet
PyYAML
PyYAML>=5.1
git+https://github.com/powerfulseal/powerfulseal.git
requests
boto3
Expand All @@ -14,3 +14,5 @@ python-openstackclient
gitpython
paramiko
setuptools
openshift-client
python-ipmi
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,4 @@ universal = 1

[flake8]
# Ignore specified error codes
extend-ignore = W503
extend-ignore = W503, E203

0 comments on commit 9b83dbc

Please sign in to comment.