From 3cd404e971d13d1acd69cf4c9fb8d779e6e13e83 Mon Sep 17 00:00:00 2001
From: Makito Kano
Date: Wed, 17 Jan 2024 06:02:30 +0000
Subject: [PATCH 01/11] DAOS-15008 test: VMD Hot Plug Automate - Replace during no activity

Conduct VMD hot-remove and hot-plug during no activity.

1. Determine the PCI address (TrAddr) of the disk we'll hot-remove and verify that its state is NORMAL and LED is OFF.
2. Store the total NVMe size.
3. Create a pool and a container.
4. Write data with IOR.
5. Hot remove the disk we selected at step 1.
6. Repeatedly call "dmg storage query list-devices" until the disk state becomes UNPLUGGED and LED becomes NA.
7. Verify that the disk space is down compared to before the remove.
8. Hot-plug.
9. Repeatedly call "dmg storage query list-devices" until the disk state becomes NORMAL and LED becomes OFF.
10. Verify that the disk space is back to the original.
11. Verify that none of the engines have crashed.
12. Verify that the disks are healthy by checking the container status.
13. Run IOR and check that it works.

Skip-unit-tests: true
Skip-fault-injection-test: true
Test-tag: test_no_activity
Required-githooks: true

Signed-off-by: Makito Kano
---
 src/tests/ftest/util/server_utils_params.py | 133 ++++++++++-
 src/tests/ftest/vmd/hot_plug_no_activity.py | 226 ++++++++++++++++++
 src/tests/ftest/vmd/hot_plug_no_activity.yaml | 43 ++++
 3 files changed, 399 insertions(+), 3 deletions(-)
 create mode 100644 src/tests/ftest/vmd/hot_plug_no_activity.py
 create mode 100644 src/tests/ftest/vmd/hot_plug_no_activity.yaml

diff --git a/src/tests/ftest/util/server_utils_params.py b/src/tests/ftest/util/server_utils_params.py
index a79c750b54d..c471b64452f 100644
--- a/src/tests/ftest/util/server_utils_params.py
+++ b/src/tests/ftest/util/server_utils_params.py
@@ -1,5 +1,5 @@
 """
- (C) Copyright 2020-2023 Intel Corporation.
+ (C) Copyright 2020-2024 Intel Corporation.

 SPDX-License-Identifier: BSD-2-Clause-Patent
 """
@@ -161,6 +161,9 @@ def __init__(self, filename, common_yaml):
 self.fault_path = BasicParameter(None)
 self.fault_cb = BasicParameter(None)

+ # VMD hot-plug support.
+ self.enable_hotplug = BasicParameter(None)
+
 def get_params(self, test):
 """Get values for all of the command params from the yaml file.

@@ -512,6 +515,9 @@ def __init__(self, base_namespace, index, provider=None, max_storage_tiers=MAX_S
 # the storage configuration for this engine
 self.storage = StorageYamlParameters(self.namespace, max_storage_tiers)

+ # For spdk_rpc_server field that defines enable and sock_addr.
+ self.spdk_rpc_server = SpdkRpcServerYamlParameters(self.namespace)
+
 def get_params(self, test):
 """Get values for the daos server yaml config file.

@@ -554,6 +560,9 @@ def get_params(self, test):
 new_env_vars = ["=".join([key, str(value)]) for key, value in env_var_dict.items()]
 self.env_vars.update(new_env_vars, "env_var")

+ # Create spdk_rpc_server fields.
+ self.spdk_rpc_server.get_params(test)
+
 @property
 def using_nvme(self):
 """Is the configuration file setup to use NVMe devices.

@@ -595,21 +604,26 @@ def get_yaml_data(self):
 # Add the storage tier yaml parameters
 yaml_data.update(self.storage.get_yaml_data())

+ # Add the spdk_rpc_server yaml parameters.
+ yaml_data.update(self.spdk_rpc_server.get_yaml_data())
+
 return yaml_data

 def is_yaml_data_updated(self):
 """Determine if any of the yaml file parameters have been updated.
Returns: - bool: whether or not a yaml file parameter has been updated + bool: whether the yaml file parameter has been updated """ - return super().is_yaml_data_updated() or self.storage.is_yaml_data_updated() + return super().is_yaml_data_updated() or self.storage.is_yaml_data_updated() or \ + self.spdk_rpc_server.is_yaml_data_updated() def reset_yaml_data_updated(self): """Reset each yaml file parameter updated state to False.""" super().reset_yaml_data_updated() self.storage.reset_yaml_data_updated() + self.spdk_rpc_server.reset_yaml_data_updated() def set_value(self, name, value): """Set the value for a specified attribute name. @@ -662,6 +676,7 @@ def _get_new(self): Returns: EngineYamlParameters: a new EngineYamlParameters object + """ return EngineYamlParameters( self._base_namespace, self._index, self._provider, self._max_storage_tiers) @@ -948,3 +963,115 @@ def _get_new(self): StorageTierYamlParameters: a new StorageTierYamlParameters object """ return StorageTierYamlParameters(self._base_namespace, self._tier) + + +class SpdkRpcServerYamlParameters(YamlParameters): + """Defines the configuration yaml parameters for spdk_rpc_server block in an engine field.""" + + def __init__(self, base_namespace): + """Create a SpdkRpcServerYamlParameters object. + + Args: + base_namespace (str): namespace for the server engine configuration + """ + super().__init__(os.path.join(base_namespace)) + self.spdk_rpc_server_tier = SpdkRpcServerTierYamlParameters(self.namespace) + + def get_params(self, test): + """Get values for the daos server yaml config file. + + Args: + test (Test): avocado Test object + """ + super().get_params(test) + self.spdk_rpc_server_tier.get_params(test) + + def get_yaml_data(self): + """Convert the parameters into a dictionary to use to write a yaml file. + + Returns: + dict: a dictionary of parameter name keys and values + + """ + # Get the common config yaml parameters + yaml_data = super().get_yaml_data() + yaml_data["spdk_rpc_server"] = self.spdk_rpc_server_tier.get_yaml_data() + return yaml_data + + def is_yaml_data_updated(self): + """Determine if any of the yaml file parameters have been updated. + + Returns: + bool: whether or not a yaml file parameter has been updated + + """ + if super().is_yaml_data_updated(): + return True + + return self.spdk_rpc_server_tier.is_yaml_data_updated() + + def set_value(self, name, value): + """Set the value for a specified attribute name. + + Args: + name (str): name of the attribute for which to set the value + value (object): the value to set + + Returns: + bool: if the attribute name was found and the value was set + + """ + if super().set_value(name, value): + return True + + return self.spdk_rpc_server_tier.set_value(name, value) + + def get_value(self, name): + """Get the value of the specified attribute name. + + Args: + name (str): name of the attribute from which to get the value + + Returns: + object: the object's value referenced by the attribute name + + """ + value = super().get_value(name) + if value: + return value + + return self.spdk_rpc_server_tier.get_value(name) + + def _get_new(self): + """Get a new object based upon this one. + + Returns: + SpdkRpcServerYamlParameters: a new SpdkRpcServerYamlParameters object + """ + return SpdkRpcServerYamlParameters(self.namespace) + + +class SpdkRpcServerTierYamlParameters(YamlParameters): + """Defines the configuration yaml parameters for each field in spdk_rpc_server.""" + + def __init__(self, base_namespace): + """Create a SpdkRpcServerTierYamlParameters object. 
+
+ Args:
+ base_namespace (str): namespace for the server engine configuration
+ """
+ namespace = [os.sep] + base_namespace.split(os.sep)[1:-1] + ["spdk_rpc_server", "*"]
+ self._base_namespace = base_namespace
+ super().__init__(os.path.join(*namespace))
+
+ self.enable = BasicParameter(None, position=1)
+ self.sock_addr = BasicParameter(None, position=2)
+
+ def _get_new(self):
+ """Get a new object based upon this one.
+
+ Returns:
+ SpdkRpcServerTierYamlParameters: a new SpdkRpcServerTierYamlParameters object
+
+ """
+ return SpdkRpcServerTierYamlParameters(self.namespace)
diff --git a/src/tests/ftest/vmd/hot_plug_no_activity.py b/src/tests/ftest/vmd/hot_plug_no_activity.py
new file mode 100644
index 00000000000..8c14b3f4f30
--- /dev/null
+++ b/src/tests/ftest/vmd/hot_plug_no_activity.py
@@ -0,0 +1,226 @@
+"""
+ (C) Copyright 2024 Intel Corporation.
+
+ SPDX-License-Identifier: BSD-2-Clause-Patent
+"""
+import time
+
+from general_utils import run_pcmd, report_errors
+from dmg_utils import get_storage_query_device_info, check_system_query_status
+from ior_test_base import IorTestBase
+
+
+class HotPlugNoActivityTest(IorTestBase):
+ """Test class to test VMD hot-remove and hot-plug during no activity.
+
+ :avocado: recursive
+ """
+ def repeat_query_list_devices(self, exp_disk_state, exp_led_state, error_msg, errors,
+ dmg_command):
+ """Repeatedly call dmg storage query list-devices and verify the states.
+
+ Args:
+ exp_disk_state (str): Expected disk state.
+ exp_led_state (str): Expected LED state.
+ error_msg (str): Error message to add to the error list.
+ errors (list): Error list for this test.
+ dmg_command (DmgCommand): DmgCommand object to call the command.
+ """
+ removed_state = False
+ for count in range(2):
+ time.sleep(30)
+ device_info = get_storage_query_device_info(dmg=dmg_command)
+ self.log.info(f"device_info = {device_info}")
+ # We'll use the first device.
+ device_control = device_info[0]["ctrlr"]
+ dev_state = device_control["dev_state"]
+ led_state = device_control["led_state"]
+ if dev_state == exp_disk_state and led_state == exp_led_state:
+ removed_state = True
+ break
+ self.log.info(f"{count}: Disk state = {dev_state}; LED state = {led_state}")
+ if not removed_state:
+ errors.append(error_msg)
+
+ def get_total_nvme_bytes(self, dmg_command):
+ """Get total NVMe bytes using 'dmg storage query usage'.
+
+ Args:
+ dmg_command (DmgCommand): DmgCommand object used to call the command and to obtain the
+ port number.
+
+ Returns:
+ int: total_bytes value of the first NVMe device in the server.
+
+ """
+ usage_out = dmg_command.storage_query_usage()
+ hosts_to_smd_devices = {}
+ # There's a hash value under HostStorage. Obtain "storage" -> "nvme_devices" and "hosts"
+ # under it. There may be multiple hosts depending on the setup, so use "hosts" as key and
+ # "smd_devices" list as value.
+ for value in usage_out["response"]["HostStorage"].values():
+ smd_devices = value["storage"]["nvme_devices"][0]["smd_devices"]
+ hosts_to_smd_devices[value["hosts"]] = smd_devices
+ self.log.info(f"hosts_to_smd_devices = {hosts_to_smd_devices}")
+ port_num = dmg_command.yaml.port.value
+ host_port = f"{str(self.hostlist_servers)}:{port_num}"
+ total_bytes = hosts_to_smd_devices[host_port][0]["total_bytes"]
+ return total_bytes
+
+ def test_no_activity(self):
+ """Conduct VMD hot-remove and hot-plug during no activity.
+
+ 1. Determine the PCI address (TrAddr) of the disk we'll hot-remove and verify that its state
+ is NORMAL and LED is OFF.
+ 2. Store the total NVMe size.
+ 3. Create a pool and a container.
+ 4. Write data with IOR.
+ 5. Hot remove the disk we selected at step 1.
+ 6. Repeatedly call "dmg storage query list-devices" until the disk state becomes UNPLUGGED
+ and LED becomes NA.
+ 7. Verify that the disk space is down compared to before the remove.
+ 8. Hot-plug.
+ 9. Repeatedly call "dmg storage query list-devices" until the disk state becomes NORMAL and
+ LED becomes OFF.
+ 10. Verify that the disk space is back to the original.
+ 11. Verify that none of the engines have crashed.
+ 12. Verify that the disks are healthy by checking the container status.
+ 13. Run IOR and check that it works.
+
+ Jira ID: DAOS-15008
+
+ :avocado: tags=all,full_regression
+ :avocado: tags=hw,medium
+ :avocado: tags=vmd,hot_plug
+ :avocado: tags=HotPlugNoActivityTest,test_no_activity
+ """
+ # 1. Determine the PCI address (TrAddr) of the disk we'll hot-remove and verify that its
+ # state is NORMAL and LED is OFF.
+ msg = ("Determine the PCI address (TrAddr) of the disk we'll hot-remove and verify that "
+ "its state is NORMAL and LED is OFF.")
+ self.log_step(msg)
+ dmg_command = self.get_dmg_command()
+ device_info = get_storage_query_device_info(dmg=dmg_command)
+ self.log.info(f"device_info = {device_info}")
+ # We'll use the first device.
+ device_control = device_info[0]["ctrlr"]
+ pci_addr = device_control["pci_addr"]
+ dev_state = device_control["dev_state"]
+ led_state = device_control["led_state"]
+ self.log.info(f"pci_addr = {pci_addr}; dev_state = {dev_state}; led_state = {led_state}")
+ errors = []
+ exp_disk_state = "NORMAL"
+ if dev_state != exp_disk_state:
+ errors.append(
+ f"Unexpected disk state! Expected = {exp_disk_state}, Actual = {dev_state}")
+ exp_led_state = "OFF"
+ if led_state != exp_led_state:
+ errors.append(f"Unexpected LED state! Expected = {exp_led_state}, Actual = {led_state}")
+
+ # 2. Store the total NVMe size.
+ self.log_step("Store the total NVMe size.")
+ total_bytes_orig = self.get_total_nvme_bytes(dmg_command=dmg_command)
+
+ # 3. Create a pool and a container.
+ self.log_step("Create a pool and a container")
+ self.pool = self.get_pool(connect=False)
+ self.container = self.get_container(pool=self.pool)
+
+ # 4. Write data with IOR.
+ self.log_step("Write data with IOR.")
+ self.ior_cmd.set_daos_params(self.server_group, self.pool, self.container.identifier)
+ self.run_ior_with_pool(create_pool=False, create_cont=False)
+
+ # 5. Hot remove the disk we selected at step 1.
+ # self.log_step("Hot remove the disk we selected at step 1.")
+ # command = f"sudo python3 /home/mkano/daos/build/external/debug/spdk/scripts/rpc.py -s /var/tmp/spdk_0.sock vmd_remove_device {pci_addr}"
+ # rpc_out = run_pcmd(hosts=self.hostlist_servers, command=command)
+ # self.log.debug(f"## Hot remove out = {rpc_out}")
+ # self.log.debug("## Sleep for 30 sec...")
+ # time.sleep(30)
+ # stdout contains the output as list. Each item in the list represents a line. e.g.,
+ # '[',
+ # ' {',
+ # ' "name": "Nvme_wolf-313.wolf.hpdd.intel.com_0_1_0",',
+ # ' "ctrlrs": [',
+ # ' {',
+ # ' "state": "enabled",',
+ # ...
+ # We'll remove the first and the last bracket, concatenate all items into a single string,
+ # then pass it into yaml.safe_load(). (It works even if there are whitespaces at the beginning
+ # of each line.)
+ # stdout = rpc_out[0]["stdout"]
+ # First and last item of stdout is "[" and "]", so remove them.
+ # stdout.pop()
+ # stdout.pop(0)
+ # Concatenate each line into a single string.
+ # stdout_str = "".join(stdout)
+ # Convert the string to a yaml object so that we can easily reference the values.
+ # yaml_out = yaml.safe_load(stdout_str)
+ # self.log.debug(f"## yaml_out = {yaml_out}")
+ # name = yaml_out["name"]
+ # self.log.debug(f"## name = {name}")
+ # state = yaml_out["ctrlrs"][0]["state"]
+ # self.log.debug(f"## state = {state}")
+
+ # 6. Repeatedly call "dmg storage query list-devices" until the disk state becomes UNPLUGGED
+ # and LED becomes NA.
+ # msg = ("Repeatedly call 'dmg storage query list-devices' until the disk state becomes "
+ # "UNPLUGGED and LED becomes NA.")
+ # self.log_step(msg)
+ # error_msg = "Disk and LED state didn't turn to removed state. (UNPLUGGED and NA)"
+
+ # 7. Verify that the disk space is down compared to before the remove.
+ self.log_step("Verify that the disk space is down compared to before the remove.")
+ total_bytes_hot_remove = self.get_total_nvme_bytes(dmg_command=dmg_command)
+ if total_bytes_hot_remove >= total_bytes_orig:
+ msg = (f"Total NVMe bytes haven't been reduced after a hot remove! "
+ f"Original = {total_bytes_orig}; Hot-removed = {total_bytes_hot_remove}")
+ # errors.append(msg)
+
+ # # 8. Hot-plug.
+ # # self.log_step("Hot-plug.")
+ # # command = f"sudo python3 /home/mkano/daos/build/external/debug/spdk/scripts/rpc.py -s /var/tmp/spdk_0.sock vmd_rescan"
+ # # rpc_out = run_pcmd(hosts=self.hostlist_servers, command=command)
+ # # self.log.debug(f"## Hot plug out = {rpc_out}")
+
+ # 9. Repeatedly call "dmg storage query list-devices" until the disk state becomes NORMAL
+ # and LED becomes OFF.
+ msg = ("Repeatedly call 'dmg storage query list-devices' until the disk state becomes "
+ "NORMAL and LED becomes OFF.")
+ self.log_step(msg)
+ error_msg = "Disk and LED state didn't turn to plugged state. (NORMAL and OFF)"
+ self.repeat_query_list_devices(
+ exp_disk_state="NORMAL", exp_led_state="OFF", error_msg=error_msg, errors=errors,
+ dmg_command=dmg_command)
+
+ # 10. Verify that the disk space is back to the original.
+ self.log_step("Verify that the disk space is back to the original.")
+ total_bytes_hot_plug = self.get_total_nvme_bytes(dmg_command=dmg_command)
+ if total_bytes_hot_plug != total_bytes_orig:
+ msg = (f"Total NVMe bytes haven't been recovered! Original = {total_bytes_orig}; "
+ f"After hot plug = {total_bytes_hot_plug}")
+ errors.append(msg)
+
+ # 11. Verify that none of the engines have crashed.
+ self.log_step("Verify that none of the engines have crashed.")
+ system_query_out = dmg_command.system_query()
+ system_healthy = check_system_query_status(data=system_query_out)
+ if not system_healthy:
+ errors.append("One or more ranks crashed after hot remove and plug!")
+
+ # 12. Verify that the disks are healthy by checking the container status.
+ self.log_step("Verify that the disks are healthy by checking the container status.")
+ expected_props = {"status": "HEALTHY"}
+ container_healthy = self.container.verify_prop(expected_props=expected_props)
+ if not container_healthy:
+ errors.append("Container status isn't HEALTHY after hot remove and plug!")
+
+ # 13. Run IOR and check that it works.
+ self.log_step("Run IOR and check that it works.")
+ cmd_result = self.run_ior_with_pool(create_pool=False, create_cont=False)
+ if cmd_result.exit_status != 0:
+ errors.append(f"IOR after hot-plug failed! 
cmd_result = {cmd_result}") + + self.log.info("##### Errors ######") + report_errors(test=self, errors=errors) diff --git a/src/tests/ftest/vmd/hot_plug_no_activity.yaml b/src/tests/ftest/vmd/hot_plug_no_activity.yaml new file mode 100644 index 00000000000..b31a7a12c37 --- /dev/null +++ b/src/tests/ftest/vmd/hot_plug_no_activity.yaml @@ -0,0 +1,43 @@ +hosts: + # wolf + # test_servers: 1 + test_servers: 2 + +timeout: 300 + +server_config: + enable_hotplug: true + # wolf + # engines_per_host: 1 + engines_per_host: 2 + engines: + 0: + targets: 4 + nr_xs_helpers: 0 + storage: auto + # wolf + # storage: + # 0: + # class: dcpm + # scm_mount: /mnt/daos0 + # scm_list: [/dev/pmem0] + # 1: + # class: nvme + # bdev_list: ["5d0505:01:00.0"] + # spdk_rpc_server: + # enable: true + # sock_addr: /var/tmp/spdk_0.sock + +pool: + size: 5G + +container: + type: POSIX + control_method: daos + +ior: + flags: -v -W -w + api: DFS + transfer_size: 1M + block_size: 1G + dfs_oclass: SX From a0bd39d9456e6d5c6d8ab706567ff7f8ab4ff9b3 Mon Sep 17 00:00:00 2001 From: Makito Kano Date: Wed, 17 Jan 2024 06:20:08 +0000 Subject: [PATCH 02/11] DAOS-15008 test: Fix pylint Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: test_no_activity Required-githooks: true Signed-off-by: Makito Kano --- src/tests/ftest/vmd/hot_plug_no_activity.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/tests/ftest/vmd/hot_plug_no_activity.py b/src/tests/ftest/vmd/hot_plug_no_activity.py index 8c14b3f4f30..9d3f3b485ed 100644 --- a/src/tests/ftest/vmd/hot_plug_no_activity.py +++ b/src/tests/ftest/vmd/hot_plug_no_activity.py @@ -5,8 +5,8 @@ """ import time -from general_utils import run_pcmd, report_errors -from dmg_utils import get_storage_query_device_info, check_system_query_status +from dmg_utils import check_system_query_status, get_storage_query_device_info +from general_utils import report_errors, run_pcmd from ior_test_base import IorTestBase @@ -30,7 +30,7 @@ def repeat_query_list_devices(self, exp_disk_state, exp_led_state, error_msg, er for count in range(2): time.sleep(30) device_info = get_storage_query_device_info(dmg=dmg_command) - self.log.info(f"device_info = {device_info}") + self.log.info("device_info = %s", device_info) # We'll use the first device. 
device_control = device_info[0]["ctrlr"]
 dev_state = device_control["dev_state"]
@@ -38,7 +38,7 @@ def repeat_query_list_devices(self, exp_disk_state, exp_led_state, error_msg, er
 if dev_state == exp_disk_state and led_state == exp_led_state:
 removed_state = True
 break
- self.log.info(f"{count}: Disk state = {dev_state}; LED state = {led_state}")
+ self.log.info("%d: Disk state = %s; LED state = %s", count, dev_state, led_state)
 if not removed_state:
 errors.append(error_msg)
@@ -61,7 +61,7 @@ def get_total_nvme_bytes(self, dmg_command):
 for value in usage_out["response"]["HostStorage"].values():
 smd_devices = value["storage"]["nvme_devices"][0]["smd_devices"]
 hosts_to_smd_devices[value["hosts"]] = smd_devices
- self.log.info(f"hosts_to_smd_devices = {hosts_to_smd_devices}")
+ self.log.info("hosts_to_smd_devices = %s", hosts_to_smd_devices)
 port_num = dmg_command.yaml.port.value
 host_port = f"{str(self.hostlist_servers)}:{port_num}"
 total_bytes = hosts_to_smd_devices[host_port][0]["total_bytes"]
@@ -101,13 +101,14 @@ def test_no_activity(self):
 self.log_step(msg)
 dmg_command = self.get_dmg_command()
 device_info = get_storage_query_device_info(dmg=dmg_command)
- self.log.info(f"device_info = {device_info}")
+ self.log.info("device_info = %s", device_info)
 # We'll use the first device.
 device_control = device_info[0]["ctrlr"]
 pci_addr = device_control["pci_addr"]
 dev_state = device_control["dev_state"]
 led_state = device_control["led_state"]
- self.log.info(f"pci_addr = {pci_addr}; dev_state = {dev_state}; led_state = {led_state}")
+ self.log.info(
+ "pci_addr = %s; dev_state = %s; led_state = %s", pci_addr, dev_state, led_state)
 errors = []
 exp_disk_state = "NORMAL"
 if dev_state != exp_disk_state:
@@ -147,7 +148,7 @@ def test_no_activity(self):
 # ' "state": "enabled",',
 # ...
 # We'll remove the first and the last bracket, concatenate all items into a single string,
- # then pass it into yaml.safe_load(). (It works even if there are whitespaces at the beginning
+ # then pass it into yaml.safe_load(). (It works even if there are white spaces at the beginning
 # of each line.)
 # stdout = rpc_out[0]["stdout"]
 # First and last item of stdout is "[" and "]", so remove them.
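A note on the commented-out rpc.py parsing carried in the two patches above: the bracket-stripping plus yaml.safe_load() approach works because the tool prints a JSON array and JSON is valid YAML, but the same stdout list can be parsed more directly with the standard json module. A minimal sketch, not part of the patches; it only assumes the run_pcmd() output shape shown in the comments above:

    import json

    # run_pcmd() returns one result dict per host; "stdout" is a list of output lines.
    stdout_lines = rpc_out[0]["stdout"]
    # Join the lines and parse the whole JSON array; no need to pop the "[" and "]"
    # items or to go through yaml.safe_load().
    controllers = json.loads("".join(stdout_lines))
    name = controllers[0]["name"]
    state = controllers[0]["ctrlrs"][0]["state"]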
From 7badac7e053a197159f683ae77e8f9e9dd1cbc1d Mon Sep 17 00:00:00 2001
From: Makito Kano
Date: Sat, 20 Jan 2024 22:40:56 +0000
Subject: [PATCH 03/11] DAOS-15008 test: Update test yaml to support two engines

Skip-unit-tests: true
Skip-fault-injection-test: true
Test-tag: test_no_activity
Required-githooks: true

Signed-off-by: Makito Kano
---
 src/tests/ftest/vmd/hot_plug_no_activity.yaml | 52 ++++++++++++++-----
 1 file changed, 38 insertions(+), 14 deletions(-)

diff --git a/src/tests/ftest/vmd/hot_plug_no_activity.yaml b/src/tests/ftest/vmd/hot_plug_no_activity.yaml
index b31a7a12c37..24f16fbaf72 100644
--- a/src/tests/ftest/vmd/hot_plug_no_activity.yaml
+++ b/src/tests/ftest/vmd/hot_plug_no_activity.yaml
@@ -1,32 +1,56 @@
 hosts:
 # wolf
 # test_servers: 1
+ # CI
 test_servers: 2

 timeout: 300

+# wolf
+# server_config:
+# enable_hotplug: true
+# engines_per_host: 1
+# engines:
+# 0:
+# targets: 4
+# nr_xs_helpers: 0
+# storage:
+# 0:
+# class: dcpm
+# scm_mount: /mnt/daos0
+# scm_list: [/dev/pmem0]
+# 1:
+# class: nvme
+# bdev_list: ["5d0505:01:00.0"]
+# spdk_rpc_server:
+# enable: true
+# sock_addr: /var/tmp/spdk_0.sock
+
+# CI
 server_config:
 enable_hotplug: true
- # wolf
- # engines_per_host: 1
 engines_per_host: 2
 engines:
 0:
+ fabric_iface: ib0
+ fabric_iface_port: 31416
+ log_file: daos_server_0.log
+ nr_xs_helpers: 0
+ storage: auto
 targets: 4
+ spdk_rpc_server:
+ enable: true
+ sock_addr: /var/tmp/spdk_0.sock
+ 1:
+ fabric_iface: ib1
+ fabric_iface_port: 31516
+ log_file: daos_server_1.log
 nr_xs_helpers: 0
 storage: auto
- # wolf
- # storage:
- # 0:
- # class: dcpm
- # scm_mount: /mnt/daos0
- # scm_list: [/dev/pmem0]
- # 1:
- # class: nvme
- # bdev_list: ["5d0505:01:00.0"]
- # spdk_rpc_server:
- # enable: true
- # sock_addr: /var/tmp/spdk_0.sock
+ targets: 4
+ spdk_rpc_server:
+ enable: true
+ sock_addr: /var/tmp/spdk_1.sock

 pool:
 size: 5G

From bb9192faf36e4451470d407175e0ed46d0c47bd7 Mon Sep 17 00:00:00 2001
From: Makito Kano
Date: Mon, 22 Jan 2024 06:51:23 +0000
Subject: [PATCH 04/11] DAOS-15008 test: Support multiple engines per rank when obtaining NVMe info

Skip-unit-tests: true
Skip-fault-injection-test: true
Test-tag: test_no_activity
Required-githooks: true

Signed-off-by: Makito Kano
---
 src/tests/ftest/vmd/hot_plug_no_activity.py | 109 ++++++++++++--------
 1 file changed, 66 insertions(+), 43 deletions(-)

diff --git a/src/tests/ftest/vmd/hot_plug_no_activity.py b/src/tests/ftest/vmd/hot_plug_no_activity.py
index 9d3f3b485ed..73d9068ef95 100644
--- a/src/tests/ftest/vmd/hot_plug_no_activity.py
+++ b/src/tests/ftest/vmd/hot_plug_no_activity.py
@@ -16,7 +16,7 @@ class HotPlugNoActivityTest(IorTestBase):
 :avocado: recursive
 """
 def repeat_query_list_devices(self, exp_disk_state, exp_led_state, error_msg, errors,
- dmg_command):
+ dmg_command, rank):
 """Repeatedly call dmg storage query list-devices and verify the states.

 Args:
@@ -25,16 +25,20 @@ def repeat_query_list_devices(self, exp_disk_state, exp_led_state, error_msg, er
 error_msg (str): Error message to add to the error list.
 errors (list): Error list for this test.
 dmg_command (DmgCommand): DmgCommand object to call the command.
+ rank (int): Rank where the NVMe disk we want to access is located.
 """
 removed_state = False
 for count in range(2):
 time.sleep(30)
 device_info = get_storage_query_device_info(dmg=dmg_command)
- self.log.info("device_info = %s", device_info)
- # We'll use the first device.
 device_control = device_info[0]["ctrlr"]
 dev_state = device_control["dev_state"]
 led_state = device_control["led_state"]
+ self.log.info("device_info (repeat_query_list_devices) = %s", device_info)
+ dev_state = None
+ led_state = None
+ for device in device_info:
+ if device["rank"] == rank:
+ device_control = device["ctrlr"]
+ dev_state = device_control["dev_state"]
+ led_state = device_control["led_state"]
 if dev_state == exp_disk_state and led_state == exp_led_state:
 removed_state = True
 break
@@ -46,29 +50,34 @@ def repeat_query_list_devices(self, exp_disk_state, exp_led_state, error_msg, er
 if not removed_state:
 errors.append(error_msg)

- def get_total_nvme_bytes(self, dmg_command):
+ def get_total_nvme_bytes(self, dmg_command, host_port, rank):
 """Get total NVMe bytes using 'dmg storage query usage'.

 Args:
 dmg_command (DmgCommand): DmgCommand object used to call the command and to obtain the
 port number.
+ host_port (str): Host where the NVMe disk we want to access is located.
+ rank (int): Rank where the NVMe disk we want to access is located.

 Returns:
- int: total_bytes value of the first NVMe device in the server.
+ int: total_bytes value of the NVMe device we're interested in, which is specified by
+ the host_port and rank parameters. If not found, for example because of an invalid
+ host_port and rank combination, returns None.

 """
+ total_bytes = None
 usage_out = dmg_command.storage_query_usage()
- hosts_to_smd_devices = {}
 # There's a hash value under HostStorage. Obtain "storage" -> "nvme_devices" and "hosts"
- # under it. There may be multiple hosts depending on the setup, so use "hosts" as key and
- # "smd_devices" list as value.
- for value in usage_out["response"]["HostStorage"].values():
- smd_devices = value["storage"]["nvme_devices"][0]["smd_devices"]
- hosts_to_smd_devices[value["hosts"]] = smd_devices
- self.log.info("hosts_to_smd_devices = %s", hosts_to_smd_devices)
- port_num = dmg_command.yaml.port.value
- host_port = f"{str(self.hostlist_servers)}:{port_num}"
- total_bytes = hosts_to_smd_devices[host_port][0]["total_bytes"]
+ # under it. There may be multiple hosts depending on the setup.
+ for hash_value in usage_out["response"]["HostStorage"].values():
+ if hash_value["hosts"] == host_port:
+ for nvme_device in hash_value["storage"]["nvme_devices"]:
+ # In HW medium cluster, there's only one dictionary in the smd_devices list, so
+ # we may be able to just index it, but use for loop just in case.
+ for smd_device in nvme_device["smd_devices"]:
+ if smd_device["rank"] == rank:
+ total_bytes = smd_device["total_bytes"]
+ self.log.info("total_bytes = %s", total_bytes)
 return total_bytes

 def test_no_activity(self):
@@ -102,13 +111,23 @@ def test_no_activity(self):
 dmg_command = self.get_dmg_command()
 device_info = get_storage_query_device_info(dmg=dmg_command)
 self.log.info("device_info = %s", device_info)
- # We'll use the first device.
- device_control = device_info[0]["ctrlr"]
- pci_addr = device_control["pci_addr"]
- dev_state = device_control["dev_state"]
- led_state = device_control["led_state"]
- self.log.info(
- "pci_addr = %s; dev_state = %s; led_state = %s", pci_addr, dev_state, led_state)
+ pci_addr = None
+ dev_state = None
+ led_state = None
+ host_port = None
+ rank = 0
+ # We'll use the rank 0 device. 
+ for device in device_info: + if device["rank"] == rank: + device_control = device["ctrlr"] + pci_addr = device_control["pci_addr"] + dev_state = device_control["dev_state"] + led_state = device_control["led_state"] + host_port = device["hosts"] + break + msg = (f"pci_addr = {pci_addr}; dev_state = {dev_state}; led_state = {led_state}; " + f"host_port = {host_port}") + self.log.info(msg) errors = [] exp_disk_state = "NORMAL" if dev_state != exp_disk_state: @@ -120,7 +139,8 @@ def test_no_activity(self): # 2. Store the total NVMe size. self.log_step("Store the total NVMe size.") - total_bytes_orig = self.get_total_nvme_bytes(dmg_command=dmg_command) + total_bytes_orig = self.get_total_nvme_bytes( + dmg_command=dmg_command, host_port=host_port, rank=rank) # 3. Create a pool and a container. self.log_step("Create a pool and a container") @@ -137,8 +157,6 @@ def test_no_activity(self): # command = f"sudo python3 /home/mkano/daos/build/external/debug/spdk/scripts/rpc.py -s /var/tmp/spdk_0.sock vmd_remove_device {pci_addr}" # rpc_out = run_pcmd(hosts=self.hostlist_servers, command=command) # self.log.debug(f"## Hot remove out = {rpc_out}") - # self.log.debug("## Sleep for 30 sec...") - # time.sleep(30) # stdout contains the output as list. Each item in the list represents a line. e.g., # '[', # ' {', @@ -169,21 +187,25 @@ def test_no_activity(self): # msg = ("Repeatedly call 'dmg storage query list-devices' until the disk state becomes " # "UNPLUGGED and LED becomes NA.") # self.log_step(msg) - # error_msg = "Disk and LED state didn't turn to removed state. (UNPLUGGED and NA)" + # error_msg = "Disk and LED state didn't turn to removed state. (UNPLUGGED and OFF)" + # self.repeat_query_list_devices( + # exp_disk_state="UNPLUGGED", exp_led_state="OFF", error_msg=error_msg, errors=errors, + # dmg_command=dmg_command, rank=rank) # 7. Verify that the disk space is down compared to before the remove. - self.log_step("Verify that the disk space is down compared to before the remove.") - total_bytes_hot_remove = self.get_total_nvme_bytes(dmg_command=dmg_command) - if total_bytes_hot_remove >= total_bytes_orig: - msg = (f"Total NVMe bytes haven't been reduced after a hot remove! " - f"Original = {total_bytes_orig}; Hot-removed = {total_bytes_hot_remove}") - # errors.append(msg) - - # # 8. Hot-plug. - # # self.log_step("Hot-plug.") - # # command = f"sudo python3 /home/mkano/daos/build/external/debug/spdk/scripts/rpc.py -s /var/tmp/spdk_0.sock vmd_rescan" - # # rpc_out = run_pcmd(hosts=self.hostlist_servers, command=command) - # # self.log.debug(f"## Hot plug out = {rpc_out}") + # self.log_step("Verify that the disk space is down compared to before the remove.") + # total_bytes_hot_remove = self.get_total_nvme_bytes( + # dmg_command=dmg_command, host_port=host_port, rank=rank) + # if total_bytes_hot_remove >= total_bytes_orig: + # msg = (f"Total NVMe bytes haven't been reduced after a hot remove! " + # f"Original = {total_bytes_orig}; Hot-removed = {total_bytes_hot_remove}") + # errors.append(msg) + + # 8. Hot-plug. + # self.log_step("Hot-plug.") + # command = f"sudo python3 /home/mkano/daos/build/external/debug/spdk/scripts/rpc.py -s /var/tmp/spdk_0.sock vmd_rescan" + # rpc_out = run_pcmd(hosts=self.hostlist_servers, command=command) + # self.log.debug(f"## Hot plug out = {rpc_out}") # 9. Repeatedly call "dmg storage query list-devices" until the disk state becomes NORMAL # and LED becomes OFF. @@ -193,11 +215,12 @@ def test_no_activity(self): error_msg = "Disk and LED state didn't turn to plugged state. 
(NORMAL and OFF)" self.repeat_query_list_devices( exp_disk_state="NORMAL", exp_led_state="OFF", error_msg=error_msg, errors=errors, - dmg_command=dmg_command) + dmg_command=dmg_command, rank = rank) # 10. Verify that the disk space is back to the original. self.log_step("Verify that the disk space is back to the original.") - total_bytes_hot_plug = self.get_total_nvme_bytes(dmg_command=dmg_command) + total_bytes_hot_plug = self.get_total_nvme_bytes( + dmg_command=dmg_command, host_port=host_port, rank=rank) if total_bytes_hot_plug != total_bytes_orig: msg = (f"Total NVMe bytes haven't been recovered! Original = {total_bytes_orig}; " f"After hot plug = {total_bytes_hot_plug}") From ad7df9c9356faefe0ab989d315ca451503cdfe5f Mon Sep 17 00:00:00 2001 From: Makito Kano Date: Mon, 22 Jan 2024 06:57:49 +0000 Subject: [PATCH 05/11] DAOS-15008 test: Fix pylint Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: test_no_activity Required-githooks: true Signed-off-by: Makito Kano --- src/tests/ftest/vmd/hot_plug_no_activity.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/ftest/vmd/hot_plug_no_activity.py b/src/tests/ftest/vmd/hot_plug_no_activity.py index 73d9068ef95..c121adb5800 100644 --- a/src/tests/ftest/vmd/hot_plug_no_activity.py +++ b/src/tests/ftest/vmd/hot_plug_no_activity.py @@ -215,7 +215,7 @@ def test_no_activity(self): error_msg = "Disk and LED state didn't turn to plugged state. (NORMAL and OFF)" self.repeat_query_list_devices( exp_disk_state="NORMAL", exp_led_state="OFF", error_msg=error_msg, errors=errors, - dmg_command=dmg_command, rank = rank) + dmg_command=dmg_command, rank=rank) # 10. Verify that the disk space is back to the original. self.log_step("Verify that the disk space is back to the original.") From 00c1e57feaa83a3f7697e428150180a6fefb682a Mon Sep 17 00:00:00 2001 From: Makito Kano Date: Mon, 22 Jan 2024 09:29:07 +0000 Subject: [PATCH 06/11] DAOS-15008 test: Use 10% pool size Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: test_no_activity Required-githooks: true Signed-off-by: Makito Kano --- src/tests/ftest/vmd/hot_plug_no_activity.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/ftest/vmd/hot_plug_no_activity.yaml b/src/tests/ftest/vmd/hot_plug_no_activity.yaml index 24f16fbaf72..b3e3af2f7c5 100644 --- a/src/tests/ftest/vmd/hot_plug_no_activity.yaml +++ b/src/tests/ftest/vmd/hot_plug_no_activity.yaml @@ -53,7 +53,7 @@ server_config: sock_addr: /var/tmp/spdk_1.sock pool: - size: 5G + size: 10% container: type: POSIX From 6e78208b9434719723f3a56e29569df5a9b06639 Mon Sep 17 00:00:00 2001 From: Makito Kano Date: Mon, 22 Apr 2024 06:23:09 +0000 Subject: [PATCH 07/11] DAOS-15008 test: Update test steps and made other changes Include set nvme-faulty and replace steps. Update the logic to check total available NVMe size. 
Skip-unit-tests: true
Skip-fault-injection-test: true
Test-tag: test_no_activity
Required-githooks: true

Signed-off-by: Makito Kano
---
 src/tests/ftest/vmd/hot_plug_no_activity.py | 241 ++++++++++--------
 src/tests/ftest/vmd/hot_plug_no_activity.yaml | 2 +-
 2 files changed, 131 insertions(+), 112 deletions(-)

diff --git a/src/tests/ftest/vmd/hot_plug_no_activity.py b/src/tests/ftest/vmd/hot_plug_no_activity.py
index c121adb5800..e140617a121 100644
--- a/src/tests/ftest/vmd/hot_plug_no_activity.py
+++ b/src/tests/ftest/vmd/hot_plug_no_activity.py
@@ -16,7 +16,7 @@ class HotPlugNoActivityTest(IorTestBase):
 :avocado: recursive
 """
 def repeat_query_list_devices(self, exp_disk_state, exp_led_state, error_msg, errors,
- dmg_command, rank):
+ dmg_command, uuid):
 """Repeatedly call dmg storage query list-devices and verify the states.

 Args:
@@ -25,43 +25,40 @@ def repeat_query_list_devices(self, exp_disk_state, exp_led_state, error_msg, er
 error_msg (str): Error message to add to the error list.
 errors (list): Error list for this test.
 dmg_command (DmgCommand): DmgCommand object to call the command.
- rank (int): Rank where the NVMe disk we want to access is located.
+ uuid (str): Device UUID.
 """
 removed_state = False
 for count in range(2):
 time.sleep(30)
 device_info = get_storage_query_device_info(dmg=dmg_command)
- self.log.info("device_info (repeat_query_list_devices) = %s", device_info)
+ self.log.info("## device_info (repeat_query_list_devices) = %s", device_info)
 dev_state = None
 led_state = None
 for device in device_info:
- if device["rank"] == rank:
+ if device["uuid"] == uuid:
 device_control = device["ctrlr"]
 dev_state = device_control["dev_state"]
 led_state = device_control["led_state"]
 if dev_state == exp_disk_state and led_state == exp_led_state:
 removed_state = True
 break
- self.log.info("%d: Disk state = %s; LED state = %s", count, dev_state, led_state)
+ self.log.info("## %d: Disk state = %s; LED state = %s", count, dev_state, led_state)
 if not removed_state:
 errors.append(error_msg)

- def get_total_nvme_bytes(self, dmg_command, host_port, rank):
+ def get_uuid_to_total_bytes(self, dmg_command, host_port):
- """Get total NVMe bytes using 'dmg storage query usage'.
+ """Call 'dmg storage query usage' and get UUID and total bytes of each NVMe drive.

 Args:
 dmg_command (DmgCommand): DmgCommand object used to call the command and to obtain the
 port number.
 host_port (str): Host where the NVMe disk we want to access is located.
- rank (int): Rank where the NVMe disk we want to access is located.

 Returns:
- int: total_bytes value of the NVMe device we're interested in, which is specified by
- the host_port and rank parameters. If not found, for example because of an invalid
- host_port and rank combination, returns None.
+ dict: UUID to total bytes of each NVMe drive in the given host.

 """
+ uuid_to_total_bytes = {}
 usage_out = dmg_command.storage_query_usage()
 # There's a hash value under HostStorage. Obtain "storage" -> "nvme_devices" and "hosts"
 # under it. There may be multiple hosts depending on the setup.
 for hash_value in usage_out["response"]["HostStorage"].values():
 if hash_value["hosts"] == host_port:
 for nvme_device in hash_value["storage"]["nvme_devices"]:
 # In HW medium cluster, there's only one dictionary in the smd_devices list, so
 # we may be able to just index it, but use for loop just in case.
 for smd_device in nvme_device["smd_devices"]:
- if smd_device["rank"] == rank:
- total_bytes = smd_device["total_bytes"]
- self.log.info("total_bytes = %s", total_bytes)
- return total_bytes
+ uuid_to_total_bytes[smd_device["uuid"]] = smd_device["total_bytes"]
+ return uuid_to_total_bytes

 def test_no_activity(self):
- """Conduct VMD hot-remove and hot-plug during no activity.
+ """Test VMD hot-remove and hot-plug during no activity.

 1. Determine the PCI address (TrAddr) of the disk we'll hot-remove and verify that its state
 is NORMAL and LED is OFF.
- 2. Store the total NVMe size.
+ 2. Store the total NVMe size of each drive.
 3. Create a pool and a container.
 4. Write data with IOR.
- 5. Hot remove the disk we selected at step 1.
- 6. Repeatedly call "dmg storage query list-devices" until the disk state becomes UNPLUGGED
+ 5. Call dmg storage set nvme-faulty --uuid=
+ 6. Repeatedly call "dmg storage query list-devices" until the disk state becomes EVICTED
+ and LED becomes ON.
+ 7. Hot remove the disk we selected at step 1.
+ 8. Repeatedly call "dmg storage query list-devices" until the disk state becomes UNPLUGGED
 and LED becomes NA.
- 7. Verify that the disk space is down compared to before the remove.
- 8. Hot-plug.
- 9. Repeatedly call "dmg storage query list-devices" until the disk state becomes NORMAL and
+ 9. For those untouched devices, verify that the space is unchanged after the hot remove.
+ 10. Hot-plug.
+ 11. Repeatedly call "dmg storage query list-devices" until the disk state becomes EVICTED
+ and LED becomes ON.
+ 12. Call dmg storage replace nvme --old-uuid= --new-uuid=
+ 13. Repeatedly call "dmg storage query list-devices" until the disk state becomes NORMAL and
 LED becomes OFF.
- 10. Verify that the disk space is back to the original.
- 11. Verify that none of the engines have crashed.
- 12. Verify that the disks are healthy by checking the container status.
- 13. Run IOR and check that it works.
+ 14. Verify that the disk spaces are back to the original.
+ 15. Verify that none of the engines have crashed.
+ 16. Verify that the disks are healthy by checking the container status.
+ 17. Run IOR and check that it works.

 Jira ID: DAOS-15008

@@ -103,8 +104,6 @@ def test_no_activity(self):
 :avocado: tags=vmd,hot_plug
 :avocado: tags=HotPlugNoActivityTest,test_no_activity
 """
- # 1. Determine the PCI address (TrAddr) of the disk we'll hot-remove and verify that its
- # state is NORMAL and LED is OFF.
 msg = ("Determine the PCI address (TrAddr) of the disk we'll hot-remove and verify that "
 "its state is NORMAL and LED is OFF.")
 self.log_step(msg)
@@ -116,9 +115,10 @@ def test_no_activity(self):
 led_state = None
 host_port = None
 rank = 0
- # We'll use the rank 0 device.
+ # We'll use the first rank 0 device. There could be multiple devices mapped to rank 0.
 for device in device_info:
 if device["rank"] == rank:
+ remove_uuid = device["uuid"] # UUID of the drive that we'll remove and plug.
 device_control = device["ctrlr"]
 pci_addr = device_control["pci_addr"]
 dev_state = device_control["dev_state"]
@@ -126,7 +126,7 @@ def test_no_activity(self):
 host_port = device["hosts"]
 break
 msg = (f"pci_addr = {pci_addr}; dev_state = {dev_state}; led_state = {led_state}; "
- f"host_port = {host_port}")
+ f"host_port = {host_port}; uuid = {remove_uuid}")
 self.log.info(msg)
 errors = []
 exp_disk_state = "NORMAL"
@@ -137,110 +137,129 @@ def test_no_activity(self):
 if led_state != exp_led_state:
 errors.append(f"Unexpected LED state! Expected = {exp_led_state}, Actual = {led_state}")

- # 2. 
Store the total NVMe size.
- self.log_step("Store the total NVMe size.")
- total_bytes_orig = self.get_total_nvme_bytes(
- dmg_command=dmg_command, host_port=host_port, rank=rank)
+ self.log_step("Store the total NVMe size of each drive.")
+ uuid_to_total_bytes_orig = self.get_uuid_to_total_bytes(
+ dmg_command=dmg_command, host_port=host_port)
+ self.log.debug(f"## uuid_to_total_bytes_orig = {uuid_to_total_bytes_orig}")

- # 3. Create a pool and a container.
 self.log_step("Create a pool and a container")
 self.pool = self.get_pool(connect=False)
 self.container = self.get_container(pool=self.pool)

- # 4. Write data with IOR.
 self.log_step("Write data with IOR.")
 self.ior_cmd.set_daos_params(self.server_group, self.pool, self.container.identifier)
 self.run_ior_with_pool(create_pool=False, create_cont=False)

- # 5. Hot remove the disk we selected at step 1.
- # self.log_step("Hot remove the disk we selected at step 1.")
- # command = f"sudo python3 /home/mkano/daos/build/external/debug/spdk/scripts/rpc.py -s /var/tmp/spdk_0.sock vmd_remove_device {pci_addr}"
- # rpc_out = run_pcmd(hosts=self.hostlist_servers, command=command)
- # self.log.debug(f"## Hot remove out = {rpc_out}")
- # stdout contains the output as list. Each item in the list represents a line. e.g.,
- # '[',
- # ' {',
- # ' "name": "Nvme_wolf-313.wolf.hpdd.intel.com_0_1_0",',
- # ' "ctrlrs": [',
- # ' {',
- # ' "state": "enabled",',
- # ...
- # We'll remove the first and the last bracket, concatenate all items into a single string,
- # then pass it into yaml.safe_load(). (It works even if there are white spaces at the beginning
- # of each line.)
- # stdout = rpc_out[0]["stdout"]
- # First and last item of stdout is "[" and "]", so remove them.
- # stdout.pop()
- # stdout.pop(0)
- # Concatenate each line into a single string.
- # stdout_str = "".join(stdout)
- # Convert the string to a yaml object so that we can easily reference the values.
- # yaml_out = yaml.safe_load(stdout_str)
- # self.log.debug(f"## yaml_out = {yaml_out}")
- # name = yaml_out["name"]
- # self.log.debug(f"## name = {name}")
- # state = yaml_out["ctrlrs"][0]["state"]
- # self.log.debug(f"## state = {state}")
-
- # 6. Repeatedly call "dmg storage query list-devices" until the disk state becomes UNPLUGGED
- # and LED becomes NA.
- # msg = ("Repeatedly call 'dmg storage query list-devices' until the disk state becomes "
- # "UNPLUGGED and LED becomes NA.")
- # self.log_step(msg)
- # error_msg = "Disk and LED state didn't turn to removed state. (UNPLUGGED and OFF)"
- # self.repeat_query_list_devices(
- # exp_disk_state="UNPLUGGED", exp_led_state="OFF", error_msg=error_msg, errors=errors,
- # dmg_command=dmg_command, rank=rank)
+ self.log_step("Call dmg storage set nvme-faulty --uuid=.")
+ dmg_command.storage_set_faulty(uuid=remove_uuid)
+
+ exp_disk_state = "EVICTED"
+ exp_led_state = "ON"
+ msg = ("Repeatedly call 'dmg storage query list-devices' until the disk state becomes "
+ f"{exp_disk_state} and LED becomes {exp_led_state}.")
+ self.log_step(msg)
+ error_msg = f"Disk state must be {exp_disk_state} and LED state must be {exp_led_state}."
+ self.repeat_query_list_devices(
+ exp_disk_state=exp_disk_state, exp_led_state=exp_led_state, error_msg=error_msg,
+ errors=errors, dmg_command=dmg_command, uuid=remove_uuid)
+
+ self.log_step("Hot remove the disk we selected at step 1.")
+ spdk_sock_path = self.params.get(
+ "sock_addr", "/run/server_config/engines/0/spdk_rpc_server/*")
+ command = (f"sudo /usr/share/spdk/scripts/rpc.py -s {spdk_sock_path} vmd_remove_device "
+ f"{pci_addr}")
+ rpc_out = run_pcmd(hosts=self.hostlist_servers, command=command)
+ self.log.debug(f"## Hot remove out = {rpc_out}")
+ exit_status = rpc_out[0]["exit_status"]
+ if exit_status != 0:
+ self.fail(f"Hot remove failed! {rpc_out}")
+
+ exp_disk_state = "UNPLUGGED"
+ exp_led_state = "NA"
+ msg = ("Repeatedly call 'dmg storage query list-devices' until the disk state becomes "
+ f"{exp_disk_state} and LED becomes {exp_led_state}.")
+ self.log_step(msg)
+ error_msg = (f"Disk and LED state didn't turn to removed state. ({exp_disk_state} and "
+ f"{exp_led_state})")
+ self.repeat_query_list_devices(
+ exp_disk_state=exp_disk_state, exp_led_state=exp_led_state, error_msg=error_msg,
+ errors=errors, dmg_command=dmg_command, uuid=remove_uuid)
+
+ # Note: For this step, if we don't use JSON as in manual test, we get the sum of total bytes
+ # from all drives in a host, so we just compare the values before and after the hot remove.
+ # However, automated test uses JSON, which returns total bytes for each drive, so we can do
+ # finer grained comparisons.
+ msg = ("For those untouched devices, verify that the space is unchanged after the hot "
+ "remove.")
+ self.log_step(msg)
+ uuid_to_total_bytes_after = self.get_uuid_to_total_bytes(
+ dmg_command=dmg_command, host_port=host_port)
+ self.log.debug(f"## uuid_to_total_bytes_after = {uuid_to_total_bytes_after}")
+ # Check that the removed device doesn't appear in 'dmg storage query usage' output.
+ if remove_uuid in uuid_to_total_bytes_after:
+ msg = (f"Removed device ({disk_uuid}) appears after hot remove! "
+ f"{uuid_to_total_bytes_after}")
+ errors.append(msg)
+ for disk_uuid, total_bytes in uuid_to_total_bytes_orig.items():
+ if disk_uuid != remove_uuid:
+ if disk_uuid not in uuid_to_total_bytes_after:
+ msg = (f"Untouched disk ({disk_uuid}) disappeared after a hot remove!")
+ errors.append(msg)
+ elif total_bytes != uuid_to_total_bytes_after[disk_uuid]:
+ msg = (f"Hot remove resulted in untouched disk ({disk_uuid}) size change!")
+ errors.append(msg)
+
+ self.log_step("Hot-plug.")
+ command = f"sudo /usr/share/spdk/scripts/rpc.py -s {spdk_sock_path} vmd_rescan"
+ rpc_out = run_pcmd(hosts=self.hostlist_servers, command=command)
+ self.log.debug(f"## Hot plug out = {rpc_out}")
+ exit_status = rpc_out[0]["exit_status"]
+ if exit_status != 0:
+ self.fail(f"Hot plug failed! {rpc_out}")
+
+ exp_disk_state = "EVICTED"
+ exp_led_state = "ON"
 msg = ("Repeatedly call 'dmg storage query list-devices' until the disk state becomes "
- "NORMAL and LED becomes OFF.")
+ f"{exp_disk_state} and LED becomes {exp_led_state}.")
 self.log_step(msg)
- error_msg = "Disk and LED state didn't turn to plugged state. (NORMAL and OFF)"
+ error_msg = f"Disk must be {exp_disk_state} and LED must be {exp_led_state}."
 self.repeat_query_list_devices(
- exp_disk_state="NORMAL", exp_led_state="OFF", error_msg=error_msg, errors=errors,
- dmg_command=dmg_command, rank=rank)
-
- # 10. Verify that the disk space is back to the original.
- self.log_step("Verify that the disk space is back to the original.")
- total_bytes_hot_plug = self.get_total_nvme_bytes(
- dmg_command=dmg_command, host_port=host_port, rank=rank)
- if total_bytes_hot_plug != total_bytes_orig:
- msg = (f"Total NVMe bytes haven't been recovered! Original = {total_bytes_orig}; "
- f"After hot plug = {total_bytes_hot_plug}")
+ exp_disk_state=exp_disk_state, exp_led_state=exp_led_state, error_msg=error_msg,
+ errors=errors, dmg_command=dmg_command, uuid=remove_uuid)
+
+ self.log_step("Call dmg storage replace nvme --old-uuid= --new-uuid=")
+ dmg_command.storage_replace_nvme(old_uuid=remove_uuid, new_uuid=remove_uuid)
+
+ exp_disk_state = "NORMAL"
+ exp_led_state = "OFF"
+ msg = ("Repeatedly call 'dmg storage query list-devices' until the disk state becomes "
+ f"{exp_disk_state} and LED becomes {exp_led_state}.")
+ self.log_step(msg)
+ error_msg = (f"Disk and LED state didn't go back to normal state! ({exp_disk_state} and "
+ f"{exp_led_state})")
+ self.repeat_query_list_devices(
+ exp_disk_state=exp_disk_state, exp_led_state=exp_led_state, error_msg=error_msg,
+ errors=errors, dmg_command=dmg_command, uuid=remove_uuid)
+
+ self.log_step("Verify that the disk spaces are back to the original.")
+ uuid_to_total_bytes_after = self.get_uuid_to_total_bytes(
+ dmg_command=dmg_command, host_port=host_port)
+ if uuid_to_total_bytes_after != uuid_to_total_bytes_orig:
+ msg = (f"Disk sizes changed after hot remove! Orig = {uuid_to_total_bytes_orig}; "
+ f"After = {uuid_to_total_bytes_after}")
 errors.append(msg)

- # 11. Verify that none of the engines have crashed.
 self.log_step("Verify that none of the engines have crashed.")
 system_query_out = dmg_command.system_query()
- system_healthy = check_system_query_status(data=system_query_out)
- if not system_healthy:
+ if not check_system_query_status(data=system_query_out):
 errors.append("One or more ranks crashed after hot remove and plug!")

- # 12. Verify that the disks are healthy by checking the container status.
 self.log_step("Verify that the disks are healthy by checking the container status.")
 expected_props = {"status": "HEALTHY"}
 container_healthy = self.container.verify_prop(expected_props=expected_props)
 if not container_healthy:
 errors.append("Container status isn't HEALTHY after hot remove and plug!")

- # 13. Run IOR and check that it works.
 self.log_step("Run IOR and check that it works.")
 cmd_result = self.run_ior_with_pool(create_pool=False, create_cont=False)
 if cmd_result.exit_status != 0:
diff --git a/src/tests/ftest/vmd/hot_plug_no_activity.yaml b/src/tests/ftest/vmd/hot_plug_no_activity.yaml
index b3e3af2f7c5..12b2284bd23 100644
--- a/src/tests/ftest/vmd/hot_plug_no_activity.yaml
+++ b/src/tests/ftest/vmd/hot_plug_no_activity.yaml
@@ -21,7 +21,7 @@ timeout: 300
# scm_list: [/dev/pmem0]
# 1:
# class: nvme
-# bdev_list: ["5d0505:01:00.0"]
+# bdev_list: ["0000:5d:05.5"]
# spdk_rpc_server:
# enable: true
# sock_addr: /var/tmp/spdk_0.sock

From 88d72f22cf3e74c12c9cb5dd2f6bce8e0f389fe1 Mon Sep 17 00:00:00 2001
From: Makito Kano
Date: Mon, 22 Apr 2024 06:37:52 +0000
Subject: [PATCH 08/11] DAOS-15008 test: Fix pylint

Skip-unit-tests: true
Skip-fault-injection-test: true
Test-tag: test_no_activity
Required-githooks: true

Signed-off-by: Makito Kano
---
 src/tests/ftest/vmd/hot_plug_no_activity.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/tests/ftest/vmd/hot_plug_no_activity.py b/src/tests/ftest/vmd/hot_plug_no_activity.py
index e140617a121..3dd832330a6 100644
--- a/src/tests/ftest/vmd/hot_plug_no_activity.py
+++ b/src/tests/ftest/vmd/hot_plug_no_activity.py
@@ -140,7 +140,7 @@ def test_no_activity(self):
 self.log_step("Store the total NVMe size of each drive.")
 uuid_to_total_bytes_orig = self.get_uuid_to_total_bytes(
 dmg_command=dmg_command, host_port=host_port)
- self.log.debug(f"## uuid_to_total_bytes_orig = {uuid_to_total_bytes_orig}")
+ self.log.debug("## uuid_to_total_bytes_orig = %s", uuid_to_total_bytes_orig)

 self.log_step("Create a pool and a container")
 self.pool = self.get_pool(connect=False)
@@ -169,7 +169,7 @@ def test_no_activity(self):
 command = (f"sudo /usr/share/spdk/scripts/rpc.py -s {spdk_sock_path} vmd_remove_device "
 f"{pci_addr}")
 rpc_out = run_pcmd(hosts=self.hostlist_servers, command=command)
- self.log.debug(f"## Hot remove out = {rpc_out}")
+ self.log.debug("## Hot remove out = %s", rpc_out)
 exit_status = rpc_out[0]["exit_status"]
 if exit_status != 0:
 self.fail(f"Hot remove failed! {rpc_out}")
@@ -194,25 +194,25 @@ def test_no_activity(self):
 self.log_step(msg)
 uuid_to_total_bytes_after = self.get_uuid_to_total_bytes(
 dmg_command=dmg_command, host_port=host_port)
- self.log.debug(f"## uuid_to_total_bytes_after = {uuid_to_total_bytes_after}")
+ self.log.debug("## uuid_to_total_bytes_after = %s", uuid_to_total_bytes_after)
 # Check that the removed device doesn't appear in 'dmg storage query usage' output.
 if remove_uuid in uuid_to_total_bytes_after:
- msg = (f"Removed device ({disk_uuid}) appears after hot remove! "
+ msg = (f"Removed device ({remove_uuid}) appears after hot remove! "
 f"{uuid_to_total_bytes_after}")
 errors.append(msg)
 for disk_uuid, total_bytes in uuid_to_total_bytes_orig.items():
 if disk_uuid != remove_uuid:
 if disk_uuid not in uuid_to_total_bytes_after:
- msg = (f"Untouched disk ({disk_uuid}) disappeared after a hot remove!")
+ msg = f"Untouched disk ({disk_uuid}) disappeared after a hot remove!"
 errors.append(msg)
 elif total_bytes != uuid_to_total_bytes_after[disk_uuid]:
- msg = (f"Hot remove resulted in untouched disk ({disk_uuid}) size change!")
+ msg = f"Hot remove resulted in untouched disk ({disk_uuid}) size change!" 
errors.append(msg) self.log_step("Hot-plug.") command = f"sudo /usr/share/spdk/scripts/rpc.py -s {spdk_sock_path} vmd_rescan" rpc_out = run_pcmd(hosts=self.hostlist_servers, command=command) - self.log.debug(f"## Hot plug out = {rpc_out}") + self.log.debug("## Hot plug out = %s", rpc_out) exit_status = rpc_out[0]["exit_status"] if exit_status != 0: self.fail(f"Hot plug failed! {rpc_out}") From 99e4d7558cde42686c88b3ff51c932ac1a44c57a Mon Sep 17 00:00:00 2001 From: "Gromadzki, Tomasz" Date: Wed, 24 Apr 2024 07:50:21 +0200 Subject: [PATCH 09/11] Force Jenkins to use ci_vmd5 for tests instead of ci_nvme5. Test-tag: pr,hw,medium,hot_plug Required-githooks: true Quick-functional: true Skip-coverity-test: true Skip-fault-injection-test: true Skip-func-test-el8: true Skip-nlt: true Skip-python-bandit: true Skip-test-el-8.6-rpms: true Skip-unit-test-memcheck: true Skip-unit-tests: true Skip-func-hw-test-medium: true Skip-func-hw-test-medium-verbs-provider: true Skip-func-hw-test-medium-md-on-ssd: false Skip-func-hw-test-medium-verbs-provider-md-on-ssd: false Skip-func-hw-test-medium-ucx-provider: true Skip-func-hw-test-large: true Signed-off-by: Gromadzki, Tomasz --- Jenkinsfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 6e902890060..73935956ebb 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -291,10 +291,10 @@ pipeline { defaultValue: 'ci_nlt_1', description: 'Label to use for NLT tests') string(name: 'FUNCTIONAL_HARDWARE_MEDIUM_LABEL', - defaultValue: 'ci_nvme5', + defaultValue: 'ci_vmd5', description: 'Label to use for the Functional Hardware Medium (MD on SSD) stages') string(name: 'FUNCTIONAL_HARDWARE_MEDIUM_VERBS_PROVIDER_LABEL', - defaultValue: 'ci_nvme5', + defaultValue: 'ci_vmd5', description: 'Label to use for 5 node Functional Hardware Medium Verbs Provider (MD on SSD) stages') string(name: 'FUNCTIONAL_HARDWARE_MEDIUM_UCX_PROVIDER_LABEL', defaultValue: 'ci_ofed5', From cda21907a2e1fa6c2c3be039d6501600829a7665 Mon Sep 17 00:00:00 2001 From: "Gromadzki, Tomasz" Date: Wed, 24 Apr 2024 11:14:25 +0200 Subject: [PATCH 10/11] Force to use custom SPDK version. 
Test-tag: hw,medium,hot_plug PR-repos: spdk@PR-61 Required-githooks: true Quick-functional: true Skip-coverity-test: true Skip-fault-injection-test: true Skip-func-test-el8: true Skip-nlt: true Skip-python-bandit: true Skip-test-el-8.6-rpms: true Skip-unit-test-memcheck: true Skip-unit-tests: true Skip-func-hw-test-medium: true Skip-func-hw-test-medium-verbs-provider: true Skip-func-hw-test-medium-md-on-ssd: false Skip-func-hw-test-medium-verbs-provider-md-on-ssd: false Skip-func-hw-test-medium-ucx-provider: true Skip-func-hw-test-large: true Signed-off-by: Gromadzki, Tomasz --- Jenkinsfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 73935956ebb..d509cb03d17 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -217,13 +217,13 @@ pipeline { defaultValue: false, description: 'Do not build RPM packages for EL 8') booleanParam(name: 'CI_RPM_el9_NOBUILD', - defaultValue: false, + defaultValue: true, description: 'Do not build RPM packages for EL 9') booleanParam(name: 'CI_RPM_leap15_NOBUILD', - defaultValue: false, + defaultValue: true, description: 'Do not build RPM packages for Leap 15') booleanParam(name: 'CI_DEB_Ubuntu20_NOBUILD', - defaultValue: false, + defaultValue: true, description: 'Do not build DEB packages for Ubuntu 20') booleanParam(name: 'CI_ALLOW_UNSTABLE_TEST', defaultValue: false, From 55b555b891282c3e44a798ee3b044cd58e8c0512 Mon Sep 17 00:00:00 2001 From: Makito Kano Date: Sat, 27 Apr 2024 01:23:54 +0000 Subject: [PATCH 11/11] DAOS-15008 test: Use rd_fac:1 for container and RP_2GX for IOR object type Skip-unit-tests: true Skip-fault-injection-test: true Test-tag: test_no_activity PR-repos: spdk@PR-61:53 Signed-off-by: Makito Kano --- src/tests/ftest/vmd/hot_plug_no_activity.yaml | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/tests/ftest/vmd/hot_plug_no_activity.yaml b/src/tests/ftest/vmd/hot_plug_no_activity.yaml index 12b2284bd23..bd5d01494ac 100644 --- a/src/tests/ftest/vmd/hot_plug_no_activity.yaml +++ b/src/tests/ftest/vmd/hot_plug_no_activity.yaml @@ -1,7 +1,7 @@ hosts: # wolf # test_servers: 1 - # CI + # Aurora/CI test_servers: 2 timeout: 300 @@ -15,13 +15,20 @@ timeout: 300 # targets: 4 # nr_xs_helpers: 0 # storage: +# # PMEM # 0: # class: dcpm # scm_mount: /mnt/daos0 # scm_list: [/dev/pmem0] +# # MD-on-SSD +# # 0: +# # class: ram +# # scm_mount: /mnt/daos0 +# # scm_size: 5 # 1: # class: nvme -# bdev_list: ["0000:5d:05.5"] +# bdev_list: ["0000:5d:05.5"] # wolf-313 +# # bdev_roles: [wal, data, meta] # spdk_rpc_server: # enable: true # sock_addr: /var/tmp/spdk_0.sock @@ -58,10 +65,15 @@ pool: container: type: POSIX control_method: daos + properties: rd_fac:1 ior: flags: -v -W -w api: DFS - transfer_size: 1M - block_size: 1G - dfs_oclass: SX + transfer_size: 1G + block_size: 5G + # wolf + # dfs_oclass: SX + # CI + dfs_oclass: RP_2GX + dfs_dir_oclass: RP_2GX
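For reference, the hot-remove/hot-plug sequence that the final revision of the test drives, condensed into one sketch. It reuses only calls that appear in the patches above (run_pcmd, DmgCommand.storage_set_faulty, DmgCommand.storage_replace_nvme); spdk_sock_path and pci_addr come from the test yaml and "dmg storage query list-devices", and the state comments reflect the expected transitions checked by repeat_query_list_devices():

    # 1. Mark the target device faulty so DAOS evicts it (EVICTED / LED ON).
    dmg_command.storage_set_faulty(uuid=remove_uuid)
    # 2. Hot-remove the device behind VMD through the SPDK RPC server (UNPLUGGED / LED NA).
    command = f"sudo /usr/share/spdk/scripts/rpc.py -s {spdk_sock_path} vmd_remove_device {pci_addr}"
    run_pcmd(hosts=hostlist_servers, command=command)
    # 3. Rescan the VMD bus to hot-plug the device back (EVICTED / LED ON again).
    command = f"sudo /usr/share/spdk/scripts/rpc.py -s {spdk_sock_path} vmd_rescan"
    run_pcmd(hosts=hostlist_servers, command=command)
    # 4. Replace the evicted device with itself to bring it back online (NORMAL / LED OFF).
    dmg_command.storage_replace_nvme(old_uuid=remove_uuid, new_uuid=remove_uuid)

Each transition is confirmed by polling "dmg storage query list-devices" between steps.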