diff --git a/lisa/features/gpu.py b/lisa/features/gpu.py index d7dd21a267..2e48f81a33 100644 --- a/lisa/features/gpu.py +++ b/lisa/features/gpu.py @@ -193,7 +193,7 @@ def get_gpu_count_with_lsvmbus(self) -> int: bridge_device_count = 0 lsvmbus_tool = self._node.tools[Lsvmbus] - device_list = lsvmbus_tool.get_device_channels_from_lsvmbus() + device_list = lsvmbus_tool.get_device_channels() for device in device_list: for name, id, bridge_count in self.gpu_devices: if id in device.device_id: diff --git a/lisa/tools/lsvmbus.py b/lisa/tools/lsvmbus.py index d7138b613c..20cdd1376f 100644 --- a/lisa/tools/lsvmbus.py +++ b/lisa/tools/lsvmbus.py @@ -192,10 +192,9 @@ def install(self) -> bool: return self._check_exists() - def get_device_channels_from_lsvmbus( - self, force_run: bool = False - ) -> List[VmBusDevice]: + def get_device_channels(self, force_run: bool = False) -> List[VmBusDevice]: if (not self._vmbus_devices) or force_run: + self._vmbus_devices = [] result = self.run("-vv", force_run=force_run, shell=True) if result.exit_code != 0: result = self.run("-vv", force_run=force_run, shell=True, sudo=True) diff --git a/microsoft/testsuites/core/cpu.py b/microsoft/testsuites/core/cpu.py index e6747e1430..a4e5d44a94 100644 --- a/microsoft/testsuites/core/cpu.py +++ b/microsoft/testsuites/core/cpu.py @@ -3,12 +3,10 @@ from __future__ import annotations import time -from pathlib import PurePosixPath from assertpy.assertpy import assert_that from lisa import ( - BadEnvironmentStateException, LisaException, Logger, Node, @@ -17,89 +15,19 @@ TestSuite, TestSuiteMetadata, ) -from lisa.tools import Cat, Echo, InterruptInspector, Lscpu, Lsvmbus, TaskSet, Uname - - -class CPUState: - OFFLINE: str = "0" - ONLINE: str = "1" - +from lisa.tools import Cat, InterruptInspector, Lscpu, TaskSet, Uname hyperv_interrupt_substr = ["hyperv", "Hypervisor", "Hyper-V"] @TestSuiteMetadata( - area="cpu", + area="core", category="functional", description=""" This test suite is used to run CPU 
related tests. """, ) class CPU(TestSuite): - @TestCaseMetadata( - description=""" - This test will check that CPU assigned to lsvmbus - channels cannot be put offline. - Steps : - 1. Get the list of lsvmbus channel cpu mappings using - command `lsvmbus -vv`. - 2. Create a set of cpu's assigned to lsvmbus channels. - 3. Try to put cpu offline by running - `echo 0 > /sys/devices/system/cpu/cpu//online`. - Note : We skip cpu 0 as it handles system interrupts. - 4. Ensure that cpu is still online by checking state '1' in - `/sys/devices/system/cpu/cpu//online`. - """, - priority=2, - ) - def cpu_verify_vmbus_force_online(self, node: Node, log: Logger) -> None: - cpu_count = node.tools[Lscpu].get_core_count() - log.debug(f"{cpu_count} CPU cores detected...") - - # Find CPUs(except CPU0) which are mapped to LSVMBUS channels and have - # `sys/devices/system/cpu/cpu/cpu/online` file present. - channels = node.tools[Lsvmbus].get_device_channels_from_lsvmbus() - is_non_zero_cpu_id_mapped = False - mapped_cpu_set = set() - for channel in channels: - for channel_vp_map in channel.channel_vp_map: - target_cpu = channel_vp_map.target_cpu - if target_cpu == "0": - continue - is_non_zero_cpu_id_mapped = True - file_path = self._get_cpu_config_file(target_cpu) - file_exists = node.shell.exists(PurePosixPath(file_path)) - if file_exists: - mapped_cpu_set.add(target_cpu) - - # Fail test if `/sys/devices/system/cpu/cpu/cpu/online` file does - # not exist for all CPUs(except CPU0) mapped to LSVMBUS channels. This - # is to catch distros which have this unexpected behaviour. - if is_non_zero_cpu_id_mapped and not mapped_cpu_set: - raise LisaException( - "/sys/devices/system/cpu/cpu/cpu/online file" - "does not exists for all CPUs mapped to LSVMBUS channels." 
- ) - - for target_cpu in mapped_cpu_set: - log.debug(f"Checking CPU {target_cpu} on /sys/device/....") - result = self._set_cpu_state(target_cpu, CPUState.OFFLINE, node) - if result: - # Try to bring CPU back to it's original state - reset = self._set_cpu_state(target_cpu, CPUState.ONLINE, node) - exception_message = ( - f"Expected CPU {target_cpu} state : {CPUState.ONLINE}(online), " - f"actual state : {CPUState.OFFLINE}(offline). CPU's mapped to " - f"LSVMBUS channels shouldn't be in state " - f"{CPUState.OFFLINE}(offline)." - ) - if not reset: - raise BadEnvironmentStateException( - exception_message, - f"The test failed leaving CPU {target_cpu} in a bad state.", - ) - raise AssertionError(exception_message) - @TestCaseMetadata( description=""" This test case will check that L3 cache is correctly mapped @@ -268,15 +196,6 @@ def verify_vmbus_interrupts(self, node: Node, log: Logger) -> None: if not found_hyperv_interrupt: raise LisaException("Hyper-V interrupts are not recorded.") - def _get_cpu_config_file(self, cpu_id: str) -> str: - return f"/sys/devices/system/cpu/cpu{cpu_id}/online" - - def _set_cpu_state(self, cpu_id: str, state: str, node: Node) -> bool: - file_path = self._get_cpu_config_file(cpu_id) - node.tools[Echo].write_to_file(state, node.get_pure_path(file_path), sudo=True) - result = node.tools[Cat].read(file_path, force_run=True, sudo=True) - return result == state - def _create_stimer_interrupts(self, node: Node, cpu_count: int) -> None: # Run CPU intensive workload to create hyper-v synthetic timer # interrupts. 
diff --git a/microsoft/testsuites/core/lsvmbus.py b/microsoft/testsuites/core/lsvmbus.py
index fe946ebb63..8ac574732c 100644
--- a/microsoft/testsuites/core/lsvmbus.py
+++ b/microsoft/testsuites/core/lsvmbus.py
@@ -70,7 +70,7 @@ def lsvmbus_count_devices_channels(self, node: Node) -> None:
             "1" == node.tools[VmGeneration].get_generation()
         )
         lsvmbus_tool = node.tools[Lsvmbus]
-        vmbus_devices_list = lsvmbus_tool.get_device_channels_from_lsvmbus()
+        vmbus_devices_list = lsvmbus_tool.get_device_channels()
         actual_vmbus_device_names = [x.name for x in vmbus_devices_list]
         assert_that(actual_vmbus_device_names).is_not_none()
         assert_that(vmbus_devices.names).is_subset_of(actual_vmbus_device_names)
diff --git a/microsoft/testsuites/cpu/common.py b/microsoft/testsuites/cpu/common.py
new file mode 100644
index 0000000000..54fa10cf40
--- /dev/null
+++ b/microsoft/testsuites/cpu/common.py
@@ -0,0 +1,205 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+from __future__ import annotations
+
+from typing import Dict, List
+
+from lisa import BadEnvironmentStateException, Logger, Node
+from lisa.tools import Cat, Dmesg, Echo, Lscpu, Lsvmbus, Uname
+from lisa.util import SkippedException
+
+
+class CPUState:
+    """Values written to and read back from a cpu's sysfs ``online`` file."""
+
+    OFFLINE: str = "0"
+    ONLINE: str = "1"
+
+
+def check_runnable(node: Node) -> None:
+    """Skip the case unless the kernel config contains ``CONFIG_HOTPLUG_CPU=y``.
+
+    Raises:
+        SkippedException: grep on ``/boot/config-{kernel_version}`` exits non-zero.
+    """
+    uname = node.tools[Uname]
+    kernel_version = uname.get_linux_information().kernel_version
+    config_path = f"/boot/config-{kernel_version}"
+    config_result = node.execute(f"grep CONFIG_HOTPLUG_CPU=y {config_path}", shell=True)
+    if config_result.exit_code != 0:
+        raise SkippedException(
+            f"the distro {node.os.name} doesn't support cpu hotplug."
+        )
+
+
+def set_interrupts_assigned_cpu(
+    log: Logger, node: Node, target_cpu: str = "0"
+) -> Dict[str, str]:
+    """Point every vmbus channel interrupt at ``target_cpu`` when supported.
+
+    Channels are only changed when kernel version >= 5.8.0 and vmbus version >=
+    4.1.0; otherwise a debug message is logged and nothing is written.
+
+    Returns:
+        A dict mapping the sysfs cpu file of each changed channel to the cpu it
+        targeted before the change. Empty when nothing was changed.
+    """
+    uname = node.tools[Uname]
+    kernel_version = uname.get_linux_information().kernel_version
+    dmesg = node.tools[Dmesg]
+    lsvmbus = node.tools[Lsvmbus]
+    vmbus_version = dmesg.get_vmbus_version()
+    original_cpus: Dict[str, str] = {}
+    # the vmbus interrupt channel reassignment feature is available in 5.8+ kernel and
+    # vmbus version in 4.1+, the vmbus version is negotiated with the host.
+    if kernel_version >= "5.8.0" and vmbus_version >= "4.1.0":
+        # save the raw cpu number for each channel for restoring later.
+        channels = lsvmbus.get_device_channels(force_run=True)
+        for channel in channels:
+            for channel_vp_map in channel.channel_vp_map:
+                current_target_cpu = channel_vp_map.target_cpu
+                if current_target_cpu == target_cpu:
+                    continue
+                cpu_file = get_interrupts_assigned_cpu(
+                    channel.device_id, channel_vp_map.rel_id
+                )
+                original_cpus[cpu_file] = current_target_cpu
+        # set all vmbus channel interrupts go into cpu target_cpu.
+        assign_interrupts(original_cpus, node, target_cpu)
+    else:
+        # if current distro doesn't support this feature, the backup dict will be empty,
+        # there is nothing we can restore later, the case will rely on actual cpu usage
+        # on vm, if no idle cpu, then case will be skipped.
+        log.debug(
+            f"current distro {node.os.name}, os version {kernel_version}, "
+            f"vmbus version {vmbus_version} doesn't support "
+            "change channels target cpu feature."
+        )
+    return original_cpus
+
+
+def get_idle_cpus(node: Node) -> List[str]:
+    """Return the ids of cpus, other than cpu 0, that no vmbus channel targets."""
+    lsvmbus = node.tools[Lsvmbus]
+    channels = lsvmbus.get_device_channels(force_run=True)
+    # get all cpu in use from vmbus channels assignment
+    cpu_in_used = {
+        channel_vp_map.target_cpu
+        for channel in channels
+        for channel_vp_map in channel.channel_vp_map
+    }
+
+    # cpu 0 is never a candidate, usually cpu 0 is not allowed to do hotplug
+    cpu_count = node.tools[Lscpu].get_core_count()
+    return [str(x) for x in range(1, cpu_count) if str(x) not in cpu_in_used]
+
+
+def set_idle_cpu_offline_online(log: Logger, node: Node, idle_cpu: List[str]) -> None:
+    """Take each cpu in ``idle_cpu`` offline, then bring it back online.
+
+    Raises:
+        BadEnvironmentStateException: a cpu's state file does not read back the
+            state that was just written to it.
+    """
+    for target_cpu in idle_cpu:
+        set_offline = set_cpu_state(node, target_cpu, False)
+        log.debug(f"set cpu{target_cpu} from online to offline.")
+        if not set_offline:
+            raise BadEnvironmentStateException(
+                f"expected cpu{target_cpu} state: {CPUState.OFFLINE}(offline), "
+                f"actual state: {CPUState.ONLINE}(online).",
+                f"the test failed leaving cpu{target_cpu} in a bad state.",
+            )
+
+        set_online = set_cpu_state(node, target_cpu, True)
+        log.debug(f"set cpu{target_cpu} from offline to online.")
+        if not set_online:
+            raise BadEnvironmentStateException(
+                f"expected cpu{target_cpu} state: {CPUState.ONLINE}(online), "
+                f"actual state: {CPUState.OFFLINE}(offline).",
+                f"the test failed leaving cpu{target_cpu} in a bad state.",
+            )
+
+
+def verify_cpu_hot_plug(log: Logger, node: Node, run_times: int = 1) -> None:
+    """Run the cpu hotplug check ``run_times`` times.
+
+    Each round moves vmbus channel interrupts to cpu 0 where supported, takes the
+    idle cpus offline and back online, then restores the channel assignment.
+
+    Raises:
+        SkippedException: the kernel lacks cpu hotplug, or no idle cpu is found.
+    """
+    check_runnable(node)
+    file_path_list: Dict[str, str] = {}
+    restore_state = False
+    try:
+        for iteration in range(1, run_times + 1):
+            log.debug(f"start the {iteration} time(s) testing.")
+            restore_state = False
+            # set vmbus channels target cpu into 0 if kernel supports this feature.
+            file_path_list = set_interrupts_assigned_cpu(log, node)
+            # when kernel doesn't support above feature, we have to rely on current vm's
+            # cpu usage. then collect the cpu not in used exclude cpu0.
+            idle_cpu = get_idle_cpus(node)
+            if not idle_cpu:
+                raise SkippedException(
+                    "all of the cpu are associated vmbus channels,"
+                    " no idle cpu can be used to test hotplug."
+                )
+            # start to take idle cpu from online to offline, then offline to online.
+            set_idle_cpu_offline_online(log, node, idle_cpu)
+            # when kernel doesn't support set vmbus channels target cpu feature, the
+            # dict which stores original status is empty, nothing need to be restored.
+            restore_interrupts_assignment(file_path_list, node)
+            restore_state = True
+    finally:
+        # a round that was skipped or failed half way still has to be restored.
+        if not restore_state:
+            restore_interrupts_assignment(file_path_list, node)
+
+
+def get_cpu_state_file(cpu_id: str) -> str:
+    """Return the sysfs path of the ``online`` file of cpu ``cpu_id``."""
+    return f"/sys/devices/system/cpu/cpu{cpu_id}/online"
+
+
+def get_interrupts_assigned_cpu(device_id: str, channel_id: str) -> str:
+    """Return the sysfs path of the ``cpu`` file of a vmbus device channel."""
+    return f"/sys/bus/vmbus/devices/{device_id}/channels/{channel_id}/cpu"
+
+
+def assign_interrupts(
+    path_cpu: Dict[str, str],
+    node: Node,
+    target_cpu: str = "0",
+) -> None:
+    """Write ``target_cpu`` to every channel cpu file that is a key of ``path_cpu``."""
+    # only the keys (file paths) are used, the saved cpu values are ignored here.
+    for path in path_cpu:
+        node.tools[Echo].write_to_file(target_cpu, node.get_pure_path(path), sudo=True)
+
+
+def restore_interrupts_assignment(
+    path_cpu: Dict[str, str],
+    node: Node,
+) -> None:
+    """Write each saved cpu in ``path_cpu`` back to its channel cpu file."""
+    if path_cpu:
+        for path, target_cpu in path_cpu.items():
+            node.tools[Echo].write_to_file(
+                target_cpu, node.get_pure_path(path), sudo=True
+            )
+
+
+def set_cpu_state(node: Node, cpu: str, online: bool = False) -> bool:
+    """Write the requested state to the cpu's ``online`` file and verify it.
+
+    Returns:
+        True when the file reads back the state that was written.
+    """
+    file_path = get_cpu_state_file(cpu)
+    state = CPUState.ONLINE if online else CPUState.OFFLINE
+    node.tools[Echo].write_to_file(state, node.get_pure_path(file_path), sudo=True)
+    result = node.tools[Cat].read(file_path, force_run=True, sudo=True)
+    return result == state
diff --git a/microsoft/testsuites/cpu/cpusuite.py b/microsoft/testsuites/cpu/cpusuite.py
new file mode 100644
index 0000000000..81f8f06080
--- /dev/null
+++ b/microsoft/testsuites/cpu/cpusuite.py
@@ -0,0 +1,53 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+from __future__ import annotations
+
+from lisa import (
+    Logger,
+    Node,
+    TestCaseMetadata,
+    TestSuite,
+    TestSuiteMetadata,
+    simple_requirement,
+)
+from microsoft.testsuites.cpu.common import verify_cpu_hot_plug
+
+
+@TestSuiteMetadata(
+    area="cpu",
+    category="functional",
+    description="""
+    This test suite is used to run cpu related tests.
+    """,
+)
+class CPUSuite(TestSuite):
+    @TestCaseMetadata(
+        description="""
+            This test will check cpu hotplug.
+
+            Steps :
+            1. skip test case when kernel doesn't support cpu hotplug.
+            2. set all vmbus channels target to cpu 0.
+             when kernel version >= 5.8 and vmbus version >= 4.1, code supports changing
+             vmbus channels target cpu, by setting the cpu number to the file
+             /sys/bus/vmbus/devices/{device_id}/channels/{channel_id}/cpu.
+             then all cpus except for cpu 0 are in idle state.
+             2.1 save the raw cpu number of each channel for restoring after testing.
+             2.2 set all vmbus channel interrupts go into cpu 0.
+            3. collect idle cpu which can be used for hotplug.
+             if the kernel supports step 2, now in used cpu is 0.
+             exclude the in used cpu from all cpu list to get idle cpu set which can be
+             offline and online.
+             if the kernel doesn't support step 2,
+             the idle cpu is quite rely on the cpu usage at that time.
+            4. skip testing when there is no idle cpu can be set offline and online.
+            5. set idle cpu offline then back to online.
+            6. restore the cpu vmbus channel target cpu back to the original state.
+            """,
+        priority=3,
+        requirement=simple_requirement(
+            min_core_count=32,
+        ),
+    )
+    def verify_cpu_hot_plug(self, log: Logger, node: Node) -> None:
+        verify_cpu_hot_plug(log, node)
diff --git a/microsoft/testsuites/cpu/stress.py b/microsoft/testsuites/cpu/stress.py
new file mode 100644
index 0000000000..dd1dcd6d5d
--- /dev/null
+++ b/microsoft/testsuites/cpu/stress.py
@@ -0,0 +1,35 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+from __future__ import annotations
+
+from lisa import (
+    Logger,
+    Node,
+    TestCaseMetadata,
+    TestSuite,
+    TestSuiteMetadata,
+    simple_requirement,
+)
+from microsoft.testsuites.cpu.common import verify_cpu_hot_plug
+
+
+@TestSuiteMetadata(
+    area="cpu",
+    category="stress",
+    description="""
+    This test suite is used to run cpu related tests under stress.
+    """,
+)
+class CPUStressSuite(TestSuite):
+    @TestCaseMetadata(
+        description="""
+            This test will check cpu hotplug under stress.
+            Detailed steps please refer case verify_cpu_hot_plug.
+            """,
+        priority=3,
+        requirement=simple_requirement(
+            min_core_count=32,
+        ),
+    )
+    def verify_cpu_hot_plug_stress(self, log: Logger, node: Node) -> None:
+        verify_cpu_hot_plug(log, node, 10)