diff --git a/tests/gpu-container b/tests/gpu-container
index c2dc907b..d669711f 100755
--- a/tests/gpu-container
+++ b/tests/gpu-container
@@ -125,9 +125,69 @@
 lxc config device add c1 gpu0 gpu id="nvidia.com/gpu=0"
 lxc start c1
 [ "$(lxc exec c1 -- ls /dev/dri/ | grep -c '^card[0-9]')" = "1" ] || false
 lxc exec c1 -- nvidia-smi
+lxc delete -f c1
+
+# Check that CDI device files are cleanly removed even if the host machine is abruptly rebooted
+echo "==> Testing that CDI device files are cleanly removed after abrupt reboot"
+lxc init "${IMAGE}" v1 --vm
+if hasNeededAPIExtension devlxd_images_vm; then
+  lxc config set v1 security.devlxd.images=true
+fi
+
+lxc config device add v1 gpu0 gpu pci="${first_card_pci_slot}"
+lxc start v1
+echo "==> Waiting for the VM agent to be ready"
+waitInstanceBooted v1
+
+echo "==> Installing NVIDIA drivers inside the VM"
+lxc exec v1 -- apt-get update
+lxc exec v1 --env DEBIAN_FRONTEND=noninteractive -- apt-get install -y ubuntu-drivers-common
+lxc exec v1 --env DEBIAN_FRONTEND=noninteractive -- ubuntu-drivers autoinstall
+
+echo "==> Rebooting the VM to load NVIDIA drivers"
+lxc restart v1
+
+waitInstanceBooted v1
+
+echo "==> Verifying NVIDIA driver installation in the VM"
+lxc exec v1 -- nvidia-smi
+
+echo "==> Installing LXD inside the VM"
+lxc exec v1 -- snap install lxd --channel="${LXD_SNAP_CHANNEL}"
+
+echo "==> Initializing LXD inside the VM"
+lxc exec v1 -- lxd init --auto
+
+echo "==> Launching a container inside the VM"
+lxc exec v1 -- lxc init "${IMAGE}" c1
+
+echo "==> Adding GPU to the container inside the VM using CDI"
+lxc exec v1 -- lxc config device add c1 gpu0 gpu id="nvidia.com/gpu=0"
+lxc exec v1 -- lxc start c1
+# Wait for the container to be ready
+sleep 20
+
+echo "==> Verifying GPU access inside the container"
+lxc exec v1 -- lxc exec c1 -- nvidia-smi
+
+echo "==> Simulating abrupt reboot by force-stopping the VM"
+lxc stop v1 -f
+
+echo "==> Starting the VM again"
+lxc start v1
+
+waitInstanceBooted v1
+
+echo "==> Starting the container inside the VM after reboot"
+lxc exec v1 -- lxc start c1
+
+echo "==> Verifying GPU access inside the container after VM reboot"
+lxc exec v1 -- lxc exec c1 -- nvidia-smi
+
+echo "==> Cleaning up the VM"
+lxc delete v1 -f
 echo "==> Cleaning up"
-lxc delete -f c1
 lxc profile device remove default root
 lxc profile device remove default eth0
 lxc storage delete default