Skip to content

Commit

Permalink
Add GPU E2E Test
Browse files Browse the repository at this point in the history
This adds an E2E test for GPU use on Contrast.
It currently runs on the GPU-enabled bare-metal SNP runner.

The test currently only verifies that the GPU is available via
nvidia-smi, which also verifies that driver and CUDA work correctly.
  • Loading branch information
msanft committed Jan 20, 2025
1 parent 5f88997 commit a26ee1e
Show file tree
Hide file tree
Showing 5 changed files with 171 additions and 29 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/e2e_manual.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ on:
options:
- genpolicy
- getdents
- gpu
- openssl
- policy
- regression
Expand All @@ -24,6 +25,7 @@ on:
options:
- AKS-CLH-SNP
- K3s-QEMU-SNP
- K3s-QEMU-SNP-GPU
- K3s-QEMU-TDX
skip-undeploy:
description: "Skip undeploy"
Expand Down
91 changes: 91 additions & 0 deletions e2e/gpu/gpu_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
// Copyright 2024 Edgeless Systems GmbH
// SPDX-License-Identifier: AGPL-3.0-only

//go:build e2e

package gpu

import (
"bytes"
"context"
"flag"
"os"
"testing"
"time"

"github.com/edgelesssys/contrast/e2e/internal/contrasttest"
"github.com/edgelesssys/contrast/internal/kuberesource"
"github.com/edgelesssys/contrast/internal/manifest"
"github.com/edgelesssys/contrast/internal/platforms"
"github.com/stretchr/testify/require"
)

// gpuPodName is the name of the pod the test waits for and runs nvidia-smi in.
const gpuPodName = "gpu-pod"

// gpuName is the device name the nvidia-smi output is expected to contain.
const gpuName = "NVIDIA H100 PCIe"

// TestGPU runs e2e tests on a GPU-enabled Contrast deployment.
//
// It deploys the OpenSSL resources together with the coordinator, runs the
// usual generate/apply/set/verify steps, applies the GPU pod from testdata,
// and finally checks that nvidia-smi inside that pod reports the expected GPU.
func TestGPU(t *testing.T) {
	platform, err := platforms.FromString(contrasttest.Flags.PlatformStr)
	require.NoError(t, err)
	ct := contrasttest.New(t)

	runtimeHandler, err := manifest.RuntimeHandler(platform)
	require.NoError(t, err)

	// Assemble the deployment: OpenSSL resources followed by the coordinator,
	// then patch in the runtime handler and add port forwarders.
	resources := append(kuberesource.OpenSSL(), kuberesource.CoordinatorBundle()...)
	resources = kuberesource.PatchRuntimeHandlers(resources, runtimeHandler)
	resources = kuberesource.AddPortForwarders(resources)

	ct.Init(t, resources)

	// deployGPUPod fills the placeholders of the GPU pod template with the
	// namespace and runtime class of this test run and applies the result.
	deployGPUPod := func(t *testing.T) {
		podYAML, err := os.ReadFile("./e2e/gpu/testdata/gpu-pod.yaml")
		require.NoError(t, err)

		podYAML = bytes.ReplaceAll(podYAML, []byte("@@REPLACE_NAMESPACE@@"), []byte(ct.Namespace))
		podYAML = bytes.ReplaceAll(podYAML, []byte("@@REPLACE_RUNTIME@@"), []byte(ct.RuntimeClassName))

		ct.ApplyFromYAML(t, podYAML)
	}

	// Each step is a prerequisite for everything after it, so the test stops
	// at the first step that fails.
	steps := []struct {
		name    string
		run     func(*testing.T)
		failMsg string
	}{
		{"generate", ct.Generate, "contrast generate needs to succeed for subsequent tests"},
		{"apply", ct.Apply, "Kubernetes resources need to be applied for subsequent tests"},
		{"set", ct.Set, "contrast set needs to succeed for subsequent tests"},
		{"contrast verify", ct.Verify, "contrast verify needs to succeed for subsequent tests"},
		{"apply GPU pod", deployGPUPod, "GPU pod needs to deploy successfully for subsequent tests"},
	}
	for _, step := range steps {
		require.True(t, t.Run(step.name, step.run), step.failMsg)
	}

	t.Run("check GPU availability", func(t *testing.T) {
		// One deadline covers both waiting for the pod and the exec call.
		timeout := ct.FactorPlatformTimeout(5 * time.Minute)
		ctx, cancel := context.WithTimeout(context.Background(), timeout)
		defer cancel()

		err := ct.Kubeclient.WaitForPod(ctx, ct.Namespace, gpuPodName)
		require.NoError(t, err, "GPU pod %s did not start", gpuPodName)

		stdout, stderr, err := ct.Kubeclient.Exec(ctx, ct.Namespace, gpuPodName, []string{"/bin/sh", "-c", "nvidia-smi"})
		require.NoError(t, err, "stderr: %q", stderr)

		require.Contains(t, stdout, gpuName, "nvidia-smi output should contain %s", gpuName)
	})
}

// TestMain registers the contrasttest command-line flags and parses them
// before running the tests of this package, then exits with the run's status.
func TestMain(m *testing.M) {
	contrasttest.RegisterFlags()
	flag.Parse()

	exitCode := m.Run()
	os.Exit(exitCode)
}
25 changes: 25 additions & 0 deletions e2e/gpu/testdata/gpu-pod.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# TODO(msanft): Move this to internal/kuberesource/sets.go as soon as genpolicy
# support for GPU pods is added.
apiVersion: v1
kind: Pod
metadata:
name: gpu-pod
namespace: "@@REPLACE_NAMESPACE@@"
annotations:
# Allow-all policy: base64-encoded Rego (package agent_policy) that defaults every agent request to true.
# TODO(msanft): Generate a policy dynamically once we support policy generation for GPU pods.
io.katacontainers.config.agent.policy: IyBDb3B5cmlnaHQgKGMpIDIwMjMgTWljcm9zb2Z0IENvcnBvcmF0aW9uCiMKIyBTUERYLUxpY2Vuc2UtSWRlbnRpZmllcjogQXBhY2hlLTIuMAojCgpwYWNrYWdlIGFnZW50X3BvbGljeQoKZGVmYXVsdCBBZGRBUlBOZWlnaGJvcnNSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBBZGRTd2FwUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgQ2xvc2VTdGRpblJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IENvcHlGaWxlUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgQ3JlYXRlQ29udGFpbmVyUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgQ3JlYXRlU2FuZGJveFJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IERlc3Ryb3lTYW5kYm94UmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgRXhlY1Byb2Nlc3NSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBHZXRNZXRyaWNzUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgR2V0T09NRXZlbnRSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBHdWVzdERldGFpbHNSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBMaXN0SW50ZXJmYWNlc1JlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IExpc3RSb3V0ZXNSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBNZW1Ib3RwbHVnQnlQcm9iZVJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IE9ubGluZUNQVU1lbVJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFBhdXNlQ29udGFpbmVyUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgUHVsbEltYWdlUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgUmVhZFN0cmVhbVJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFJlbW92ZUNvbnRhaW5lclJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFJlbW92ZVN0YWxlVmlydGlvZnNTaGFyZU1vdW50c1JlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFJlc2VlZFJhbmRvbURldlJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFJlc3VtZUNvbnRhaW5lclJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFNldEd1ZXN0RGF0ZVRpbWVSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBTZXRQb2xpY3lSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBTaWduYWxQcm9jZXNzUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgU3RhcnRDb250YWluZXJSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBTdGFydFRyYWNpbmdSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBTdGF0c0NvbnRhaW5lclJlcXVlc3QgOj0gdHJ1ZQpkZWZhdWx0IFN0b3BUcmFjaW5nUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVHR5V2luUmVzaXplUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVXBkYXRlQ29udGFpbmVyUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVXBkYXRlRXBoZW1lcmFsTW91bnRzUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVXBkYXRlSW50ZXJmYWNlUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgVXBkYXRlUm91dGVzUmVxdWVzdCA6PSB0cnVlCmRlZmF1bHQgV2FpdFByb2Nlc3NSZXF1ZXN0IDo9IHRydWUKZGVmYXVsdCBXcml0ZVN0cmVhbVJlcXVlc3QgOj0gdHJ1ZQo=
io.katacontainers.config.hypervisor.default_memory: "15258"
cdi.k8s.io/gpu: "nvidia.com/pgpu=0"
spec:
runtimeClassName: "@@REPLACE_RUNTIME@@"
restartPolicy: OnFailure
containers:
- name: vllm
image: ghcr.io/edgelesssys/contrast/ubuntu:24.04
env:
- name: NVIDIA_VISIBLE_DEVICES
value: all
resources:
limits:
"nvidia.com/GH100_H100_PCIE": 1
17 changes: 15 additions & 2 deletions e2e/internal/contrasttest/contrasttest.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ type ContrastTest struct {
ImageReplacementsFile string
Platform platforms.Platform
NamespaceFile string
RuntimeClassName string
Kubeclient *kubeclient.Kubeclient

// outputs of contrast subcommands
Expand All @@ -70,15 +71,21 @@ type ContrastTest struct {

// New creates a new ContrastTest bound to the given test.
//
// The platform is parsed from Flags.PlatformStr, and the runtime class name is
// taken from the handler of the Contrast runtime class for that platform.
// Errors in either step are reported through require assertions on t.
func New(t *testing.T) *ContrastTest {
require := require.New(t)

platform, err := platforms.FromString(Flags.PlatformStr)
require.NoError(t, err)
require.NoError(err)

runtimeClass, err := kuberesource.ContrastRuntimeClass(platform)
require.NoError(err)

return &ContrastTest{
Namespace: MakeNamespace(t, Flags.NamespaceSuffix),
// Per-test temporary directory obtained from t.TempDir.
WorkDir: t.TempDir(),
ImageReplacementsFile: Flags.ImageReplacementsFile,
Platform: platform,
NamespaceFile: Flags.NamespaceFile,
// NOTE(review): Handler is dereferenced without a nil check; presumably
// ContrastRuntimeClass always sets it — confirm.
RuntimeClassName: *runtimeClass.Handler,
Kubeclient: kubeclient.NewForTest(t),
}
}
Expand Down Expand Up @@ -283,9 +290,15 @@ func patchReferenceValues(k *kubeclient.Kubeclient, platform platforms.Platform)
// Apply reads the generated resources.yml from the work directory and applies
// its contents to the Kubernetes test environment.
func (ct *ContrastTest) Apply(t *testing.T) {
	resourcesPath := path.Join(ct.WorkDir, "resources.yml")

	resourcesYAML, err := os.ReadFile(resourcesPath)
	require.NoError(t, err)

	ct.ApplyFromYAML(t, resourcesYAML)
}

// ApplyFromYAML applies the given YAML to the Kubernetes test environment.
func (ct *ContrastTest) ApplyFromYAML(t *testing.T, yaml []byte) {
require := require.New(t)

objects, err := kubeapi.UnmarshalUnstructuredK8SResource(yaml)
require.NoError(err)

Expand Down
65 changes: 38 additions & 27 deletions packages/by-name/contrast/package.nix
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ let
subPackages = [
"e2e/genpolicy"
"e2e/getdents"
"e2e/gpu"
"e2e/openssl"
"e2e/servicemesh"
"e2e/release"
Expand Down Expand Up @@ -81,35 +82,45 @@ let
];
};

snpRefVals = {
snp =
let
launch-digest =
if kata.contrast-node-installer-image.debugRuntime then
kata.snp-launch-digest.override { debug = true; }
else
kata.snp-launch-digest;
in
[
{
trustedMeasurement = lib.removeSuffix "\n" (builtins.readFile "${launch-digest}/milan.hex");
productName = "Milan";
}
{
trustedMeasurement = lib.removeSuffix "\n" (builtins.readFile "${launch-digest}/genoa.hex");
productName = "Genoa";
}
];
};
snpRefValsWith =
{
gpu,
}:
{
snp =
let
os-image =
if gpu then
kata.contrast-node-installer-image.gpu.os-image
else
kata.contrast-node-installer-image.os-image;
launch-digest = kata.snp-launch-digest.override {
inherit os-image;
debug = kata.contrast-node-installer-image.debugRuntime;
};
in
[
{
trustedMeasurement = lib.removeSuffix "\n" (builtins.readFile "${launch-digest}/milan.hex");
productName = "Milan";
}
{
trustedMeasurement = lib.removeSuffix "\n" (builtins.readFile "${launch-digest}/genoa.hex");
productName = "Genoa";
}
];
};

snpRefVals = snpRefValsWith { gpu = false; };
snpGpuRefVals = snpRefValsWith { gpu = true; };

tdxRefVals = {
tdx = [
(
let
launch-digests =
if kata.contrast-node-installer-image.debugRuntime then
kata.tdx-launch-digests.override { debug = true; }
else
kata.tdx-launch-digests;
launch-digests = kata.tdx-launch-digests.override {
debug = kata.contrast-node-installer-image.debugRuntime;
};
in
{
mrTd = builtins.readFile "${launch-digests}/mrtd.hex";
Expand All @@ -135,9 +146,9 @@ let
"${k3s-qemu-tdx-handler}" = tdxRefVals;
"${rke2-qemu-tdx-handler}" = tdxRefVals;
"${metal-qemu-snp-handler}" = snpRefVals;
"${metal-qemu-snp-gpu-handler}" = snpRefVals;
"${metal-qemu-snp-gpu-handler}" = snpGpuRefVals;
"${k3s-qemu-snp-handler}" = snpRefVals;
"${k3s-qemu-snp-gpu-handler}" = snpRefVals;
"${k3s-qemu-snp-gpu-handler}" = snpGpuRefVals;
}
);

Expand Down

0 comments on commit a26ee1e

Please sign in to comment.