From aa1269db71c1ccd0b433d9d3e4547622a4df2fe0 Mon Sep 17 00:00:00 2001 From: Moritz Sanft <58110325+msanft@users.noreply.github.com> Date: Tue, 19 Nov 2024 16:11:19 +0100 Subject: [PATCH 1/5] packages/buildVerityMicroVM: init This adds a Nix builder to build a micro VM image for direct Linux boot, specifically for the bare-metal Kata image where this is necessary to satisfy Contrast's security assumptions made on the SNP launch digest computation. --- .../by-name/buildVerityMicroVM/package.nix | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 packages/by-name/buildVerityMicroVM/package.nix diff --git a/packages/by-name/buildVerityMicroVM/package.nix b/packages/by-name/buildVerityMicroVM/package.nix new file mode 100644 index 0000000000..b56bd10a48 --- /dev/null +++ b/packages/by-name/buildVerityMicroVM/package.nix @@ -0,0 +1,59 @@ +# Copyright 2024 Edgeless Systems GmbH +# SPDX-License-Identifier: AGPL-3.0-only + +# Builds a micro VM image (i.e. rootfs, kernel and kernel cmdline) from a NixOS +# configuration. These components can then be booted in a microVM-fashion +# with QEMU's direct Linux boot feature. +# See: https://qemu-project.gitlab.io/qemu/system/linuxboot.html + +{ + symlinkJoin, + lib, +}: + +nixos-config: + +let + image = nixos-config.image.overrideAttrs (oldAttrs: { + passthru = oldAttrs.passthru // { + imageFileName = "${oldAttrs.pname}_${oldAttrs.version}.raw"; + }; + }); +in + +lib.throwIf + (lib.foldlAttrs ( + acc: _: partConfig: + acc || (partConfig.repartConfig.Type == "esp") + ) false nixos-config.config.image.repart.partitions) + "MicroVM images should not contain an ESP." 
+ + symlinkJoin + { + pname = "microvm-image"; + inherit (nixos-config.config.system.image) version; + + paths = [ + nixos-config.config.system.build.kernel + nixos-config.config.system.build.initialRamdisk + image + ]; + + passthru = + let + roothash = builtins.head ( + lib.map (e: e.roothash) (builtins.fromJSON (builtins.readFile "${image}/repart-output.json")) + ); + in + { + cmdline = lib.concatStringsSep " " ( + nixos-config.config.boot.kernelParams + ++ [ + "init=${nixos-config.config.system.build.toplevel}/init" + "roothash=${roothash}" + ] + ); + inherit (image) imageFileName; + inherit (nixos-config.config.system.build) image kernel initialRamdisk; + }; + } From d90736ad70410ee223b17c4ed163a45a813bd63a Mon Sep 17 00:00:00 2001 From: Moritz Sanft <58110325+msanft@users.noreply.github.com> Date: Tue, 19 Nov 2024 16:13:21 +0100 Subject: [PATCH 2/5] packages/kata-kernel-uvm: add config options for bare-metal use Using the Kata kernel with a baremetal NixOS image requires some additional config options to specify NixOS' sanity checks, so add them here. --- packages/by-name/kata/kata-kernel-uvm/package.nix | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/packages/by-name/kata/kata-kernel-uvm/package.nix b/packages/by-name/kata/kata-kernel-uvm/package.nix index b3cc2cb024..79d0ba442f 100644 --- a/packages/by-name/kata/kata-kernel-uvm/package.nix +++ b/packages/by-name/kata/kata-kernel-uvm/package.nix @@ -27,8 +27,19 @@ let # 3. Disable module signing to make the build reproducable. 
substituteInPlace $config \ --replace-fail 'CONFIG_INITRAMFS_SOURCE="initramfs.cpio.gz"' 'CONFIG_INITRAMFS_SOURCE=""' \ + --replace-fail 'CONFIG_MODULE_SIG=y' 'CONFIG_MODULE_SIG=n' \ --replace-fail '# CONFIG_DM_INIT is not set' 'CONFIG_DM_INIT=y' \ - --replace-fail 'CONFIG_MODULE_SIG=y' 'CONFIG_MODULE_SIG=n' + --replace-fail '# CONFIG_DMIID is not set' 'CONFIG_DMIID=y' \ + --replace-fail '# CONFIG_TMPFS_POSIX_ACL is not set' 'CONFIG_TMPFS_POSIX_ACL=y' \ + --replace-fail '# CONFIG_TMPFS_XATTR is not set' 'CONFIG_TMPFS_XATTR=y' \ + --replace-fail '# CONFIG_EFIVAR_FS is not set' 'CONFIG_EFIVAR_FS=y' \ + --replace-fail '# CONFIG_RD_ZSTD is not set' 'CONFIG_RD_ZSTD=y' \ + --replace-fail '# CONFIG_VFAT_FS is not set' 'CONFIG_VFAT_FS=y' \ + --replace-fail '# CONFIG_NLS_CODEPAGE_437 is not set' 'CONFIG_NLS_CODEPAGE_437=y' \ + --replace-fail '# CONFIG_NLS_ISO8859_1 is not set' 'CONFIG_NLS_ISO8859_1=y' \ + --replace-fail '# CONFIG_ATA is not set' 'CONFIG_ATA=y' + + echo "CONFIG_ATA_PIIX=y" >> $config ''; dontBuild = true; From 4ea3dc4d8e8c16eae0476e14ca3fcc14a1067e52 Mon Sep 17 00:00:00 2001 From: Moritz Sanft <58110325+msanft@users.noreply.github.com> Date: Tue, 19 Nov 2024 16:14:33 +0100 Subject: [PATCH 3/5] packages/kata-runtime: allow booting with image and initrd Kata has a check to see if only image OR initrd are supplied, which is not needed for our use-case. So add a patch to remove that. This should probably be brought upstream in a usable fashion later on.
--- ...ime-allow-initrd-AND-image-to-be-set.patch | 70 +++++++++++++++++++ .../by-name/kata/kata-runtime/package.nix | 4 ++ 2 files changed, 74 insertions(+) create mode 100644 packages/by-name/kata/kata-runtime/0017-runtime-allow-initrd-AND-image-to-be-set.patch diff --git a/packages/by-name/kata/kata-runtime/0017-runtime-allow-initrd-AND-image-to-be-set.patch b/packages/by-name/kata/kata-runtime/0017-runtime-allow-initrd-AND-image-to-be-set.patch new file mode 100644 index 0000000000..2226146eb0 --- /dev/null +++ b/packages/by-name/kata/kata-runtime/0017-runtime-allow-initrd-AND-image-to-be-set.patch @@ -0,0 +1,70 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Moritz Sanft <58110325+msanft@users.noreply.github.com> +Date: Mon, 18 Nov 2024 12:41:40 +0100 +Subject: [PATCH] runtime: allow initrd AND image to be set + +Signed-off-by: Moritz Sanft <58110325+msanft@users.noreply.github.com> +--- + .../virtcontainers/hypervisor_config_darwin.go | 2 -- + .../virtcontainers/hypervisor_config_linux.go | 2 -- + src/runtime/virtcontainers/qemu.go | 18 +++--------------- + 3 files changed, 3 insertions(+), 19 deletions(-) + +diff --git a/src/runtime/virtcontainers/hypervisor_config_darwin.go b/src/runtime/virtcontainers/hypervisor_config_darwin.go +index 1225271a2a4c5d9340022c22ee6889171bc21b93..a3398bcf6fac68e272a4ca1de962e585c4cf4fae 100644 +--- a/src/runtime/virtcontainers/hypervisor_config_darwin.go ++++ b/src/runtime/virtcontainers/hypervisor_config_darwin.go +@@ -21,8 +21,6 @@ func validateHypervisorConfig(conf *HypervisorConfig) error { + + if conf.ImagePath == "" && conf.InitrdPath == "" { + return fmt.Errorf("Missing image and initrd path") +- } else if conf.ImagePath != "" && conf.InitrdPath != "" { +- return fmt.Errorf("Image and initrd path cannot be both set") + } + + if conf.NumVCPUs == 0 { +diff --git a/src/runtime/virtcontainers/hypervisor_config_linux.go b/src/runtime/virtcontainers/hypervisor_config_linux.go +index 
f41cd22bd4ba96e5305ccb58e74c6d983b077974..8e1ca38eb620d58ffd4c83bbf4c666c1bc21efc3 100644 +--- a/src/runtime/virtcontainers/hypervisor_config_linux.go ++++ b/src/runtime/virtcontainers/hypervisor_config_linux.go +@@ -28,8 +28,6 @@ func validateHypervisorConfig(conf *HypervisorConfig) error { + } + } else if conf.ImagePath == "" && conf.InitrdPath == "" { + return fmt.Errorf("Missing image and initrd path") +- } else if conf.ImagePath != "" && conf.InitrdPath != "" { +- return fmt.Errorf("Image and initrd path cannot be both set") + } + + if err := conf.CheckTemplateConfig(); err != nil { +diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go +index 2c6311c067935a2c5da0a1018420bab684b670e8..3f4e143349e7467e530b5e3593f65134f9a5798c 100644 +--- a/src/runtime/virtcontainers/qemu.go ++++ b/src/runtime/virtcontainers/qemu.go +@@ -415,24 +415,12 @@ func (q *qemu) buildDevices(ctx context.Context, kernelPath string) ([]govmmQemu + return nil, nil, nil, err + } + +- assetPath, assetType, err := q.config.ImageOrInitrdAssetPath() +- if err != nil { +- return nil, nil, nil, err +- } +- +- if assetType == types.ImageAsset { +- devices, err = q.arch.appendImage(ctx, devices, assetPath) ++ devices, err = q.arch.appendImage(ctx, devices, q.config.ImagePath) + if err != nil { + return nil, nil, nil, err + } +- } else if assetType == types.InitrdAsset { +- // InitrdAsset, need to set kernel initrd path +- kernel.InitrdPath = assetPath +- } else if assetType == types.SecureBootAsset { +- // SecureBootAsset, no need to set image or initrd path +- q.Logger().Info("For IBM Z Secure Execution, initrd path should not be set") +- kernel.InitrdPath = "" +- } ++ ++ kernel.InitrdPath = q.config.InitrdPath + + if q.config.IOMMU { + devices, err = q.arch.appendIOMMU(devices) diff --git a/packages/by-name/kata/kata-runtime/package.nix b/packages/by-name/kata/kata-runtime/package.nix index 087d0aa278..a3e7bd5a4d 100644 --- 
a/packages/by-name/kata/kata-runtime/package.nix +++ b/packages/by-name/kata/kata-runtime/package.nix @@ -93,6 +93,10 @@ buildGoModule rec { ./0014-kata-sys-util-remove-obsolete-cgroups-dependency.patch ./0015-kata-sys-util-move-json-parsing-to-protocols-crate.patch ./0016-protocols-only-build-RLimit-impls-on-Linux.patch + + # Disable a check in Kata that prevents to set both image and initrd. + # For us, there's no practical reason not to do so. + ./0017-runtime-allow-initrd-AND-image-to-be-set.patch ]; }; From 6cf5c05ff2bff46a30b670b8072b4b9530c66ee3 Mon Sep 17 00:00:00 2001 From: Moritz Sanft <58110325+msanft@users.noreply.github.com> Date: Tue, 19 Nov 2024 16:15:57 +0100 Subject: [PATCH 4/5] packages/boot-microvm: init This adds a little helper script to boot a Micro VM, as we build them for Kata bare-metal, via QEMU. --- packages/by-name/boot-microvm/package.nix | 38 +++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 packages/by-name/boot-microvm/package.nix diff --git a/packages/by-name/boot-microvm/package.nix b/packages/by-name/boot-microvm/package.nix new file mode 100644 index 0000000000..416f82a26b --- /dev/null +++ b/packages/by-name/boot-microvm/package.nix @@ -0,0 +1,38 @@ +# Copyright 2024 Edgeless Systems GmbH +# SPDX-License-Identifier: AGPL-3.0-only + +{ + writeShellApplication, + qemu, + OVMF, +}: + +# Usage example: +# outPath=$(nix build .#kata.kata-image --print-out-paths); nix run .#boot-microvm -- "${outPath}/bzImage" "${outPath}/initrd" "${outPath}/image-podvm-gpu_1-rc1.raw" "$(nix eval --raw .#kata.kata-image.cmdline)" + +writeShellApplication { + name = "boot-microvm"; + runtimeInputs = [ qemu ]; + text = '' + if [ $# -ne 4 ]; then + echo "Usage: $0 <kernel> <initrd> <rootfs> <cmdline>"; + exit 1; + fi + + # Boot from a temporary copy of the image; remove the copy on exit. + tmpFile=$(mktemp) + trap 'rm -f "$tmpFile"' EXIT + cp "$3" "$tmpFile" + + qemu-system-x86_64 \ + -enable-kvm \ + -m 3G \ + -nographic \ + -drive if=pflash,format=raw,readonly=on,file=${OVMF.firmware} \ + -drive if=pflash,format=raw,readonly=on,file=${OVMF.variables} \ + -kernel
"$1" \ + -initrd "$2" \ + -append "$4" \ + -drive "if=virtio,format=raw,file=$tmpFile" + ''; +} From 7dc21e124903fc6136d2952cfd33b9b7d525d840 Mon Sep 17 00:00:00 2001 From: Moritz Sanft <58110325+msanft@users.noreply.github.com> Date: Tue, 19 Nov 2024 16:17:16 +0100 Subject: [PATCH 5/5] Add NixOS image for bare-metal Kata This switches the image used in our bare-metal Kata uses (e.g. non-AKS and non-peerpods) to a NixOS image that we build in-tree as a MicroVM image (e.g. separated kernel, initrd, cmdline and rootfs). --- docs/docs/features-limitations.md | 7 + .../constants/configuration-qemu-tdx.toml | 3 + nodeinstaller/internal/constants/constants.go | 21 +- packages/by-name/OVMF-TDX/package.nix | 2 +- packages/by-name/image-podvm/package.nix | 1 + .../contrast-node-installer-image/package.nix | 14 +- .../by-name/kata/kata-image/buildimage.sh | 121 ---- .../kata/kata-image/package-index.json | 562 ------------------ packages/by-name/kata/kata-image/package.nix | 252 +------- .../by-name/kata/kata-kernel-uvm/package.nix | 1 + .../by-name/kata/kata-runtime/package.nix | 12 + .../kata/snp-launch-digest/package.nix | 28 +- .../kata/tdx-launch-digests/package.nix | 28 +- packages/by-name/mkNixosConfig/package.nix | 6 +- ...hw-x86-load-initrd-to-static-address.patch | 87 +++ packages/by-name/qemu-tdx-static/package.nix | 2 + packages/nixos/azure.nix | 2 + packages/nixos/debug.nix | 11 +- packages/nixos/image.nix | 116 ++-- packages/nixos/kata.nix | 158 ++--- packages/nixos/peerpods.nix | 103 ++++ packages/nixos/system.nix | 12 +- tools/tdx-measure/main.go | 25 +- tools/tdx-measure/rtmr/rtmr.go | 42 +- 24 files changed, 514 insertions(+), 1102 deletions(-) delete mode 100644 packages/by-name/kata/kata-image/buildimage.sh delete mode 100644 packages/by-name/kata/kata-image/package-index.json create mode 100644 packages/by-name/qemu-tdx-static/0004-hw-x86-load-initrd-to-static-address.patch create mode 100644 packages/nixos/peerpods.nix diff --git 
a/docs/docs/features-limitations.md b/docs/docs/features-limitations.md index 386b0bc0d4..69a1c028ef 100644 --- a/docs/docs/features-limitations.md +++ b/docs/docs/features-limitations.md @@ -34,3 +34,10 @@ Currently, this requires inspecting the iptables rules on startup or terminating The Contrast Coordinator is a singleton and can't be scaled to more than one instance. When this instance's pod is restarted, for example for node maintenance, it needs to be recovered manually. In a future release, we plan to support distributed Coordinator instances that can recover automatically. + +## Overriding Kata configuration + +Kata Containers supports [overriding certain configuration values via Kubernetes annotations](https://github.com/kata-containers/kata-containers/blob/b4da4b5e3b9b21048af9333b071235a57a3e9493/docs/how-to/how-to-set-sandbox-config-kata.md). + +It needs to be noted that setting these values is unsupported, and doing so may lead to unexpected +behaviour, as Contrast isn't tested against all possible configuration combinations. diff --git a/nodeinstaller/internal/constants/configuration-qemu-tdx.toml b/nodeinstaller/internal/constants/configuration-qemu-tdx.toml index c37fd9b4ed..7dd1b0b590 100644 --- a/nodeinstaller/internal/constants/configuration-qemu-tdx.toml +++ b/nodeinstaller/internal/constants/configuration-qemu-tdx.toml @@ -18,6 +18,9 @@ cpu_features="-vmx-rdseed-exit,pmu=off" default_vcpus = 1 default_maxvcpus = 0 default_bridges = 1 +# On TDX, when lowering this, the patch: +# packages/by-name/qemu-tdx-static/0004-hw-x86-load-initrd-to-static-address.patch +# needs to be updated accordingly. 
default_memory = 2048 default_maxmemory = 0 disable_block_device_use = false diff --git a/nodeinstaller/internal/constants/constants.go b/nodeinstaller/internal/constants/constants.go index 89628d918d..be75b60c20 100644 --- a/nodeinstaller/internal/constants/constants.go +++ b/nodeinstaller/internal/constants/constants.go @@ -64,17 +64,17 @@ func KataRuntimeConfig(baseDir string, platform platforms.Platform, qemuExtraKer config.Hypervisor["qemu"]["path"] = filepath.Join(baseDir, "tdx", "bin", "qemu-system-x86_64") config.Hypervisor["qemu"]["firmware"] = filepath.Join(baseDir, "tdx", "share", "OVMF.fd") config.Hypervisor["qemu"]["image"] = filepath.Join(baseDir, "share", "kata-containers.img") - config.Hypervisor["qemu"]["kernel"] = filepath.Join(baseDir, "share", "kata-kernel") config.Hypervisor["qemu"]["valid_hypervisor_paths"] = []string{filepath.Join(baseDir, "tdx", "bin", "qemu-system-x86_64")} config.Hypervisor["qemu"]["block_device_aio"] = "threads" config.Hypervisor["qemu"]["shared_fs"] = "none" - kernelParams := qemuExtraKernelParams + config.Hypervisor["qemu"]["initrd"] = filepath.Join(baseDir, "share", "kata-initrd.zst") + config.Hypervisor["qemu"]["kernel"] = filepath.Join(baseDir, "share", "kata-kernel") + // Replace the kernel params entirely (and don't append) since that's + // also what we do when calculating the launch measurement. + config.Hypervisor["qemu"]["kernel_params"] = qemuExtraKernelParams if debug { config.Hypervisor["qemu"]["enable_debug"] = true } - // Replace the kernel params entirely (and don't append) since that's - // also what we do when calculating the launch measurement. 
- config.Hypervisor["qemu"]["kernel_params"] = kernelParams case platforms.K3sQEMUSNP: if err := toml.Unmarshal([]byte(kataBareMetalQEMUSNPBaseConfig), &config); err != nil { return nil, fmt.Errorf("failed to unmarshal kata runtime configuration: %w", err) @@ -82,19 +82,18 @@ func KataRuntimeConfig(baseDir string, platform platforms.Platform, qemuExtraKer config.Hypervisor["qemu"]["path"] = filepath.Join(baseDir, "snp", "bin", "qemu-system-x86_64") config.Hypervisor["qemu"]["firmware"] = filepath.Join(baseDir, "snp", "share", "OVMF.fd") config.Hypervisor["qemu"]["image"] = filepath.Join(baseDir, "share", "kata-containers.img") - config.Hypervisor["qemu"]["kernel"] = filepath.Join(baseDir, "share", "kata-kernel") - delete(config.Hypervisor["qemu"], "initrd") config.Hypervisor["qemu"]["block_device_aio"] = "threads" config.Hypervisor["qemu"]["shared_fs"] = "none" config.Hypervisor["qemu"]["valid_hypervisor_paths"] = []string{filepath.Join(baseDir, "snp", "bin", "qemu-system-x86_64")} config.Hypervisor["qemu"]["rootfs_type"] = "erofs" - kernelParams := qemuExtraKernelParams + config.Hypervisor["qemu"]["initrd"] = filepath.Join(baseDir, "share", "kata-initrd.zst") + config.Hypervisor["qemu"]["kernel"] = filepath.Join(baseDir, "share", "kata-kernel") + // Replace the kernel params entirely (and don't append) since that's + // also what we do when calculating the launch measurement. + config.Hypervisor["qemu"]["kernel_params"] = qemuExtraKernelParams if debug { config.Hypervisor["qemu"]["enable_debug"] = true } - // Replace the kernel params entirely (and don't append) since that's - // also what we do when calculating the launch measurement. 
- config.Hypervisor["qemu"]["kernel_params"] = kernelParams default: return nil, fmt.Errorf("unsupported platform: %s", platform) } diff --git a/packages/by-name/OVMF-TDX/package.nix b/packages/by-name/OVMF-TDX/package.nix index be4941eed1..03a6930bb6 100644 --- a/packages/by-name/OVMF-TDX/package.nix +++ b/packages/by-name/OVMF-TDX/package.nix @@ -9,7 +9,7 @@ debug ? false, }: -edk2.mkDerivation "OvmfPkg/IntelTdx/IntelTdxX64.dsc" rec { +edk2.mkDerivation "OvmfPkg/IntelTdx/IntelTdxX64.dsc" { name = "OVMF-TDX"; buildFlags = lib.optionals debug [ "-D DEBUG_ON_SERIAL_PORT=TRUE" ]; diff --git a/packages/by-name/image-podvm/package.nix b/packages/by-name/image-podvm/package.nix index 26729aa10f..bf97f45e3d 100644 --- a/packages/by-name/image-podvm/package.nix +++ b/packages/by-name/image-podvm/package.nix @@ -15,5 +15,6 @@ buildVerityUKI (mkNixosConfig { debug.enable = withDebug; gpu.enable = withGPU; azure.enable = withCSP == "azure"; + peerpods.enable = true; }; }) diff --git a/packages/by-name/kata/contrast-node-installer-image/package.nix b/packages/by-name/kata/contrast-node-installer-image/package.nix index be31bc4d8b..d0e5c5c22b 100644 --- a/packages/by-name/kata/contrast-node-installer-image/package.nix +++ b/packages/by-name/kata/contrast-node-installer-image/package.nix @@ -47,6 +47,10 @@ let url = "file:///opt/edgeless/share/kata-kernel"; path = "/opt/edgeless/@@runtimeName@@/share/kata-kernel"; } + { + url = "file:///opt/edgeless/share/kata-initrd.zst"; + path = "/opt/edgeless/@@runtimeName@@/share/kata-initrd.zst"; + } { url = "file:///opt/edgeless/snp/bin/qemu-system-x86_64"; path = "/opt/edgeless/@@runtimeName@@/snp/bin/qemu-system-x86_64"; @@ -106,7 +110,7 @@ let } ]; inherit debugRuntime; - qemuExtraKernelParams = kata.snp-launch-digest.dmVerityArgs; + qemuExtraKernelParams = kata.kata-image.cmdline; }; destination = "/config/contrast-node-install.json"; } @@ -116,13 +120,17 @@ let kata-container-img = ociLayerTar { files = [ { - source = 
kata.kata-image; + source = "${kata.kata-image.image}/${kata.kata-image.imageFileName}"; destination = "/opt/edgeless/share/kata-containers.img"; } { - source = "${kata.kata-kernel-uvm}/bzImage"; + source = "${kata.kata-image.kernel}/bzImage"; destination = "/opt/edgeless/share/kata-kernel"; } + { + source = "${kata.kata-image.initialRamdisk}/initrd"; + destination = "/opt/edgeless/share/kata-initrd.zst"; + } ]; }; diff --git a/packages/by-name/kata/kata-image/buildimage.sh b/packages/by-name/kata/kata-image/buildimage.sh deleted file mode 100644 index 18fe86df5a..0000000000 --- a/packages/by-name/kata/kata-image/buildimage.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2024 Edgeless Systems GmbH -# SPDX-License-Identifier: AGPL-3.0-only - -set -euo pipefail -shopt -s inherit_errexit - -# Image layout: -# -# +---------------------------------+-------------------+-------------------------+ -# | 512B DOS MBR (padded to 1 MiB) | p0 rootfs | p1 hashtree | -# +---------------------------------+-------------------+-------------------------+ -# | | | | -# 0 1MiB 1MiB + rootfs_size 1MiB + rootfs_size + hashtree_size - -# rootfs: erofs filesystem mounted at / (read-only) -# hashtree: dm-verity hashtree without superblock - -readonly MIB=1048576 - -in=$1 -out=$2 -tmpdir=$(mktemp -d) -trap 'rm -rf $tmpdir' EXIT -rootfs=$tmpdir/01_rootfs -hashtree=$tmpdir/02_verity_hashtree -dm_verity_file=$out/dm_verity.txt -roothash=$out/roothash -raw=$out/raw.img -uuid=c1b9d5a2-f162-11cf-9ece-0020afc76f16 -salt=0102030405060708090a0b0c0d0e0f - -if [ -z "${SOURCE_DATE_EPOCH}" ]; then - echo "SOURCE_DATE_EPOCH is not set" >&2 - exit 1 -fi - -mkdir -p "$out" - -# create the rootfs and pad it to 1MiB -mkfs.erofs \ - -z lz4 \ - -b 4096 \ - -T "$SOURCE_DATE_EPOCH" \ - -U "$uuid" \ - --tar=f \ - "$rootfs" \ - "$in" -truncate -s '%1MiB' "$rootfs" - -# create the dm-verity hashtree -verity_out=$( - veritysetup format \ - "$rootfs" \ - "$hashtree" \ - --data-block-size 4096 \ - 
--hash-block-size 4096 \ - --no-superblock \ - --uuid "$uuid" \ - --salt "$salt" | tee "$dm_verity_file" -) -# pad the hashtree to multiple of 1MiB -truncate -s '%1MiB' "$hashtree" -# extract dm-verity parameters from text output to individual files -sed -i 1d "$dm_verity_file" -root_hash=$(echo "$verity_out" | grep -oP 'Root hash:\s+\K\w+' | tr -d "[:space:]") -echo -n "$root_hash" >"$roothash" -hash_type=$(echo "$verity_out" | grep -oP 'Hash type:\s+\K\w+' | tr -d "[:space:]") -echo -n "$hash_type" >"$out/hash_type" -data_blocks=$(echo "$verity_out" | grep -oP 'Data blocks:\s+\K\w+' | tr -d "[:space:]") -echo -n "$data_blocks" >"$out/data_blocks" -data_block_size=$(echo "$verity_out" | grep -oP 'Data block size:\s+\K\w+' | tr -d "[:space:]") -echo -n "$data_block_size" >"$out/data_block_size" -hash_blocks=$(echo "$verity_out" | grep -oP 'Hash blocks:\s+\K\w+' | tr -d "[:space:]") -echo -n "$hash_blocks" >"$out/hash_blocks" -hash_block_size=$(echo "$verity_out" | grep -oP 'Hash block size:\s+\K\w+' | tr -d "[:space:]") -echo -n "$hash_block_size" >"$out/hash_block_size" -hash_algorithm=$(echo "$verity_out" | grep -oP 'Hash algorithm:\s+\K\w+' | tr -d "[:space:]") -echo -n "$hash_algorithm" >"$out/hash_algorithm" -echo -n "$salt" >"$out/salt" - -rootfs_size_mib=$(($(stat -c %s "$rootfs") / "$MIB")) -# full image size is dos header + rootfs + hashtree -hashtree_size_bytes=$(stat -c %s "$hashtree") -hashtree_size_mib=$(($(stat -c %s "$hashtree") / "$MIB")) -# img_size is the size of the full image in bytes -# DOS MBR (padded to 1MiB) + rootfs + hashtree -img_size_bytes=$(("$MIB" + "$rootfs_size_mib" * "$MIB" + "$hashtree_size_bytes")) - -# Where the rootfs starts in MiB -readonly rootfs_start=1 -# hash_start is the start of the hashtree in MiB -hash_start=$((rootfs_start + rootfs_size_mib)) -hash_end=$((hash_start + hashtree_size_mib)) - -rs=$(printf "%4dMiB" "$rootfs_start") -hs=$(printf "%4dMiB" "$hash_start") -he=$(printf "%4dMiB" "$hash_end") -cat < 
/build/rootfs/etc/kata-opa/default-policy.rego < $out/milan.hex ${lib.getExe python3Packages.sev-snp-measure} \ --mode snp \ @@ -43,11 +48,8 @@ stdenvNoCC.mkDerivation { --vcpus 1 \ --vcpu-type EPYC-Genoa \ --kernel ${kernel} \ - --append '${cmdline}' \ + --initrd ${initrd} \ + --append "${cmdline}" \ --output-format hex > $out/genoa.hex ''; - - passthru = { - inherit dmVerityArgs; - }; } diff --git a/packages/by-name/kata/tdx-launch-digests/package.nix b/packages/by-name/kata/tdx-launch-digests/package.nix index 9e3bb8dbf1..1916462fc7 100644 --- a/packages/by-name/kata/tdx-launch-digests/package.nix +++ b/packages/by-name/kata/tdx-launch-digests/package.nix @@ -11,25 +11,31 @@ debug ? false, }: let - image = kata.kata-image; - inherit (image) dmVerityArgs; - cmdlineBase = "tsc=reliable no_timer_check rcupdate.rcu_expedited=1 i8042.direct=1 i8042.dumbkbd=1 i8042.nopnp=1 i8042.noaux=1 noreplace-smp reboot=k cryptomgr.notests net.ifnames=0 pci=lastbus=0 root=/dev/vda1 rootflags=ro rootfstype=erofs console=hvc0 console=hvc1 quiet systemd.show_status=false panic=1 nr_cpus=1 selinux=0 systemd.unit=kata-containers.target systemd.mask=systemd-networkd.service systemd.mask=systemd-networkd.socket scsi_mod.scan=none"; - cmdlineBaseDebug = "tsc=reliable no_timer_check rcupdate.rcu_expedited=1 i8042.direct=1 i8042.dumbkbd=1 i8042.nopnp=1 i8042.noaux=1 noreplace-smp reboot=k cryptomgr.notests net.ifnames=0 pci=lastbus=0 root=/dev/vda1 rootflags=ro rootfstype=erofs console=hvc0 console=hvc1 debug systemd.show_status=true systemd.log_level=debug panic=1 nr_cpus=1 selinux=0 systemd.unit=kata-containers.target systemd.mask=systemd-networkd.service systemd.mask=systemd-networkd.socket scsi_mod.scan=none agent.log=debug agent.debug_console agent.debug_console_vport=1026"; - cmdline = "${if debug then cmdlineBaseDebug else cmdlineBase} ${dmVerityArgs}"; + ovmf-tdx = "${OVMF-TDX}/FV/OVMF.fd"; + kernel = "${kata.kata-image}/bzImage"; + initrd = "${kata.kata-image}/initrd"; + + # Kata 
uses a base command line and then appends the command line from the kata config (i.e. also our node-installer config). + # Thus, we need to perform the same steps when calculating the digest. + baseCmdline = if debug then kata.kata-runtime.cmdline.debug else kata.kata-runtime.cmdline.default; + cmdline = lib.strings.concatStringsSep " " [ + baseCmdline + kata.kata-image.cmdline + ]; in stdenvNoCC.mkDerivation { name = "tdx-launch-digests"; - inherit (image) version; + inherit (kata.kata-image) version; dontUnpack = true; buildPhase = '' mkdir $out - ${lib.getExe tdx-measure} mrtd -f ${OVMF-TDX}/FV/OVMF.fd > $out/mrtd.hex - ${lib.getExe tdx-measure} rtmr -f ${OVMF-TDX}/FV/OVMF.fd -k ${kata.kata-kernel-uvm}/bzImage -c '${cmdline}' 0 > $out/rtmr0.hex - ${lib.getExe tdx-measure} rtmr -f ${OVMF-TDX}/FV/OVMF.fd -k ${kata.kata-kernel-uvm}/bzImage -c '${cmdline}' 1 > $out/rtmr1.hex - ${lib.getExe tdx-measure} rtmr -f ${OVMF-TDX}/FV/OVMF.fd -k ${kata.kata-kernel-uvm}/bzImage -c '${cmdline}' 2 > $out/rtmr2.hex - ${lib.getExe tdx-measure} rtmr -f ${OVMF-TDX}/FV/OVMF.fd -k ${kata.kata-kernel-uvm}/bzImage -c '${cmdline}' 3 > $out/rtmr3.hex + ${lib.getExe tdx-measure} mrtd -f ${ovmf-tdx} > $out/mrtd.hex + ${lib.getExe tdx-measure} rtmr -f ${ovmf-tdx} -k ${kernel} -i ${initrd} -c '${cmdline}' 0 > $out/rtmr0.hex + ${lib.getExe tdx-measure} rtmr -f ${ovmf-tdx} -k ${kernel} -i ${initrd} -c '${cmdline}' 1 > $out/rtmr1.hex + ${lib.getExe tdx-measure} rtmr -f ${ovmf-tdx} -k ${kernel} -i ${initrd} -c '${cmdline}' 2 > $out/rtmr2.hex + ${lib.getExe tdx-measure} rtmr -f ${ovmf-tdx} -k ${kernel} -i ${initrd} -c '${cmdline}' 3 > $out/rtmr3.hex ''; } diff --git a/packages/by-name/mkNixosConfig/package.nix b/packages/by-name/mkNixosConfig/package.nix index 767761accb..e5b90e109d 100644 --- a/packages/by-name/mkNixosConfig/package.nix +++ b/packages/by-name/mkNixosConfig/package.nix @@ -45,7 +45,11 @@ lib.makeOverridable ( nvidia-ctk-with-config tdx-tools ; - inherit (outerPkgs.kata) 
kata-agent; + inherit (outerPkgs.kata) + kata-agent + kata-runtime + kata-kernel-uvm + ; }) ]; diff --git a/packages/by-name/qemu-tdx-static/0004-hw-x86-load-initrd-to-static-address.patch b/packages/by-name/qemu-tdx-static/0004-hw-x86-load-initrd-to-static-address.patch new file mode 100644 index 0000000000..6c26926949 --- /dev/null +++ b/packages/by-name/qemu-tdx-static/0004-hw-x86-load-initrd-to-static-address.patch @@ -0,0 +1,87 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Moritz Sanft <58110325+msanft@users.noreply.github.com> +Date: Thu, 21 Nov 2024 14:36:23 +0100 +Subject: [PATCH] hw/x86: load initrd to static address + +For TDX RTMRs to be predictable regardless of VM memory size, we need to +load the initrd to a static address, so no dynamic value ends up in the +mapped kernel image. + +Without setting this to a static address, the address the initrd is mapped to +would depend not only on the size of the initrd, but also on the memory space +of the guest, which is not viable for Contrast's reference-value-based attestation +approach. + +As we control the minimum VM memory size in Contrast, we just load the initrd +to the address it gets loaded to for Contrast's minimum VM memory (2Gi), regardless +of if the VM has more memory. + +QEMU, by default, does a similar thing. +Consider the below line (cited from above): + + `initrd_max = x86ms->below_4g_mem_size - acpi_data_size - 1;` + +This adds an artificial upper bound of where the initrd can be loaded to, as +the calculation is based on the VM memory (below_4g_mem_size), but capped at 4Gi. +This means, the initrd, regardless of guest memory size, will always be loaded +at address 0x100000000 (4Gi) max (minus ACPI data size). + +Essentially, overwriting this to 0x80000000 (2Gi), we create an artificial lower *and* +upper bound (set to Contrast minimum TDX VM memory size). +This means that the initrd will *always* be loaded at 0x80000000 (2Gi), minus ACPI +data size.
The difference to QEMU's setting is, that we *fix* the address, rather than +setting *only* an upper bound. + +This way, we get the initrd to *always* be loaded at a static address. + +Signed-off-by: Moritz Sanft <58110325+msanft@users.noreply.github.com> +--- + hw/i386/x86.c | 35 +++++++++++++++++++++++++++++++++++ + 1 file changed, 35 insertions(+) + +diff --git a/hw/i386/x86.c b/hw/i386/x86.c +index 504575abfa98bc25e498e219a2d58d8d31e5feaa..0763462c16f4106d0aa6a46c2b9c360e36ae3e96 100644 +--- a/hw/i386/x86.c ++++ b/hw/i386/x86.c +@@ -953,6 +953,41 @@ void x86_load_linux(X86MachineState *x86ms, + initrd_max = x86ms->below_4g_mem_size - acpi_data_size - 1; + } + ++ /* ++ * For TDX RTMRs to be predictable regardless of VM memory size, we need to ++ * load the initrd to a static address, so no dynamic value ends up in the ++ * mapped kernel image. ++ * ++ * Without setting this to a static address, the address the initrd is mapped to ++ * would depend not only on the size of the initrd, but also on the memory space ++ * of the guest, which is not viable for Contrast's reference-value-based attestation ++ * approach. ++ * ++ * As we control the minimum VM memory size in Contrast, we just load the initrd ++ * to the address it gets loaded to for Contrast's minimum VM memory (2Gi), regardless ++ * of if the VM has more memory. ++ * ++ * QEMU, by default, does a similar thing. ++ * Consider the below line (cited from above): ++ * ++ * initrd_max = x86ms->below_4g_mem_size - acpi_data_size - 1; ++ * ++ * This adds an artificial upper bound of where the initrd can be loaded to, as ++ * the calculation is based on the VM memory (below_4g_mem_size), but capped at 4Gi. ++ * This means, the initrd, regardless of guest memory size, will always be loaded ++ * at address 0x100000000 (4Gi) max (minus ACPI data size). ++ * ++ * Essentially, overwriting this to 0x80000000 (2Gi), we create an artificial lower *and* ++ * upper bound (set to Contrast minimum TDX VM memory size).
++ * This means that the initrd will *always* be loaded at 0x80000000 (2Gi), minus ACPI ++ * data size. The difference to QEMU's setting is, that we *fix* the address, rather than ++ * setting *only* an upper bound. ++ * ++ * This way, we get the initrd to *always* be loaded at a static address. ++ */ ++ uint32_t contrast_min_memory = 0x80000000; // 2Gi ++ initrd_max = contrast_min_memory - acpi_data_size - 1; ++ + fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_ADDR, cmdline_addr); + fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE, strlen(kernel_cmdline) + 1); + fw_cfg_add_string(fw_cfg, FW_CFG_CMDLINE_DATA, kernel_cmdline); diff --git a/packages/by-name/qemu-tdx-static/package.nix b/packages/by-name/qemu-tdx-static/package.nix index 521cf11487..58af1704fa 100644 --- a/packages/by-name/qemu-tdx-static/package.nix +++ b/packages/by-name/qemu-tdx-static/package.nix @@ -61,5 +61,7 @@ in # Make the generated ACPI tables more deterministic, so that we get a # fixed hash for attestation. ./0003-i386-omit-some-unneeded-ACPI-tables.patch + # Load the initrd to a static address to make RTMRs predictable. 
+ ./0004-hw-x86-load-initrd-to-static-address.patch ]; }) diff --git a/packages/nixos/azure.nix b/packages/nixos/azure.nix index 4f2a70a94f..6da4f97e47 100644 --- a/packages/nixos/azure.nix +++ b/packages/nixos/azure.nix @@ -55,6 +55,8 @@ in }; config = lib.mkIf cfg.enable { + boot.kernelPackages = pkgs.recurseIntoAttrs (pkgs.linuxPackagesFor pkgs.kernel-podvm-azure); + boot.initrd = { kernelModules = [ "hv_storvsc" diff --git a/packages/nixos/debug.nix b/packages/nixos/debug.nix index 38958f87f8..19cfac9b0f 100644 --- a/packages/nixos/debug.nix +++ b/packages/nixos/debug.nix @@ -32,7 +32,16 @@ in services.getty.autologinUser = "root"; - boot.kernelParams = [ "console=ttyS0" ]; + # If the image is to be booted locally for testing purposes through + # .#boot-image, or if the image is booted on Peer-pods (on Azure), the + # console should be ttyS0, as this is what Azure and QEMU expose by default to the + # user for reading. However, when one builds a Kata image (e.g. for bare-metal), setting + # console=ttyS0 will break the VM logging (i.e. Kata's "reading guest console"), as this + # only listens on hvc* TTYs. As we have no indicator on whether an image should be booted locally, + # we only set console=ttyS0 when the image is a debug-image and on Peer-pods. So for local + # booting of an image, one needs to remove the optional manually. + boot.kernelParams = lib.optionals config.contrast.peerpods.enable [ "console=ttyS0" ]; + boot.initrd.systemd.emergencyAccess = true; systemd.enableEmergencyMode = true; }; diff --git a/packages/nixos/image.nix b/packages/nixos/image.nix index 192102e5cc..caf834473a 100644 --- a/packages/nixos/image.nix +++ b/packages/nixos/image.nix @@ -1,62 +1,80 @@ # Copyright 2024 Edgeless Systems GmbH # SPDX-License-Identifier: AGPL-3.0-only -{ config, pkgs, ... }: +{ + config, + pkgs, + lib, + ... 
+}: + +let + cfg = config.contrast.image; +in { - # We build the image with systemd-repart, which integrates well - # with the systemd utilities we use for dm-verity, UKI, etc. - # However, we do not use the repart unit, as we don't want - # dynamic repartitioning at run- / boot-time. - image.repart = { - name = "image-podvm-gpu"; - version = "1-rc1"; + options.contrast.image = { + microVM = lib.mkEnableOption "Build a micro VM image"; + }; - # This defines the actual partition layout. - partitions = { - # EFI System Partition, holds the UKI. - "00-esp" = { - contents = { - "/".source = pkgs.runCommand "esp-contents" { } '' - mkdir -p $out/EFI/BOOT - cp ${config.system.build.uki}/${config.system.boot.loader.ukiFile} $out/EFI/BOOT/BOOTX64.EFI - ''; - }; - repartConfig = { - Type = "esp"; - Format = "vfat"; - SizeMinBytes = "64M"; - UUID = "null"; # Fix partition UUID for reproducibility. - }; - }; + config = { + system.image.version = "1-rc1"; + + # We build the image with systemd-repart, which integrates well + # with the systemd utilities we use for dm-verity, UKI, etc. + # However, we do not use the repart unit, as we don't want + # dynamic repartitioning at run- / boot-time. + image.repart = { + name = "image-podvm-gpu"; + inherit (config.system.image) version; - # Root filesystem. - "10-root" = { - contents = { - "/pause_bundle".source = "${pkgs.pause-bundle}/pause_bundle"; + # This defines the actual partition layout. + partitions = { + # EFI System Partition, holds the UKI. + # Only build this partition if we need a bootable image (i.e. not a micro VM). + "00-esp" = lib.mkIf (!cfg.microVM) { + contents = { + "/".source = pkgs.runCommand "esp-contents" { } '' + mkdir -p $out/EFI/BOOT + cp ${config.system.build.uki}/${config.system.boot.loader.ukiFile} $out/EFI/BOOT/BOOTX64.EFI + ''; + }; + repartConfig = { + Type = "esp"; + Format = "vfat"; + SizeMinBytes = "64M"; + UUID = "null"; # Fix partition UUID for reproducibility. 
+ }; }; - storePaths = [ config.system.build.toplevel ]; - repartConfig = { - Type = "root"; - Format = "erofs"; - Label = "root"; - Verity = "data"; - VerityMatchKey = "root"; - Minimize = "best"; - # We need to ensure that mountpoints are available. - # TODO (Maybe): This could be done more elegantly with CopyFiles and a skeleton tree in the vcs. - MakeDirectories = "/bin /boot /dev /etc /home /lib /lib64 /mnt /nix /opt /proc /root /run /srv /sys /tmp /usr/bin /var"; + + # Root filesystem. + "10-root" = { + contents = { + "/pause_bundle".source = "${pkgs.pause-bundle}/pause_bundle"; + }; + storePaths = [ config.system.build.toplevel ]; + repartConfig = { + Type = "root"; + Format = "erofs"; + Label = "root"; + Verity = "data"; + VerityMatchKey = "root"; + Minimize = "best"; + # We need to ensure that mountpoints are available. + # TODO (Maybe): This could be done more elegantly with CopyFiles and a skeleton tree in the vcs. + MakeDirectories = "/bin /boot /dev /etc /home /lib /lib64 /mnt /nix /opt /proc /root /run /srv /sys /tmp /usr/bin /var"; + }; }; - }; - # Verity hashes for the root filesystem. - "20-root-verity" = { - repartConfig = { - Type = "root-verity"; - Label = "root-verity"; - Verity = "hash"; - VerityMatchKey = "root"; - Minimize = "best"; + # Verity hashes for the root filesystem. + "20-root-verity" = { + repartConfig = { + Type = "root-verity"; + Label = "root-verity"; + Verity = "hash"; + VerityMatchKey = "root"; + Minimize = "best"; + }; }; }; }; diff --git a/packages/nixos/kata.nix b/packages/nixos/kata.nix index d31681f850..fa85326bff 100644 --- a/packages/nixos/kata.nix +++ b/packages/nixos/kata.nix @@ -1,90 +1,98 @@ # Copyright 2024 Edgeless Systems GmbH # SPDX-License-Identifier: AGPL-3.0-only -{ lib, pkgs, ... 
}: - { - systemd.services.kata-agent = { - description = "Kata Containers Agent"; - documentation = [ - "https://github.com/confidential-containers/cloud-api-adaptor/blob/main/src/cloud-api-adaptor/podvm/files/etc/systemd/system/kata-agent.service" - ]; - bindsTo = [ "netns@podns.service" ]; - wants = [ "process-user-data.service" ]; - after = [ - "netns@podns.service" - "process-user-data.service" - ]; - wantedBy = [ "multi-user.target" ]; - serviceConfig = { - Type = "exec"; # Not upstream. - ExecStartPre = [ "${pkgs.coreutils}/bin/mkdir -p /run/kata-containers" ]; - ExecStart = "${lib.getExe pkgs.kata-agent} --config /run/peerpod/agent-config.toml"; - ExecStopPost = "${lib.getExe pkgs.cloud-api-adaptor.kata-agent-clean} --config /run/peerpod/agent-config.toml"; - SyslogIdentifier = "kata-agent"; - }; - environment = { - KATA_AGENT_LOG_LEVEL = "debug"; - OCICRYPT_KEYPROVIDER_CONFIG = builtins.toFile "policy.json" ( - lib.strings.toJSON { default = [ { type = "insecureAcceptAnything"; } ]; } - ); - }; + config, + lib, + pkgs, + ... 
+}: +let + cfg = config.contrast.kata; +in +{ + options.contrast.kata = { + enable = lib.mkEnableOption "Enable Kata (non-peerpod) support"; }; - systemd.services.agent-protocol-forwarder = { - description = "Agent Protocol Forwarder"; - documentation = [ - "https://github.com/confidential-containers/cloud-api-adaptor/blob/main/src/cloud-api-adaptor/podvm/files/etc/systemd/system/agent-protocol-forwarder.service" - ]; - wants = [ "kata-agent.service" ]; - after = [ "kata-agent.service" ]; - wantedBy = [ "multi-user.target" ]; - unitConfig = { - DefaultDependencies = false; - }; - serviceConfig = { - Type = "notify"; - ExecStart = lib.strings.concatStringsSep " " [ - "${pkgs.cloud-api-adaptor}/bin/agent-protocol-forwarder" - "-kata-agent-namespace /run/netns/podns" - "-kata-agent-socket /run/kata-containers/agent.sock" + config = lib.mkIf cfg.enable { + # https://github.com/kata-containers/kata-containers/blob/3.10.1/src/agent/kata-containers.target + systemd.targets.kata-containers = { + description = "Kata Containers Agent Target"; + requires = [ + "basic.target" + "tmp.mount" + "kata-agent.service" ]; - Restart = "on-failure"; - RestartSec = "5s"; + wantedBy = [ "basic.target" ]; + wants = [ + "chronyd.service" + # https://github.com/kata-containers/kata-containers/blob/5869046d04553c3bd2f16fa1cfb714133050e537/tools/osbuilder/rootfs-builder/rootfs.sh#L712 + "dbus.socket" + ]; + conflicts = [ + "rescue.service" + "rescue.target" + ]; + after = [ + "basic.target" + "rescue.service" + "rescue.target" + ]; + unitConfig.AllowIsolate = true; }; - }; - systemd.services.process-user-data = { - description = "Pull configuration from metadata service"; - documentation = [ - "https://github.com/confidential-containers/cloud-api-adaptor/blob/main/src/cloud-api-adaptor/podvm/files/etc/systemd/system/process-user-data.service" - ]; - wants = [ "network-online.target" ]; - after = [ "network-online.target" ]; - wantedBy = [ "multi-user.target" ]; - unitConfig = { - 
DefaultDependencies = false; + # https://github.com/kata-containers/kata-containers/blob/3.10.1/src/agent/kata-agent.service.in + systemd.services.kata-agent = { + description = "Kata Containers Agent"; + documentation = [ "https://github.com/kata-containers/kata-containers" ]; + wants = [ "kata-containers.target" ]; + after = [ "systemd-tmpfiles-setup.service" ]; # Not upstream, but required for /etc/resolv.conf bind mount. + serviceConfig = { + Type = "exec"; # Not upstream. + StandardOutput = "tty"; + ExecStart = "${lib.getExe pkgs.kata-agent}"; + LimitNOFILE = 1048576; + ExecStop = "${pkgs.coreutils}/bin/sync ; ${config.systemd.package}/bin/systemctl --force poweroff"; + FailureAction = "poweroff"; + OOMScoreAdjust = -997; + }; + # Not upstream + environment = { + KATA_AGENT_LOG_LEVEL = "debug"; + OCICRYPT_KEYPROVIDER_CONFIG = builtins.toFile "policy.json" ( + lib.strings.toJSON { default = [ { type = "insecureAcceptAnything"; } ]; } + ); + }; }; - serviceConfig = { - Type = "oneshot"; - ExecStart = "${pkgs.cloud-api-adaptor}/bin/process-user-data provision-files"; - RemainAfterExit = true; + + fileSystems."/run" = { + fsType = "tmpfs"; + options = [ + "nodev" + "nosuid" + "size=50%" + ]; + neededForBoot = true; }; - }; - systemd.services."netns@" = { - description = "Create a network namespace for pod networking"; - documentation = [ - "https://github.com/confidential-containers/cloud-api-adaptor/blob/main/src/cloud-api-adaptor/podvm/files/etc/systemd/system/netns%40.service" - ]; - serviceConfig = { - Type = "oneshot"; - RemainAfterExit = true; - ExecStartPre = "${pkgs.iproute2}/bin/ip netns add %I"; - ExecStart = "${pkgs.iproute2}/bin/ip netns exec %I ${pkgs.iproute2}/bin/ip link set lo up"; - ExecStop = "${pkgs.iproute2}/bin/ip netns del %I"; + # Not used directly, but required for kernel-specific driver builds. + boot.kernelPackages = pkgs.recurseIntoAttrs (pkgs.linuxPackagesFor pkgs.kata-kernel-uvm); + + boot.initrd = { + # Don't require TPM2 support. 
(additional modules) + systemd.tpm2.enable = false; + # Don't require any of the hardware modules NixOS includes by default. + includeDefaultModules = false; + }; + + networking.resolvconf.enable = false; + systemd.tmpfiles.settings."10-etc-resolvconf"."/etc/resolv.conf".f = { + group = "root"; + mode = "0755"; + user = "root"; }; - }; - environment.etc."kata-opa/default-policy.rego".source = pkgs.cloud-api-adaptor.default-policy; + environment.etc."kata-opa/default-policy.rego".source = "${pkgs.kata-runtime.src}/src/kata-opa/allow-set-policy.rego"; + }; } diff --git a/packages/nixos/peerpods.nix b/packages/nixos/peerpods.nix new file mode 100644 index 0000000000..116768e519 --- /dev/null +++ b/packages/nixos/peerpods.nix @@ -0,0 +1,103 @@ +# Copyright 2024 Edgeless Systems GmbH +# SPDX-License-Identifier: AGPL-3.0-only + +{ + config, + lib, + pkgs, + ... +}: +let + cfg = config.contrast.peerpods; +in +{ + options.contrast.peerpods = { + enable = lib.mkEnableOption "Enable peer pods support"; + }; + + config = lib.mkIf cfg.enable { + systemd.services.kata-agent = { + description = "Kata Containers Agent"; + documentation = [ + "https://github.com/confidential-containers/cloud-api-adaptor/blob/main/src/cloud-api-adaptor/podvm/files/etc/systemd/system/kata-agent.service" + ]; + bindsTo = [ "netns@podns.service" ]; + wants = [ "process-user-data.service" ]; + after = [ + "netns@podns.service" + "process-user-data.service" + ]; + wantedBy = [ "multi-user.target" ]; + serviceConfig = { + Type = "exec"; # Not upstream. 
+ ExecStartPre = [ "${pkgs.coreutils}/bin/mkdir -p /run/kata-containers" ]; + ExecStart = "${lib.getExe pkgs.kata-agent} --config /run/peerpod/agent-config.toml"; + ExecStopPost = "${lib.getExe pkgs.cloud-api-adaptor.kata-agent-clean} --config /run/peerpod/agent-config.toml"; + SyslogIdentifier = "kata-agent"; + }; + environment = { + KATA_AGENT_LOG_LEVEL = "debug"; + OCICRYPT_KEYPROVIDER_CONFIG = builtins.toFile "policy.json" ( + lib.strings.toJSON { default = [ { type = "insecureAcceptAnything"; } ]; } + ); + }; + }; + + systemd.services.agent-protocol-forwarder = { + description = "Agent Protocol Forwarder"; + documentation = [ + "https://github.com/confidential-containers/cloud-api-adaptor/blob/main/src/cloud-api-adaptor/podvm/files/etc/systemd/system/agent-protocol-forwarder.service" + ]; + wants = [ "kata-agent.service" ]; + after = [ "kata-agent.service" ]; + wantedBy = [ "multi-user.target" ]; + unitConfig = { + DefaultDependencies = false; + }; + serviceConfig = { + Type = "notify"; + ExecStart = lib.strings.concatStringsSep " " [ + "${pkgs.cloud-api-adaptor}/bin/agent-protocol-forwarder" + "-kata-agent-namespace /run/netns/podns" + "-kata-agent-socket /run/kata-containers/agent.sock" + ]; + Restart = "on-failure"; + RestartSec = "5s"; + }; + }; + + systemd.services.process-user-data = { + description = "Pull configuration from metadata service"; + documentation = [ + "https://github.com/confidential-containers/cloud-api-adaptor/blob/main/src/cloud-api-adaptor/podvm/files/etc/systemd/system/process-user-data.service" + ]; + wants = [ "network-online.target" ]; + after = [ "network-online.target" ]; + wantedBy = [ "multi-user.target" ]; + unitConfig = { + DefaultDependencies = false; + }; + serviceConfig = { + Type = "oneshot"; + ExecStart = "${pkgs.cloud-api-adaptor}/bin/process-user-data provision-files"; + RemainAfterExit = true; + }; + }; + + systemd.services."netns@" = { + description = "Create a network namespace for pod networking"; + documentation = 
[ + "https://github.com/confidential-containers/cloud-api-adaptor/blob/main/src/cloud-api-adaptor/podvm/files/etc/systemd/system/netns%40.service" + ]; + serviceConfig = { + Type = "oneshot"; + RemainAfterExit = true; + ExecStartPre = "${pkgs.iproute2}/bin/ip netns add %I"; + ExecStart = "${pkgs.iproute2}/bin/ip netns exec %I ${pkgs.iproute2}/bin/ip link set lo up"; + ExecStop = "${pkgs.iproute2}/bin/ip netns del %I"; + }; + }; + + environment.etc."kata-opa/default-policy.rego".source = pkgs.cloud-api-adaptor.default-policy; + }; +} diff --git a/packages/nixos/system.nix b/packages/nixos/system.nix index d11a336acc..553accc304 100644 --- a/packages/nixos/system.nix +++ b/packages/nixos/system.nix @@ -4,20 +4,24 @@ { config, lib, - pkgs, ... }: { boot.loader.grub.enable = false; - boot.kernelPackages = pkgs.recurseIntoAttrs (pkgs.linuxPackagesFor pkgs.kernel-podvm-azure); boot.kernelParams = [ "systemd.verity=yes" "selinux=0" ]; - boot.supportedFilesystems = [ "erofs" ]; + boot.supportedFilesystems = [ + "erofs" + "vfat" + ]; boot.initrd = { - supportedFilesystems = [ "erofs" ]; + supportedFilesystems = [ + "erofs" + "vfat" + ]; availableKernelModules = [ "dm_mod" "dm_verity" diff --git a/tools/tdx-measure/main.go b/tools/tdx-measure/main.go index b43f4db9d9..50f3bdb6a5 100644 --- a/tools/tdx-measure/main.go +++ b/tools/tdx-measure/main.go @@ -106,6 +106,10 @@ func newRtMrCmd() *cobra.Command { if err := cmd.MarkFlagFilename("kernel"); err != nil { panic(err) } + cmd.Flags().StringP("initrd", "i", "initrd.zst", "path to initrd file") + if err := cmd.MarkFlagFilename("initrd"); err != nil { + panic(err) + } cmd.Flags().StringP("cmdline", "c", "", "kernel command line") return cmd } @@ -136,8 +140,15 @@ func runRtMr(cmd *cobra.Command, args []string) error { if err != nil { return fmt.Errorf("can't read kernel file: %w", err) } - - digest, err = rtmr.CalcRtmr1(kernel) + initrdPath, err := cmd.Flags().GetString("initrd") + if err != nil { + return err + } + initrd, err 
:= os.ReadFile(initrdPath) + if err != nil { + return fmt.Errorf("can't read initrd file: %w", err) + } + digest, err = rtmr.CalcRtmr1(kernel, initrd) if err != nil { return fmt.Errorf("can't calculate RTMR 1: %w", err) } @@ -146,7 +157,15 @@ func runRtMr(cmd *cobra.Command, args []string) error { if err != nil { return err } - digest, err = rtmr.CalcRtmr2(cmdLine) + initrdPath, err := cmd.Flags().GetString("initrd") + if err != nil { + return err + } + initrd, err := os.ReadFile(initrdPath) + if err != nil { + return fmt.Errorf("can't read initrd file: %w", err) + } + digest, err = rtmr.CalcRtmr2(cmdLine, initrd) if err != nil { return fmt.Errorf("can't calculate RTMR 2: %w", err) } diff --git a/tools/tdx-measure/rtmr/rtmr.go b/tools/tdx-measure/rtmr/rtmr.go index a123f29d9a..a6a3fb71a9 100644 --- a/tools/tdx-measure/rtmr/rtmr.go +++ b/tools/tdx-measure/rtmr/rtmr.go @@ -243,23 +243,33 @@ func CalcRtmr0(firmware []byte) ([48]byte, error) { } // CalcRtmr1 calculates RTMR[1] for the given kernel. 
-func CalcRtmr1(kernelFile []byte) ([48]byte, error) { +func CalcRtmr1(kernelFile, initrdFile []byte) ([48]byte, error) { var rtmr Rtmr - kernelHashContent, err := hashKernel(kernelFile) + + kernelHashContent, err := hashKernel(kernelFile, initrdFile) if err != nil { return [48]byte{}, fmt.Errorf("can't hash kernel: %w", err) } rtmr.hashAndExtend(kernelHashContent) + + // https://github.com/tianocore/edk2/blob/0f3867fa6ef0553e26c42f7d71ff6bdb98429742/OvmfPkg/Tcg/TdTcg2Dxe/TdTcg2Dxe.c#L2155 rtmr.hashAndExtend([]byte("Calling EFI Application from Boot Option")) + // https://github.com/tianocore/edk2/blob/0f3867fa6ef0553e26c42f7d71ff6bdb98429742/OvmfPkg/Tcg/TdTcg2Dxe/TdTcg2Dxe.c#L2243 rtmr.hashAndExtend([]byte("Exit Boot Services Invocation")) + // https://github.com/tianocore/edk2/blob/0f3867fa6ef0553e26c42f7d71ff6bdb98429742/OvmfPkg/Tcg/TdTcg2Dxe/TdTcg2Dxe.c#L2254 rtmr.hashAndExtend([]byte("Exit Boot Services Returned with Success")) return rtmr.Get(), nil } -// CalcRtmr2 calculates RTMR[2] for the given kernel command line. -func CalcRtmr2(cmdLine string) ([48]byte, error) { +// CalcRtmr2 calculates RTMR[2] for the given kernel command line and initrd. +func CalcRtmr2(cmdLine string, initrdFile []byte) ([48]byte, error) { var rtmr Rtmr + // TODO(msanft): find out which component silently adds this string to the commandline. 
+ // Suspects: QEMU-TDX, OVMF-TDX, Linux EFI Stub + cmdLine += " initrd=initrd" + + // https://elixir.bootlin.com/linux/v6.11.8/source/drivers/firmware/efi/libstub/efi-stub-helper.c#L342 codepoints := utf16.Encode([]rune(cmdLine)) bytes := make([]byte, (len(codepoints)+1)*2) for i, codepoint := range codepoints { @@ -267,11 +277,14 @@ func CalcRtmr2(cmdLine string) ([48]byte, error) { } rtmr.hashAndExtend(bytes) + // https://elixir.bootlin.com/linux/v6.11.8/source/drivers/firmware/efi/libstub/efi-stub-helper.c#L625 + rtmr.hashAndExtend(initrdFile) + return rtmr.Get(), nil } -func hashKernel(kernelFile []byte) ([]byte, error) { - patchKernel(kernelFile) +func hashKernel(kernelFile, initrdFile []byte) ([]byte, error) { + patchKernel(kernelFile, initrdFile) kernel, err := authenticode.Parse(bytes.NewReader(kernelFile)) if err != nil { @@ -281,7 +294,7 @@ func hashKernel(kernelFile []byte) ([]byte, error) { return kernel.HashContent.Bytes(), nil } -func patchKernel(kernelFile []byte) { +func patchKernel(kernelFile, initrdFile []byte) { // QEMU patches some header bytes in the kernel before loading it into memory. // Sources: // - https://gitlab.com/qemu-project/qemu/-/blob/28ae3179fc52d2e4d870b635c4a412aab99759e7/hw/i386/x86-common.c#L837 @@ -299,4 +312,19 @@ func patchKernel(kernelFile []byte) { kernelFile[0x229] = 0x00 kernelFile[0x22A] = 0x02 kernelFile[0x22B] = 0x00 + + // https://github.com/qemu/qemu/blob/f48c205fb42be48e2e47b7e1cd9a2802e5ca17b0/hw/i386/x86.c#L1036 + // Maximum address of the initrd as calculated by QEMU. Normally, this would be dependent on the VM + // memory size, but we have a QEMU patch that fixes this value to make RTMR1 reproducible. + // Our QEMU patch has a commented-out line to print this value upon start, so it's easy to find + // when updating QEMU, as the value might change on QEMU updates. 
+ initrdMax := 0x7ffd7fff + initrdSize := len(initrdFile) + initrdAddr := (initrdMax - initrdSize) & ^4095 + + // https://github.com/qemu/qemu/blob/f48c205fb42be48e2e47b7e1cd9a2802e5ca17b0/hw/i386/x86.c#L1044 + binary.LittleEndian.PutUint32(kernelFile[0x218:][:4], uint32(initrdAddr)) + + // https://github.com/qemu/qemu/blob/f48c205fb42be48e2e47b7e1cd9a2802e5ca17b0/hw/i386/x86.c#L1045 + binary.LittleEndian.PutUint32(kernelFile[0x21C:][:4], uint32(initrdSize)) }