Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Container: BPF token support #15009

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
4 changes: 4 additions & 0 deletions doc/api-extensions.md
Original file line number Diff line number Diff line change
Expand Up @@ -2618,3 +2618,7 @@ Note that the `openid` and `email` scopes are always required.
## `project_default_network_and_storage`

Adds flags --network and --storage. The --network flag adds a network device connected to the specified network to the default profile. The --storage flag adds a root disk device using the specified storage pool to the default profile.

## `bpf_delegation`
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As this is container specific, suggest container_bpf_delegation, similar to container_syscall_intercept_bpf_devices that exists today.


Adds a new `security.delegate_bpf.*` group of options to support eBPF delegation using the BPF token mechanism.
9 changes: 9 additions & 0 deletions lxd-user/callhook/callhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,15 @@ func HandleContainerHook(lxdPath string, projectName string, instanceRef string,
u := api.NewURL().Path("internal", "containers", instanceRef, "on"+hook)
u.WithQuery("target", target)

if hook == "starthost" {
lxcPID := os.Getenv("LXC_PID")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we lose this validation and instead still call the starthost URL with a missing or blank lxc_pid query parameter, then let LXD validate that its missing, as the error will then be more accessible in the LXD logs rather than in the lxc debug log.

if lxcPID == "" {
return errors.New("starthost hook requires LXC_PID env variable set")
}

u.WithQuery("lxc_pid", lxcPID)
}

if projectName != "" {
u.WithQuery("project", projectName)
}
Expand Down
1 change: 1 addition & 0 deletions lxd/api_1.0.go
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,7 @@ func api10Get(d *Daemon, r *http.Request) response.Response {
}

env.KernelFeatures = map[string]string{
"bpf_token": fmt.Sprint(s.OS.BPFToken),
"netnsid_getifaddrs": fmt.Sprint(s.OS.NetnsGetifaddrs),
"uevent_injection": fmt.Sprint(s.OS.UeventInjection),
"unpriv_binfmt": fmt.Sprint(s.OS.UnprivBinfmt),
Expand Down
36 changes: 36 additions & 0 deletions lxd/api_internal.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ var apiInternal = []APIEndpoint{
internalClusterRebalanceCmd,
internalClusterHealCmd,
internalContainerOnStartCmd,
internalContainerOnStartHostCmd,
internalContainerOnStopCmd,
internalContainerOnStopNSCmd,
internalGarbageCollectorCmd,
Expand Down Expand Up @@ -82,6 +83,12 @@ var internalContainerOnStartCmd = APIEndpoint{
Get: APIEndpointAction{Handler: internalContainerOnStart, AccessHandler: allowPermission(entity.TypeServer, auth.EntitlementCanEdit)},
}

// internalContainerOnStartHostCmd is the internal endpoint for the container "starthost" hook.
// GET requests are served by internalContainerOnStartHost and are guarded by allowPermission
// with the server-level EntitlementCanEdit entitlement.
var internalContainerOnStartHostCmd = APIEndpoint{
Path: "containers/{instanceRef}/onstarthost",

Get: APIEndpointAction{Handler: internalContainerOnStartHost, AccessHandler: allowPermission(entity.TypeServer, auth.EntitlementCanEdit)},
}

var internalContainerOnStopNSCmd = APIEndpoint{
Path: "containers/{instanceRef}/onstopns",

Expand Down Expand Up @@ -341,6 +348,35 @@ func internalContainerOnStart(d *Daemon, r *http.Request) response.Response {
return response.EmptySyncResponse
}

// internalContainerOnStartHost handles the internal "onstarthost" request for a container.
//
// It resolves the instance referenced by the request, requires a non-empty
// "lxc_pid" query parameter and forwards that value to the instance's
// start-host hook as the "LXC_PID" argument. A missing parameter yields a
// bad-request response; load and hook failures are logged and returned through
// response.SmartError. On success an empty sync response is returned.
func internalContainerOnStartHost(d *Daemon, r *http.Request) response.Response {
	target, loadErr := internalContainerHookLoadFromReference(d.State(), r)
	if loadErr != nil {
		logger.Error("The start-host hook failed to load", logger.Ctx{"err": loadErr})
		return response.SmartError(loadErr)
	}

	// The PID arrives as a query parameter and is validated here so that the
	// failure is recorded in the daemon log.
	pidParam := request.QueryParam(r, "lxc_pid")
	if pidParam == "" {
		missingErr := fmt.Errorf("No lxc_pid GET parameter was provided")
		logger.Error("The start-host hook failed", logger.Ctx{"instance": target.Name(), "err": missingErr})

		return response.BadRequest(missingErr)
	}

	hookErr := target.OnHook(instance.HookStartHost, map[string]string{"LXC_PID": pidParam})
	if hookErr != nil {
		logger.Error("The start-host hook failed", logger.Ctx{"instance": target.Name(), "err": hookErr})
		return response.SmartError(hookErr)
	}

	return response.EmptySyncResponse
}

func internalContainerOnStopNS(d *Daemon, r *http.Request) response.Response {
s := d.State()

Expand Down
7 changes: 7 additions & 0 deletions lxd/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -1235,6 +1235,13 @@ func (d *Daemon) init() error {
logger.Info(" - unprivileged binfmt_misc: no")
}

d.os.BPFToken = canUseBPFToken()
if d.os.BPFToken {
logger.Info(" - BPF Token: yes")
} else {
logger.Info(" - BPF Token: no")
}

/*
* During daemon startup we're the only thread that touches VFS3Fscaps
* so we don't need to bother with atomic.StoreInt32() when touching
Expand Down
101 changes: 101 additions & 0 deletions lxd/instance/drivers/driver_lxc.go
Original file line number Diff line number Diff line change
Expand Up @@ -1071,6 +1071,13 @@ func (d *lxc) initLXC(config bool) (*liblxc.Container, error) {
}
}

if shared.IsTrue(d.expandedConfig["security.delegate_bpf"]) {
err = lxcSetConfigItem(cc, "lxc.hook.start-host", fmt.Sprintf("%s callhook %s %s %s starthost", d.state.OS.ExecPath, shared.VarPath(""), strconv.Quote(d.Project().Name), strconv.Quote(d.Name())))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd like that fmt.Sprintf() to be replaced by string concatenation but I understand you might want to leave it as-is as there are other similar line in that file ;)

if err != nil {
return nil, err
}
}

// Memory limits
if d.state.OS.CGInfo.Supports(cgroup.Memory, cg) {
memory := d.expandedConfig["limits.memory"]
Expand Down Expand Up @@ -2464,6 +2471,8 @@ func (d *lxc) OnHook(hookName string, args map[string]string) error {
switch hookName {
case instance.HookStart:
return d.onStart(args)
case instance.HookStartHost:
return d.onStartHost(args)
case instance.HookStopNS:
return d.onStopNS(args)
case instance.HookStop:
Expand Down Expand Up @@ -2522,6 +2531,94 @@ func (d *lxc) onStart(_ map[string]string) error {
return nil
}

// mountBpfFs mounts bpffs inside the container.
func (d *lxc) mountBpfFs(pid int, bpffsParams map[string]string) error {
if !d.state.OS.BPFToken {
return fmt.Errorf("BPF Token mechanism is not supported by kernel running.")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No full stops on log messages or errors please

}

pidFdNr, pidFd := seccomp.MakePidFd(pid, d.state)
if pidFdNr >= 0 {
defer func() { _ = pidFd.Close() }()
}

ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()

d.logger.Debug("bpffs mount helper is being called", logger.Ctx{"pid": pid, "bpffsParams": bpffsParams})
stdout, err := shared.RunCommandInheritFds(
ctx,
[]*os.File{pidFd},
d.state.OS.ExecPath,
"forkmount",
"bpffs",
"--",
fmt.Sprint(pid),
fmt.Sprint(pidFdNr),
bpffsParams["mountpoint"],
bpffsParams["delegate_cmds"],
bpffsParams["delegate_maps"],
bpffsParams["delegate_progs"],
bpffsParams["delegate_attachs"])
if err != nil {
d.logger.Error("bpffs mount helper has failed", logger.Ctx{"err": err, "stdout": stdout})
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there potentially useful info in stdout we should be collecting and passing to the end user in err?

return err
}

d.logger.Debug("bpffs mount helper has finished without error", logger.Ctx{"stdout": stdout})

return nil
}

// onStartHost implements the LXC start-host hook.
func (d *lxc) onStartHost(args map[string]string) error {
if shared.IsFalseOrEmpty(d.expandedConfig["security.delegate_bpf"]) {
return nil
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lets move the contents of his function from line 2580 into its own function, suggest d.onStartHostBPFDelegate() or similar, and then reverse this if statement's logic to call that function if shared.IsTrue(d.expandedConfig["security.delegate_bpf"]).

That way we nicely structure the general onStartHost function with the BPF delegation logic, allowing for easier future expansion for other purposes.

}

// Get the init PID
pidStr, ok := args["LXC_PID"]
if !ok {
return fmt.Errorf("No LXC_PID parameter was provided to start-host hook")
}

pid, err := strconv.Atoi(pidStr)
if err != nil {
return fmt.Errorf("Invalid LXC_PID parameter was provided to start-host hook %q: %w", pidStr, err)
}

bpffsParams := map[string]string{
"delegate_cmds": "any",
"delegate_maps": "any",
"delegate_progs": "any",
"delegate_attachs": "any",
"mountpoint": "/sys/fs/bpf",
}

if d.expandedConfig["security.delegate_bpf.cmds"] != "" {
bpffsParams["delegate_cmds"] = d.expandedConfig["security.delegate_bpf.cmds"]
}

if d.expandedConfig["security.delegate_bpf.maps"] != "" {
bpffsParams["delegate_maps"] = d.expandedConfig["security.delegate_bpf.maps"]
}

if d.expandedConfig["security.delegate_bpf.progs"] != "" {
bpffsParams["delegate_progs"] = d.expandedConfig["security.delegate_bpf.progs"]
}

if d.expandedConfig["security.delegate_bpf.attachs"] != "" {
bpffsParams["delegate_attachs"] = d.expandedConfig["security.delegate_bpf.attachs"]
}

err = d.mountBpfFs(pid, bpffsParams)
if err != nil {
return err
}

return nil
}

// validateStartup checks any constraints that would prevent start up from succeeding under normal circumstances.
func (d *lxc) validateStartup(statusCode api.StatusCode) error {
err := d.common.validateStartup(statusCode)
Expand All @@ -2539,6 +2636,10 @@ func (d *lxc) validateStartup(statusCode api.StatusCode) error {
return fmt.Errorf("Instance is protected from being started")
}

if shared.IsTrue(d.expandedConfig["security.delegate_bpf"]) && !d.state.OS.BPFToken {
return fmt.Errorf("BPF Token mechanism is not supported by your kernel. Linux kernel 6.9+ is required to start this instance, or security.delegate_bpf option must be disabled")
}

return nil
}

Expand Down
3 changes: 3 additions & 0 deletions lxd/instance/instance_interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ import (
// HookStart hook used when instance has started.
const HookStart = "onstart"

// HookStartHost hook used for the LXC start-host hook, which runs on the host once the
// container has been set up and right before its init process is started. The hook
// is passed the container's init PID as LXC_PID.
const HookStartHost = "onstarthost"

// HookStopNS hook used when instance has stopped but before namespaces have been destroyed.
const HookStopNS = "onstopns"

Expand Down
50 changes: 50 additions & 0 deletions lxd/instance/instancetype/instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -911,6 +911,56 @@ var InstanceConfigKeysContainer = map[string]func(value string) error{
// shortdesc: Whether to handle the `sysinfo` system call
"security.syscalls.intercept.sysinfo": validate.Optional(validate.IsBool),

// lxdmeta:generate(entities=instance; group=security; key=security.delegate_bpf)
//
// ---
// type: bool
// defaultdesc: `false`
// liveupdate: no
// condition: unprivileged container
// shortdesc: Whether to enable eBPF delegation using BPF Token mechanism
"security.delegate_bpf": validate.Optional(validate.IsBool),

// lxdmeta:generate(entities=instance; group=security; key=security.delegate_bpf.cmds)
//
// ---
// type: string
// defaultdesc: `any`
// liveupdate: no
// condition: unprivileged container
// shortdesc: Which eBPF commands to allow with delegation mechanism
"security.delegate_bpf.cmds": validate.Optional(validate.IsBpfDelegateOption("cmds")),

// lxdmeta:generate(entities=instance; group=security; key=security.delegate_bpf.maps)
//
// ---
// type: string
// defaultdesc: `any`
// liveupdate: no
// condition: unprivileged container
// shortdesc: Which eBPF maps to allow with delegation mechanism
"security.delegate_bpf.maps": validate.Optional(validate.IsBpfDelegateOption("maps")),

// lxdmeta:generate(entities=instance; group=security; key=security.delegate_bpf.progs)
//
// ---
// type: string
// defaultdesc: `any`
// liveupdate: no
// condition: unprivileged container
// shortdesc: Which eBPF program types to allow with delegation mechanism
"security.delegate_bpf.progs": validate.Optional(validate.IsBpfDelegateOption("progs")),

// lxdmeta:generate(entities=instance; group=security; key=security.delegate_bpf.attachs)
//
// ---
// type: string
// defaultdesc: `any`
// liveupdate: no
// condition: unprivileged container
// shortdesc: Which eBPF attach types to allow with delegation mechanism
"security.delegate_bpf.attachs": validate.Optional(validate.IsBpfDelegateOption("attachs")),

// lxdmeta:generate(entities=instance; group=volatile; key=volatile.last_state.idmap)
// The UID/GID map that has been applied to the container's underlying storage.
// This is usually set for containers created on older kernels that don't
Expand Down
35 changes: 35 additions & 0 deletions lxd/main_checkfeature.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ __ro_after_init bool pidfd_aware = false;
__ro_after_init bool pidfd_setns_aware = false;
__ro_after_init bool uevent_aware = false;
__ro_after_init bool binfmt_aware = false;
__ro_after_init bool bpftoken_aware = false;
__ro_after_init int seccomp_notify_aware = 0;
__ro_after_init char errbuf[4096];

Expand Down Expand Up @@ -619,6 +620,35 @@ static void is_binfmt_aware(void)
binfmt_aware = true;
}

static void is_bpftoken_aware(void)
{
__do_close int fs_fd = -EBADF;
int ret;

fs_fd = lxd_fsopen("bpf", FSOPEN_CLOEXEC);
if (fs_fd < 0) {
(void)sprintf(errbuf, "%s", "fsopen() failed on bpffs");
return;
}

// Try to set an invalid "delegate_cmds" option value and ensure that it fails.
// This is important to check, because bpffs ignores unknown options on the kernel side.
ret = lxd_fsconfig(fs_fd, FSCONFIG_SET_STRING, "delegate_cmds", "MUSTFAIL", 0);
if (ret == 0) {
(void)sprintf(errbuf, "%s", "fsconfig succeed to set delegate_cmds, but must fail");
return;
}

// Now let's check that a valid value works too. Just in case.
ret = lxd_fsconfig(fs_fd, FSCONFIG_SET_STRING, "delegate_cmds", "any", 0);
if (ret < 0) {
(void)sprintf(errbuf, "%s - fsconfig failed to set delegate_cmds", strerror(errno));
return;
}

bpftoken_aware = true;
}

void checkfeature(void)
{
__do_close int hostnetns_fd = -EBADF, newnetns_fd = -EBADF, pidfd = -EBADF;
Expand All @@ -638,6 +668,7 @@ void checkfeature(void)
(void)sprintf(errbuf, "%s", "Failed to attach to host network namespace");

is_binfmt_aware();
is_bpftoken_aware();
}

static bool is_empty_string(char *s)
Expand Down Expand Up @@ -734,3 +765,7 @@ func canUseCoreScheduling() bool {
// canUseBinfmt reports the value of the C-side binfmt_aware flag.
func canUseBinfmt() bool {
return bool(C.binfmt_aware)
}

// canUseBPFToken reports the value of the C-side bpftoken_aware flag, which
// is_bpftoken_aware sets to true when its bpffs "delegate_cmds" probes succeed.
func canUseBPFToken() bool {
return bool(C.bpftoken_aware)
}
Loading