Skip to content

Commit

Permalink
wmeta: add nvml collector (#32109)
Browse files Browse the repository at this point in the history
This PR adds the NVML collector to workloadmeta, so that we collect data about the NVIDIA GPUs present in the system that can be used in other parts of the agent, including the tagger.
  • Loading branch information
gjulianm authored Jan 15, 2025
1 parent 00abd8c commit 14ca022
Show file tree
Hide file tree
Showing 59 changed files with 565 additions and 246 deletions.
1 change: 1 addition & 0 deletions .github/CODEOWNERS
Original file line number Diff line number Diff line change
Expand Up @@ -546,6 +546,7 @@
/pkg/tagger/ @DataDog/container-platform
/pkg/windowsdriver/ @DataDog/windows-kernel-integrations
/comp/core/workloadmeta/collectors/internal/cloudfoundry @DataDog/platform-integrations
/comp/core/workloadmeta/collectors/internal/nvml @DataDog/ebpf-platform
/pkg/sbom/ @DataDog/container-integrations @DataDog/agent-security
/pkg/internaltelemetry @DataDog/windows-kernel-integrations @DataDog/fleet
/pkg/networkpath/ @DataDog/network-device-monitoring @DataDog/Networks
Expand Down
1 change: 1 addition & 0 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1170,6 +1170,7 @@ workflow:
- pkg/gpu/**/*
- test/new-e2e/tests/gpu/**/*
- pkg/collector/corechecks/gpu/**/*
- comp/core/workloadmeta/collectors/internal/nvml/**/*
compare_to: main # TODO: use a variable, when this is supported https://gitlab.com/gitlab-org/gitlab/-/issues/369916

# windows_docker_2022 configures the job to use the Windows Server 2022 runners.
Expand Down
12 changes: 6 additions & 6 deletions comp/api/authtoken/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -66,15 +66,15 @@ require (
github.com/DataDog/datadog-agent/pkg/config/teeconfig v0.60.0-devel // indirect
github.com/DataDog/datadog-agent/pkg/config/utils v0.56.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/executable v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/filesystem v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/filesystem v0.60.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/hostname/validate v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/log v0.59.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/log v0.60.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/log/setup v0.58.0-devel // indirect
github.com/DataDog/datadog-agent/pkg/util/pointer v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/scrubber v0.59.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/system v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/pointer v0.60.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/scrubber v0.60.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/system v0.60.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/system/socket v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/winutil v0.59.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/winutil v0.60.1 // indirect
github.com/DataDog/datadog-agent/pkg/version v0.59.1 // indirect
github.com/DataDog/viper v1.14.0 // indirect
github.com/Microsoft/go-winio v0.6.2 // indirect
Expand Down
12 changes: 6 additions & 6 deletions comp/core/config/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ require (
github.com/DataDog/datadog-agent/pkg/util/defaultpaths v0.0.0-00010101000000-000000000000
github.com/DataDog/datadog-agent/pkg/util/fxutil v0.56.0-rc.3
github.com/DataDog/datadog-agent/pkg/util/option v0.59.0
github.com/DataDog/datadog-agent/pkg/util/winutil v0.59.1
github.com/DataDog/datadog-agent/pkg/util/winutil v0.60.1
github.com/DataDog/viper v1.14.0
github.com/stretchr/testify v1.10.0
go.uber.org/fx v1.23.0
Expand All @@ -58,12 +58,12 @@ require (
github.com/DataDog/datadog-agent/pkg/config/structure v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/config/teeconfig v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/executable v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/filesystem v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/filesystem v0.60.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/hostname/validate v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/log v0.59.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/pointer v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/scrubber v0.59.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/system v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/log v0.60.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/pointer v0.60.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/scrubber v0.60.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/system v0.60.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/system/socket v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/version v0.59.1 // indirect
github.com/Microsoft/go-winio v0.6.2 // indirect
Expand Down
12 changes: 6 additions & 6 deletions comp/core/log/impl-trace/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ require (
github.com/DataDog/datadog-agent/pkg/config/env v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/trace v0.56.0-rc.3
github.com/DataDog/datadog-agent/pkg/util/fxutil v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/log v0.59.1
github.com/DataDog/datadog-agent/pkg/util/log v0.60.1
github.com/cihub/seelog v0.0.0-20170130134532-f561c5e57575 // indirect; v2.6
github.com/stretchr/testify v1.10.0
go.uber.org/fx v1.23.0 // indirect
Expand All @@ -68,14 +68,14 @@ require (
github.com/DataDog/datadog-agent/pkg/config/structure v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/config/teeconfig v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/executable v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/filesystem v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/filesystem v0.60.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/hostname/validate v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/option v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/pointer v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/scrubber v0.59.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/system v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/pointer v0.60.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/scrubber v0.60.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/system v0.60.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/system/socket v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/winutil v0.59.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/winutil v0.60.1 // indirect
github.com/DataDog/datadog-agent/pkg/version v0.59.1 // indirect
github.com/DataDog/viper v1.14.0 // indirect
github.com/Microsoft/go-winio v0.6.2 // indirect
Expand Down
12 changes: 6 additions & 6 deletions comp/core/log/impl/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ require (
github.com/DataDog/datadog-agent/comp/core/log/def v0.0.0-00010101000000-000000000000
github.com/DataDog/datadog-agent/comp/def v0.56.0-rc.3
github.com/DataDog/datadog-agent/pkg/config/mock v0.59.0
github.com/DataDog/datadog-agent/pkg/util/log v0.59.1
github.com/DataDog/datadog-agent/pkg/util/log v0.60.1
github.com/DataDog/datadog-agent/pkg/util/log/setup v0.0.0-00010101000000-000000000000
github.com/stretchr/testify v1.10.0
)
Expand All @@ -56,15 +56,15 @@ require (
github.com/DataDog/datadog-agent/pkg/config/structure v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/config/teeconfig v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/executable v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/filesystem v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/filesystem v0.60.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/fxutil v0.56.0-rc.3 // indirect
github.com/DataDog/datadog-agent/pkg/util/hostname/validate v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/option v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/pointer v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/scrubber v0.59.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/system v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/pointer v0.60.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/scrubber v0.60.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/system v0.60.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/system/socket v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/winutil v0.59.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/winutil v0.60.1 // indirect
github.com/DataDog/datadog-agent/pkg/version v0.59.1 // indirect
github.com/DataDog/viper v1.14.0 // indirect
github.com/Microsoft/go-winio v0.6.2 // indirect
Expand Down
4 changes: 2 additions & 2 deletions comp/core/log/mock/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,15 @@ replace (

require (
github.com/DataDog/datadog-agent/comp/core/log/def v0.0.0-00010101000000-000000000000
github.com/DataDog/datadog-agent/pkg/util/log v0.59.1
github.com/DataDog/datadog-agent/pkg/util/log v0.60.1
github.com/DataDog/datadog-agent/pkg/util/log/setup v0.0.0-00010101000000-000000000000
)

require (
github.com/DataDog/datadog-agent/pkg/config/model v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/config/nodetreemodel v0.60.0-devel // indirect
github.com/DataDog/datadog-agent/pkg/config/teeconfig v0.60.0-devel // indirect
github.com/DataDog/datadog-agent/pkg/util/scrubber v0.59.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/scrubber v0.60.1 // indirect
github.com/DataDog/datadog-agent/pkg/version v0.59.1 // indirect
github.com/DataDog/viper v1.14.0 // indirect
github.com/cihub/seelog v0.0.0-20170130134532-f561c5e57575 // indirect
Expand Down
12 changes: 6 additions & 6 deletions comp/core/status/statusimpl/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -68,16 +68,16 @@ require (
github.com/DataDog/datadog-agent/pkg/config/structure v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/config/teeconfig v0.60.0-devel // indirect
github.com/DataDog/datadog-agent/pkg/util/executable v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/filesystem v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/filesystem v0.60.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/hostname/validate v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/log v0.59.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/log v0.60.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/log/setup v0.56.0-rc.3 // indirect
github.com/DataDog/datadog-agent/pkg/util/option v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/pointer v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/scrubber v0.59.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/system v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/pointer v0.60.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/scrubber v0.60.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/system v0.60.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/system/socket v0.59.0 // indirect
github.com/DataDog/datadog-agent/pkg/util/winutil v0.59.1 // indirect
github.com/DataDog/datadog-agent/pkg/util/winutil v0.60.1 // indirect
github.com/DataDog/viper v1.14.0 // indirect
github.com/Microsoft/go-winio v0.6.2 // indirect
github.com/cihub/seelog v0.0.0-20170130134532-f561c5e57575 // indirect
Expand Down
2 changes: 2 additions & 0 deletions comp/core/workloadmeta/collectors/catalog-core/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"github.com/DataDog/datadog-agent/comp/core/workloadmeta/collectors/internal/kubeapiserver"
"github.com/DataDog/datadog-agent/comp/core/workloadmeta/collectors/internal/kubelet"
"github.com/DataDog/datadog-agent/comp/core/workloadmeta/collectors/internal/kubemetadata"
"github.com/DataDog/datadog-agent/comp/core/workloadmeta/collectors/internal/nvml"
"github.com/DataDog/datadog-agent/comp/core/workloadmeta/collectors/internal/podman"
"github.com/DataDog/datadog-agent/comp/core/workloadmeta/collectors/internal/process"
remoteprocesscollector "github.com/DataDog/datadog-agent/comp/core/workloadmeta/collectors/internal/remote/processcollector"
Expand All @@ -41,5 +42,6 @@ func getCollectorOptions() []fx.Option {
podman.GetFxOptions(),
remoteprocesscollector.GetFxOptions(),
process.GetFxOptions(),
nvml.GetFxOptions(),
}
}
2 changes: 2 additions & 0 deletions comp/core/workloadmeta/collectors/catalog/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"github.com/DataDog/datadog-agent/comp/core/workloadmeta/collectors/internal/kubeapiserver"
"github.com/DataDog/datadog-agent/comp/core/workloadmeta/collectors/internal/kubelet"
"github.com/DataDog/datadog-agent/comp/core/workloadmeta/collectors/internal/kubemetadata"
"github.com/DataDog/datadog-agent/comp/core/workloadmeta/collectors/internal/nvml"
"github.com/DataDog/datadog-agent/comp/core/workloadmeta/collectors/internal/podman"
"github.com/DataDog/datadog-agent/comp/core/workloadmeta/collectors/internal/remote/processcollector"
remoteworkloadmeta "github.com/DataDog/datadog-agent/comp/core/workloadmeta/collectors/internal/remote/workloadmeta"
Expand All @@ -42,5 +43,6 @@ func getCollectorOptions() []fx.Option {
remoteworkloadmeta.GetFxOptions(),
remoteWorkloadmetaParams(),
processcollector.GetFxOptions(),
nvml.GetFxOptions(),
}
}
124 changes: 124 additions & 0 deletions comp/core/workloadmeta/collectors/internal/nvml/nvml.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0.
// This product includes software developed at Datadog (https://www.datadoghq.com/).
// Copyright 2024-present Datadog, Inc.

//go:build linux

package nvml

import (
"context"
"fmt"

"go.uber.org/fx"

"github.com/NVIDIA/go-nvml/pkg/nvml"

workloadmeta "github.com/DataDog/datadog-agent/comp/core/workloadmeta/def"
"github.com/DataDog/datadog-agent/pkg/config/env"
"github.com/DataDog/datadog-agent/pkg/errors"
)

const (
// collectorID is the identifier given to the collector built by NewCollector.
collectorID = "nvml"
// componentName is the component name reported in the "disabled" error
// returned by Start when the NVML feature is not present.
componentName = "workloadmeta-nvml"
// nvidiaVendor is the vendor recorded on every GPU entity emitted by Pull.
nvidiaVendor = "nvidia"
)

// collector is the workloadmeta collector that reports the GPUs visible
// through the NVML library.
type collector struct {
// id is the collector identifier returned by GetID.
id string
// catalog is the agent type returned by GetTargetCatalog.
catalog workloadmeta.AgentType
// store receives the GPU events produced by Pull; it is set in Start.
store workloadmeta.Component
// nvmlLib is the NVML library interface; it is created and initialized in Start.
nvmlLib nvml.Interface
}

// NewCollector returns a CollectorProvider wrapping a new NVML collector that
// targets the node agent catalog. The NVML library itself is not touched here;
// it is only initialized when the collector is started. The returned error is
// always nil.
func NewCollector() (workloadmeta.CollectorProvider, error) {
	nvmlCollector := &collector{
		id:      collectorID,
		catalog: workloadmeta.NodeAgent,
	}
	provider := workloadmeta.CollectorProvider{Collector: nvmlCollector}

	return provider, nil
}

// GetFxOptions returns the FX framework options for the collector: a single
// fx.Provide option that registers NewCollector as a constructor.
func GetFxOptions() fx.Option {
return fx.Provide(NewCollector)
}

// Start records the workloadmeta store and initializes the NVML library.
//
// When the NVML feature has not been detected in the environment, a "disabled"
// error is returned and nothing is initialized. An NVML library that reports it
// was already initialized is treated as a success; any other non-success return
// code is converted into an error.
func (c *collector) Start(_ context.Context, store workloadmeta.Component) error {
	if nvmlPresent := env.IsFeaturePresent(env.NVML); !nvmlPresent {
		return errors.NewDisabled(componentName, "Agent does not have NVML library available")
	}

	c.store = store
	// TODO: Add configuration option for NVML library path
	c.nvmlLib = nvml.New()

	switch ret := c.nvmlLib.Init(); ret {
	case nvml.SUCCESS, nvml.ERROR_ALREADY_INITIALIZED:
		return nil
	default:
		return fmt.Errorf("failed to initialize NVML library: %v", nvml.ErrorString(ret))
	}
}

// Pull enumerates the GPUs reported by NVML and notifies the store with one
// "set" event per device.
//
// Each GPU entity is identified by the device UUID and carries the device name
// (both as entity name and as Device), the NVIDIA vendor and the NVML index.
//
// It returns an error, without notifying the store, as soon as the device
// count or any device's handle, UUID or name cannot be retrieved.
func (c *collector) Pull(_ context.Context) error {
	count, ret := c.nvmlLib.DeviceGetCount()
	if ret != nvml.SUCCESS {
		return fmt.Errorf("failed to get device count: %v", nvml.ErrorString(ret))
	}

	// The number of devices is known up front, so allocate the backing array
	// once instead of letting append grow it on every iteration.
	events := make([]workloadmeta.CollectorEvent, 0, count)
	for i := 0; i < count; i++ {
		dev, ret := c.nvmlLib.DeviceGetHandleByIndex(i)
		if ret != nvml.SUCCESS {
			return fmt.Errorf("failed to get device handle for index %d: %v", i, nvml.ErrorString(ret))
		}

		uuid, ret := dev.GetUUID()
		if ret != nvml.SUCCESS {
			return fmt.Errorf("failed to get device UUID for index %d: %v", i, nvml.ErrorString(ret))
		}

		name, ret := dev.GetName()
		if ret != nvml.SUCCESS {
			return fmt.Errorf("failed to get device name for index %d: %v", i, nvml.ErrorString(ret))
		}

		gpu := &workloadmeta.GPU{
			EntityID: workloadmeta.EntityID{
				Kind: workloadmeta.KindGPU,
				ID:   uuid,
			},
			EntityMeta: workloadmeta.EntityMeta{
				Name: name,
			},
			Vendor: nvidiaVendor,
			Device: name,
			Index:  i,
		}

		events = append(events, workloadmeta.CollectorEvent{
			Source: workloadmeta.SourceRuntime,
			Type:   workloadmeta.EventTypeSet,
			Entity: gpu,
		})
	}

	c.store.Notify(events)

	return nil
}

// GetID returns the identifier the collector was created with.
func (c *collector) GetID() string {
return c.id
}

// GetTargetCatalog returns the agent type (catalog) the collector was created for.
func (c *collector) GetTargetCatalog() workloadmeta.AgentType {
return c.catalog
}
15 changes: 15 additions & 0 deletions comp/core/workloadmeta/collectors/internal/nvml/nvml_nop.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0.
// This product includes software developed at Datadog (https://www.datadoghq.com/).
// Copyright 2024-present Datadog, Inc.

//go:build !linux

package nvml

import "go.uber.org/fx"

// GetFxOptions returns the FX framework options for the collector.
// This variant provides no collector and always returns nil.
// NOTE(review): callers presumably skip nil options when assembling the
// collector catalog — confirm.
func GetFxOptions() fx.Option {
return nil
}
48 changes: 48 additions & 0 deletions comp/core/workloadmeta/collectors/internal/nvml/nvml_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0.
// This product includes software developed at Datadog (https://www.datadoghq.com/).
// Copyright 2024-present Datadog, Inc.

//go:build linux

package nvml

import (
"context"
"testing"

"github.com/stretchr/testify/require"

workloadmeta "github.com/DataDog/datadog-agent/comp/core/workloadmeta/def"
"github.com/DataDog/datadog-agent/pkg/gpu/testutil"
)

// TestPull verifies that a single Pull against the basic NVML mock stores one
// GPU entity per mocked device UUID, each carrying the expected vendor, name
// and device values.
func TestPull(t *testing.T) {
	wmetaMock := testutil.GetWorkloadMetaMock(t)
	nvmlMock := testutil.GetBasicNvmlMock()

	c := &collector{
		id:      collectorID,
		catalog: workloadmeta.NodeAgent,
		store:   wmetaMock,
		nvmlLib: nvmlMock,
	}

	// Fail right here if Pull errors out; otherwise the failure would only
	// surface below as a misleading GPU count mismatch.
	require.NoError(t, c.Pull(context.Background()))

	gpus := wmetaMock.ListGPUs()
	require.Len(t, gpus, len(testutil.GPUUUIDs))

	foundIDs := make(map[string]bool, len(gpus))
	for _, gpu := range gpus {
		foundIDs[gpu.ID] = true

		require.Equal(t, nvidiaVendor, gpu.Vendor)
		require.Equal(t, testutil.DefaultGPUName, gpu.Name)
		require.Equal(t, testutil.DefaultGPUName, gpu.Device)
	}

	for _, uuid := range testutil.GPUUUIDs {
		require.True(t, foundIDs[uuid], "GPU with UUID %s not found", uuid)
	}
}
7 changes: 7 additions & 0 deletions comp/core/workloadmeta/collectors/internal/nvml/stub.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0.
// This product includes software developed at Datadog (https://www.datadoghq.com/).
// Copyright 2024-present Datadog, Inc.

// Package nvml implements the NVML collector for workloadmeta
package nvml
Loading

0 comments on commit 14ca022

Please sign in to comment.