From 8c4fcbafa1aede32c87e9e438d756cbce13acadc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Julia=CC=81n?= Date: Wed, 11 Dec 2024 17:06:51 +0100 Subject: [PATCH] Add NVML collector --- .github/CODEOWNERS | 1 + .../collectors/internal/nvml/nvml.go | 120 ++++++++++++++++++ .../collectors/internal/nvml/nvml_nop.go | 15 +++ .../collectors/internal/nvml/stub.go | 7 + .../env/environment_container_features.go | 2 + pkg/config/env/environment_containers.go | 15 +++ 6 files changed, 160 insertions(+) create mode 100644 comp/core/workloadmeta/collectors/internal/nvml/nvml.go create mode 100644 comp/core/workloadmeta/collectors/internal/nvml/nvml_nop.go create mode 100644 comp/core/workloadmeta/collectors/internal/nvml/stub.go diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 4647c2ede6ae00..7a9b5c9fce8413 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -539,6 +539,7 @@ /pkg/tagger/ @DataDog/container-platform /pkg/windowsdriver/ @DataDog/windows-kernel-integrations /comp/core/workloadmeta/collectors/internal/cloudfoundry @DataDog/platform-integrations +/comp/core/workloadmeta/collectors/internal/nvml @DataDog/ebpf-platform /pkg/sbom/ @DataDog/container-integrations @DataDog/agent-security /pkg/internaltelemetry @DataDog/windows-kernel-integrations @DataDog/fleet /pkg/networkpath/ @DataDog/network-device-monitoring @DataDog/Networks diff --git a/comp/core/workloadmeta/collectors/internal/nvml/nvml.go b/comp/core/workloadmeta/collectors/internal/nvml/nvml.go new file mode 100644 index 00000000000000..e0d75e08179fc9 --- /dev/null +++ b/comp/core/workloadmeta/collectors/internal/nvml/nvml.go @@ -0,0 +1,120 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2024-present Datadog, Inc. 
+ +//go:build linux + +package nvml + +import ( + "context" + "fmt" + + "go.uber.org/fx" + + "github.com/NVIDIA/go-nvml/pkg/nvml" + + workloadmeta "github.com/DataDog/datadog-agent/comp/core/workloadmeta/def" + "github.com/DataDog/datadog-agent/pkg/config/env" + "github.com/DataDog/datadog-agent/pkg/errors" +) + +const ( + collectorID = "nvml" + componentName = "workloadmeta-nvml" +) + +type collector struct { + id string + catalog workloadmeta.AgentType + store workloadmeta.Component + nvmlLib nvml.Interface +} + +// NewCollector returns a CollectorProvider for the NVML collector, whose target catalog is workloadmeta.NodeAgent +func NewCollector() (workloadmeta.CollectorProvider, error) { + return workloadmeta.CollectorProvider{ + Collector: &collector{ + id: collectorID, + catalog: workloadmeta.NodeAgent, + }, + }, nil +} + +// GetFxOptions returns the FX framework options for the collector +func GetFxOptions() fx.Option { + return fx.Provide(NewCollector) +} + +func (c *collector) Start(_ context.Context, store workloadmeta.Component) error { + if !env.IsFeaturePresent(env.NVML) { + return errors.NewDisabled(componentName, "Agent does not have NVML library available") + } + + c.store = store + // TODO: Add configuration option for NVML library path + c.nvmlLib = nvml.New() + ret := c.nvmlLib.Init() + if ret != nvml.SUCCESS && ret != nvml.ERROR_ALREADY_INITIALIZED { + return fmt.Errorf("failed to initialize NVML library: %v", nvml.ErrorString(ret)) + } + + return nil +} + +func (c *collector) Pull(ctx context.Context) error { + count, ret := c.nvmlLib.DeviceGetCount() + if ret != nvml.SUCCESS { + return fmt.Errorf("failed to get device count: %v", nvml.ErrorString(ret)) + } + + var events []workloadmeta.CollectorEvent + for i := 0; i < count; i++ { + dev, ret := c.nvmlLib.DeviceGetHandleByIndex(i) + if ret != nvml.SUCCESS { + return fmt.Errorf("failed to get device handle for index %d: %v", i, nvml.ErrorString(ret)) + } + + uuid, ret := dev.GetUUID() + if ret != nvml.SUCCESS { + return 
fmt.Errorf("failed to get device UUID for index %d: %v", i, nvml.ErrorString(ret)) + } + + model, ret := dev.GetName() + if ret != nvml.SUCCESS { + return fmt.Errorf("failed to get device model for index %d: %v", i, nvml.ErrorString(ret)) + } + + gpu := &workloadmeta.GPU{ + EntityID: workloadmeta.EntityID{ + Kind: workloadmeta.KindGPU, + ID: uuid, + }, + EntityMeta: workloadmeta.EntityMeta{ + Name: model, + }, + Vendor: "nvidia", + Model: model, + } + + event := workloadmeta.CollectorEvent{ + Source: workloadmeta.SourceRuntime, + Type: workloadmeta.EventTypeSet, + Entity: gpu, + } + events = append(events, event) + } + + c.store.Notify(events) + + return nil +} + +func (c *collector) GetID() string { + return c.id +} + +func (c *collector) GetTargetCatalog() workloadmeta.AgentType { + return c.catalog +} diff --git a/comp/core/workloadmeta/collectors/internal/nvml/nvml_nop.go b/comp/core/workloadmeta/collectors/internal/nvml/nvml_nop.go new file mode 100644 index 00000000000000..7d6beacc270f97 --- /dev/null +++ b/comp/core/workloadmeta/collectors/internal/nvml/nvml_nop.go @@ -0,0 +1,15 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2024-present Datadog, Inc. + +//go:build !linux + +package nvml + +import "go.uber.org/fx" + +// GetFxOptions returns a nil fx.Option on non-linux builds (this file is built with the !linux tag) +func GetFxOptions() fx.Option { + return nil +} diff --git a/comp/core/workloadmeta/collectors/internal/nvml/stub.go b/comp/core/workloadmeta/collectors/internal/nvml/stub.go new file mode 100644 index 00000000000000..bb7f692241c977 --- /dev/null +++ b/comp/core/workloadmeta/collectors/internal/nvml/stub.go @@ -0,0 +1,7 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. 
+// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2024-present Datadog, Inc. + +// Package nvml implements the NVML collector for workloadmeta +package nvml diff --git a/pkg/config/env/environment_container_features.go b/pkg/config/env/environment_container_features.go index ffced09b49c611..ebb854289e938d 100644 --- a/pkg/config/env/environment_container_features.go +++ b/pkg/config/env/environment_container_features.go @@ -33,4 +33,6 @@ const ( Podman Feature = "podman" // PodResources socket present PodResources Feature = "podresources" + // NVML library present for GPU detection + NVML Feature = "nvml" ) diff --git a/pkg/config/env/environment_containers.go b/pkg/config/env/environment_containers.go index 7dbb0cf9ce58f3..fa57cb3a35a481 100644 --- a/pkg/config/env/environment_containers.go +++ b/pkg/config/env/environment_containers.go @@ -14,6 +14,8 @@ import ( "strings" "time" + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/DataDog/datadog-agent/pkg/config/model" "github.com/DataDog/datadog-agent/pkg/util/log" "github.com/DataDog/datadog-agent/pkg/util/system/socket" @@ -47,6 +49,7 @@ func init() { registerFeature(CloudFoundry) registerFeature(Podman) registerFeature(PodResources) + registerFeature(NVML) } // IsAnyContainerFeaturePresent checks if any of known container features is present @@ -71,6 +74,7 @@ func detectContainerFeatures(features FeatureMap, cfg model.Reader) { detectCloudFoundry(features, cfg) detectPodman(features, cfg) detectPodResources(features, cfg) + detectNVML(features, cfg) } func detectKubernetes(features FeatureMap, cfg model.Reader) { @@ -243,6 +247,17 @@ func detectPodResources(features FeatureMap, cfg model.Reader) { } } +func detectNVML(features FeatureMap, cfg model.Reader) { + // TODO: Add configuration option for NVML library path. 
NOTE(review): Init() below has no matching Shutdown() in this function; confirm that is intended + nvmlLib := nvml.New() + ret := nvmlLib.Init() + if ret == nvml.SUCCESS || ret == nvml.ERROR_ALREADY_INITIALIZED { + features[NVML] = struct{}{} 
+ } else { + log.Infof("Agent did not find NVML library for NVIDIA GPU detection: %v", nvml.ErrorString(ret)) + } +} + func getHostMountPrefixes() []string { if IsContainerized() { return []string{"", defaultHostMountPrefix}