Skip to content

Commit

Permalink
Add NVML collector
Browse files Browse the repository at this point in the history
  • Loading branch information
gjulianm committed Dec 11, 2024
1 parent 6f0a929 commit 8c4fcba
Show file tree
Hide file tree
Showing 6 changed files with 160 additions and 0 deletions.
1 change: 1 addition & 0 deletions .github/CODEOWNERS
Original file line number Diff line number Diff line change
Expand Up @@ -539,6 +539,7 @@
/pkg/tagger/ @DataDog/container-platform
/pkg/windowsdriver/ @DataDog/windows-kernel-integrations
/comp/core/workloadmeta/collectors/internal/cloudfoundry @DataDog/platform-integrations
/comp/core/workloadmeta/collectors/internal/nvml @DataDog/ebpf-platform
/pkg/sbom/ @DataDog/container-integrations @DataDog/agent-security
/pkg/internaltelemetry @DataDog/windows-kernel-integrations @DataDog/fleet
/pkg/networkpath/ @DataDog/network-device-monitoring @DataDog/Networks
Expand Down
120 changes: 120 additions & 0 deletions comp/core/workloadmeta/collectors/internal/nvml/nvml.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0.
// This product includes software developed at Datadog (https://www.datadoghq.com/).
// Copyright 2024-present Datadog, Inc.

//go:build linux

package nvml

import (
"context"
"fmt"

"go.uber.org/fx"

"github.com/NVIDIA/go-nvml/pkg/nvml"

workloadmeta "github.com/DataDog/datadog-agent/comp/core/workloadmeta/def"
"github.com/DataDog/datadog-agent/pkg/config/env"
"github.com/DataDog/datadog-agent/pkg/errors"
)

const (
	// collectorID is the identifier the collector reports through GetID.
	collectorID = "nvml"
	// componentName is the name reported when the collector is disabled
	// because the NVML feature was not detected.
	componentName = "workloadmeta-nvml"
)

// collector is the workloadmeta collector that reports NVIDIA GPUs discovered
// through the NVML library.
type collector struct {
	// id is the collector identifier returned by GetID.
	id string
	// catalog is the agent type returned by GetTargetCatalog.
	catalog workloadmeta.AgentType
	// store is the workloadmeta component that receives GPU events; it is
	// assigned in Start.
	store workloadmeta.Component
	// nvmlLib is the NVML library handle created and initialized in Start
	// and queried on every Pull.
	nvmlLib nvml.Interface
}

// NewCollector returns a workloadmeta CollectorProvider wrapping the NVML
// collector.
//
// Only the collector identity (ID and target catalog) is set here; the NVML
// library itself is created and initialized later, in Start. The returned
// error is always nil.
func NewCollector() (workloadmeta.CollectorProvider, error) {
	return workloadmeta.CollectorProvider{
		Collector: &collector{
			id:      collectorID,
			catalog: workloadmeta.NodeAgent,
		},
	}, nil
}

// GetFxOptions exposes the NVML collector constructor to the FX framework as
// a provided dependency.
func GetFxOptions() fx.Option {
	provided := fx.Provide(NewCollector)
	return provided
}

// Start prepares the collector for use.
//
// It returns a "disabled" error when the NVML feature was not detected in the
// environment. Otherwise it records the workloadmeta store, creates the NVML
// library handle and initializes it. A library that reports it was already
// initialized is treated the same as a successful initialization; any other
// return code is reported as an error.
func (c *collector) Start(_ context.Context, store workloadmeta.Component) error {
	if !env.IsFeaturePresent(env.NVML) {
		return errors.NewDisabled(componentName, "Agent does not have NVML library available")
	}

	c.store = store
	// TODO: Add configuration option for NVML library path
	c.nvmlLib = nvml.New()

	switch ret := c.nvmlLib.Init(); ret {
	case nvml.SUCCESS, nvml.ERROR_ALREADY_INITIALIZED:
		return nil
	default:
		return fmt.Errorf("failed to initialize NVML library: %v", nvml.ErrorString(ret))
	}
}

// Pull enumerates the GPUs currently visible through NVML and notifies the
// workloadmeta store with one "set" event per device.
//
// If the device count or any single device cannot be queried, Pull returns the
// error immediately and the store is not notified at all for this pull.
func (c *collector) Pull(ctx context.Context) error {
	deviceCount, ret := c.nvmlLib.DeviceGetCount()
	if ret != nvml.SUCCESS {
		return fmt.Errorf("failed to get device count: %v", nvml.ErrorString(ret))
	}

	var events []workloadmeta.CollectorEvent
	for idx := 0; idx < deviceCount; idx++ {
		entity, err := c.gpuEntityAt(idx)
		if err != nil {
			return err
		}

		events = append(events, workloadmeta.CollectorEvent{
			Source: workloadmeta.SourceRuntime,
			Type:   workloadmeta.EventTypeSet,
			Entity: entity,
		})
	}

	c.store.Notify(events)

	return nil
}

// gpuEntityAt builds the workloadmeta GPU entity for the NVML device at the
// given index, using the device UUID as entity ID and the device name as both
// entity name and model.
func (c *collector) gpuEntityAt(idx int) (*workloadmeta.GPU, error) {
	handle, ret := c.nvmlLib.DeviceGetHandleByIndex(idx)
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("failed to get device handle for index %d: %v", idx, nvml.ErrorString(ret))
	}

	deviceUUID, ret := handle.GetUUID()
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("failed to get device UUID for index %d: %v", idx, nvml.ErrorString(ret))
	}

	deviceName, ret := handle.GetName()
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("failed to get device model for index %d: %v", idx, nvml.ErrorString(ret))
	}

	return &workloadmeta.GPU{
		EntityID: workloadmeta.EntityID{
			Kind: workloadmeta.KindGPU,
			ID:   deviceUUID,
		},
		EntityMeta: workloadmeta.EntityMeta{
			Name: deviceName,
		},
		Vendor: "nvidia",
		Model:  deviceName,
	}, nil
}

// GetID returns the identifier this collector was constructed with.
func (c *collector) GetID() string {
	return c.id
}

// GetTargetCatalog returns the agent type this collector was constructed for.
func (c *collector) GetTargetCatalog() workloadmeta.AgentType {
	return c.catalog
}
15 changes: 15 additions & 0 deletions comp/core/workloadmeta/collectors/internal/nvml/nvml_nop.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0.
// This product includes software developed at Datadog (https://www.datadoghq.com/).
// Copyright 2024-present Datadog, Inc.

//go:build !linux

package nvml

import "go.uber.org/fx"

// GetFxOptions returns the FX framework options for the collector.
//
// This file is built only on non-Linux platforms, where the NVML collector is
// not compiled in, so the returned option is always nil.
func GetFxOptions() fx.Option {
	var none fx.Option // zero value of the interface, i.e. nil
	return none
}
7 changes: 7 additions & 0 deletions comp/core/workloadmeta/collectors/internal/nvml/stub.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0.
// This product includes software developed at Datadog (https://www.datadoghq.com/).
// Copyright 2024-present Datadog, Inc.

// Package nvml implements the NVML collector for workloadmeta
package nvml
2 changes: 2 additions & 0 deletions pkg/config/env/environment_container_features.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,6 @@ const (
Podman Feature = "podman"
// PodResources socket present
PodResources Feature = "podresources"
// NVML library present for GPU detection
NVML Feature = "nvml"
)
15 changes: 15 additions & 0 deletions pkg/config/env/environment_containers.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ import (
"strings"
"time"

"github.com/NVIDIA/go-nvml/pkg/nvml"

"github.com/DataDog/datadog-agent/pkg/config/model"
"github.com/DataDog/datadog-agent/pkg/util/log"
"github.com/DataDog/datadog-agent/pkg/util/system/socket"
Expand Down Expand Up @@ -47,6 +49,7 @@ func init() {
registerFeature(CloudFoundry)
registerFeature(Podman)
registerFeature(PodResources)
registerFeature(NVML)
}

// IsAnyContainerFeaturePresent checks if any of known container features is present
Expand All @@ -71,6 +74,7 @@ func detectContainerFeatures(features FeatureMap, cfg model.Reader) {
detectCloudFoundry(features, cfg)
detectPodman(features, cfg)
detectPodResources(features, cfg)
detectNVML(features, cfg)
}

func detectKubernetes(features FeatureMap, cfg model.Reader) {
Expand Down Expand Up @@ -243,6 +247,17 @@ func detectPodResources(features FeatureMap, cfg model.Reader) {
}
}

// detectNVML registers the NVML feature when the NVIDIA Management Library can
// be loaded and initialized on this host.
//
// The library is initialized here only to probe for its presence. When this
// probe performed the initialization itself (Init returned nvml.SUCCESS), the
// library is shut down again before returning so that feature detection does
// not keep NVML initialized for the rest of the process lifetime; code that
// needs NVML is expected to initialize it on its own. When NVML reports it was
// already initialized, it is left untouched because this probe does not own
// that initialization.
//
// The configuration reader is currently unused.
func detectNVML(features FeatureMap, _ model.Reader) {
	// TODO: Add configuration option for NVML library path
	nvmlLib := nvml.New()
	ret := nvmlLib.Init()
	if ret != nvml.SUCCESS && ret != nvml.ERROR_ALREADY_INITIALIZED {
		log.Infof("Agent did not find NVML library for NVIDIA GPU detection: %v", nvml.ErrorString(ret))
		return
	}

	features[NVML] = struct{}{}

	// Pair our own successful Init with a Shutdown. A failure here does not
	// change the detection result, so it is only logged.
	if ret == nvml.SUCCESS {
		if shutdownRet := nvmlLib.Shutdown(); shutdownRet != nvml.SUCCESS {
			log.Debugf("Failed to shut down NVML library after feature detection: %v", nvml.ErrorString(shutdownRet))
		}
	}
}

func getHostMountPrefixes() []string {
if IsContainerized() {
return []string{"", defaultHostMountPrefix}
Expand Down

0 comments on commit 8c4fcba

Please sign in to comment.