-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This PR adds the NVML collector to workloadmeta, so that we collect data about the NVIDIA GPUs present in the system that can be used in other parts of the agent, including the tagger.
- Loading branch information
Showing
59 changed files
with
565 additions
and
246 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
124 changes: 124 additions & 0 deletions
124
comp/core/workloadmeta/collectors/internal/nvml/nvml.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
// Unless explicitly stated otherwise all files in this repository are licensed | ||
// under the Apache License Version 2.0. | ||
// This product includes software developed at Datadog (https://www.datadoghq.com/). | ||
// Copyright 2024-present Datadog, Inc. | ||
|
||
//go:build linux | ||
|
||
package nvml | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
|
||
"go.uber.org/fx" | ||
|
||
"github.com/NVIDIA/go-nvml/pkg/nvml" | ||
|
||
workloadmeta "github.com/DataDog/datadog-agent/comp/core/workloadmeta/def" | ||
"github.com/DataDog/datadog-agent/pkg/config/env" | ||
"github.com/DataDog/datadog-agent/pkg/errors" | ||
) | ||
|
||
const (
	// collectorID is the identifier assigned to the collector built by NewCollector.
	collectorID = "nvml"
	// componentName is the component name reported when the collector is disabled.
	componentName = "workloadmeta-nvml"
	// nvidiaVendor is the vendor value set on every GPU entity emitted by Pull.
	nvidiaVendor = "nvidia"
)
|
||
type collector struct { | ||
id string | ||
catalog workloadmeta.AgentType | ||
store workloadmeta.Component | ||
nvmlLib nvml.Interface | ||
} | ||
|
||
// NewCollector returns a kubelet CollectorProvider that instantiates its collector | ||
func NewCollector() (workloadmeta.CollectorProvider, error) { | ||
return workloadmeta.CollectorProvider{ | ||
Collector: &collector{ | ||
id: collectorID, | ||
catalog: workloadmeta.NodeAgent, | ||
}, | ||
}, nil | ||
} | ||
|
||
// GetFxOptions returns the FX framework options for the collector | ||
func GetFxOptions() fx.Option { | ||
return fx.Provide(NewCollector) | ||
} | ||
|
||
// Start initializes the NVML library and sets the store | ||
func (c *collector) Start(_ context.Context, store workloadmeta.Component) error { | ||
if !env.IsFeaturePresent(env.NVML) { | ||
return errors.NewDisabled(componentName, "Agent does not have NVML library available") | ||
} | ||
|
||
c.store = store | ||
// TODO: Add configuration option for NVML library path | ||
c.nvmlLib = nvml.New() | ||
ret := c.nvmlLib.Init() | ||
if ret != nvml.SUCCESS && ret != nvml.ERROR_ALREADY_INITIALIZED { | ||
return fmt.Errorf("failed to initialize NVML library: %v", nvml.ErrorString(ret)) | ||
} | ||
|
||
return nil | ||
} | ||
|
||
// Pull collects the GPUs available on the node and notifies the store | ||
func (c *collector) Pull(_ context.Context) error { | ||
count, ret := c.nvmlLib.DeviceGetCount() | ||
if ret != nvml.SUCCESS { | ||
return fmt.Errorf("failed to get device count: %v", nvml.ErrorString(ret)) | ||
} | ||
|
||
var events []workloadmeta.CollectorEvent | ||
for i := 0; i < count; i++ { | ||
dev, ret := c.nvmlLib.DeviceGetHandleByIndex(i) | ||
if ret != nvml.SUCCESS { | ||
return fmt.Errorf("failed to get device handle for index %d: %v", i, nvml.ErrorString(ret)) | ||
} | ||
|
||
uuid, ret := dev.GetUUID() | ||
if ret != nvml.SUCCESS { | ||
return fmt.Errorf("failed to get device UUID for index %d: %v", i, nvml.ErrorString(ret)) | ||
} | ||
|
||
name, ret := dev.GetName() | ||
if ret != nvml.SUCCESS { | ||
return fmt.Errorf("failed to get device name for index %d: %v", i, nvml.ErrorString(ret)) | ||
} | ||
|
||
gpu := &workloadmeta.GPU{ | ||
EntityID: workloadmeta.EntityID{ | ||
Kind: workloadmeta.KindGPU, | ||
ID: uuid, | ||
}, | ||
EntityMeta: workloadmeta.EntityMeta{ | ||
Name: name, | ||
}, | ||
Vendor: nvidiaVendor, | ||
Device: name, | ||
Index: i, | ||
} | ||
|
||
event := workloadmeta.CollectorEvent{ | ||
Source: workloadmeta.SourceRuntime, | ||
Type: workloadmeta.EventTypeSet, | ||
Entity: gpu, | ||
} | ||
events = append(events, event) | ||
} | ||
|
||
c.store.Notify(events) | ||
|
||
return nil | ||
} | ||
|
||
func (c *collector) GetID() string { | ||
return c.id | ||
} | ||
|
||
func (c *collector) GetTargetCatalog() workloadmeta.AgentType { | ||
return c.catalog | ||
} |
15 changes: 15 additions & 0 deletions
15
comp/core/workloadmeta/collectors/internal/nvml/nvml_nop.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
// Unless explicitly stated otherwise all files in this repository are licensed | ||
// under the Apache License Version 2.0. | ||
// This product includes software developed at Datadog (https://www.datadoghq.com/). | ||
// Copyright 2024-present Datadog, Inc. | ||
|
||
//go:build !linux | ||
|
||
package nvml | ||
|
||
import "go.uber.org/fx" | ||
|
||
// GetFxOptions returns the FX framework options for the collector | ||
func GetFxOptions() fx.Option { | ||
return nil | ||
} |
48 changes: 48 additions & 0 deletions
48
comp/core/workloadmeta/collectors/internal/nvml/nvml_test.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
// Unless explicitly stated otherwise all files in this repository are licensed | ||
// under the Apache License Version 2.0. | ||
// This product includes software developed at Datadog (https://www.datadoghq.com/). | ||
// Copyright 2024-present Datadog, Inc. | ||
|
||
//go:build linux | ||
|
||
package nvml | ||
|
||
import ( | ||
"context" | ||
"testing" | ||
|
||
"github.com/stretchr/testify/require" | ||
|
||
workloadmeta "github.com/DataDog/datadog-agent/comp/core/workloadmeta/def" | ||
"github.com/DataDog/datadog-agent/pkg/gpu/testutil" | ||
) | ||
|
||
func TestPull(t *testing.T) { | ||
wmetaMock := testutil.GetWorkloadMetaMock(t) | ||
nvmlMock := testutil.GetBasicNvmlMock() | ||
|
||
c := &collector{ | ||
id: collectorID, | ||
catalog: workloadmeta.NodeAgent, | ||
store: wmetaMock, | ||
nvmlLib: nvmlMock, | ||
} | ||
|
||
c.Pull(context.Background()) | ||
|
||
gpus := wmetaMock.ListGPUs() | ||
require.Equal(t, len(testutil.GPUUUIDs), len(gpus)) | ||
|
||
foundIDs := make(map[string]bool) | ||
for _, gpu := range gpus { | ||
foundIDs[gpu.ID] = true | ||
|
||
require.Equal(t, nvidiaVendor, gpu.Vendor) | ||
require.Equal(t, testutil.DefaultGPUName, gpu.Name) | ||
require.Equal(t, testutil.DefaultGPUName, gpu.Device) | ||
} | ||
|
||
for _, uuid := range testutil.GPUUUIDs { | ||
require.True(t, foundIDs[uuid], "GPU with UUID %s not found", uuid) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
// Unless explicitly stated otherwise all files in this repository are licensed | ||
// under the Apache License Version 2.0. | ||
// This product includes software developed at Datadog (https://www.datadoghq.com/). | ||
// Copyright 2024-present Datadog, Inc. | ||
|
||
// Package nvml implements the NVML collector for workloadmeta | ||
package nvml |
Oops, something went wrong.