From 49cb49b4a1aa6b91cb8ffa31d5f3e066cabcaa62 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillermo=20Julia=CC=81n?=
Date: Wed, 11 Dec 2024 17:28:38 +0100
Subject: [PATCH 1/2] Add tagger support for GPUs

---
 .../tagger/collectors/workloadmeta_extract.go | 31 ++++++++++-
 .../tagger/collectors/workloadmeta_main.go    |  1 +
 .../tagger/collectors/workloadmeta_test.go    | 55 +++++++++++++++++++
 comp/core/tagger/common/entity_id_builder.go  |  2 +
 comp/core/tagger/tags/tags.go                 | 11 +++-
 comp/core/tagger/types/entity_id.go           |  3 +
 comp/core/tagger/types/filter_builder_test.go |  1 +
 7 files changed, 102 insertions(+), 2 deletions(-)

diff --git a/comp/core/tagger/collectors/workloadmeta_extract.go b/comp/core/tagger/collectors/workloadmeta_extract.go
index eee5f075d1a2b..b19b534856e00 100644
--- a/comp/core/tagger/collectors/workloadmeta_extract.go
+++ b/comp/core/tagger/collectors/workloadmeta_extract.go
@@ -150,7 +150,7 @@ func (c *WorkloadMetaCollector) processEvents(evBundle workloadmeta.EventBundle)
 	case workloadmeta.KindKubernetesDeployment:
 		tagInfos = append(tagInfos, c.handleKubeDeployment(ev)...)
 	case workloadmeta.KindGPU:
-		// tagInfos = append(tagInfos, c.handleGPU(ev)...) No tags for now
+		tagInfos = append(tagInfos, c.handleGPU(ev)...)
 	default:
 		log.Errorf("cannot handle event for entity %q with kind %q", entityID.ID, entityID.Kind)
 	}
@@ -615,6 +615,35 @@ func (c *WorkloadMetaCollector) handleKubeMetadata(ev workloadmeta.Event) []*typ
 	return tagInfos
 }
 
+func (c *WorkloadMetaCollector) handleGPU(ev workloadmeta.Event) []*types.TagInfo {
+	gpu := ev.Entity.(*workloadmeta.GPU)
+
+	tagList := taglist.NewTagList()
+
+	tagList.AddLow(tags.KubeGPUVendor, gpu.Vendor)
+	tagList.AddLow(tags.KubeGPUDevice, gpu.Device)
+	tagList.AddLow(tags.KubeGPUUUID, gpu.ID)
+
+	low, orch, high, standard := tagList.Compute()
+
+	if len(low)+len(orch)+len(high)+len(standard) == 0 {
+		return nil
+	}
+
+	tagInfos := []*types.TagInfo{
+		{
+			Source:               gpuSource,
+			EntityID:             common.BuildTaggerEntityID(gpu.EntityID),
+			HighCardTags:         high,
+			OrchestratorCardTags: orch,
+			LowCardTags:          low,
+			StandardTags:         standard,
+		},
+	}
+
+	return tagInfos
+}
+
 func (c *WorkloadMetaCollector) extractTagsFromPodLabels(pod *workloadmeta.KubernetesPod, tagList *taglist.TagList) {
 	for name, value := range pod.Labels {
 		switch name {
diff --git a/comp/core/tagger/collectors/workloadmeta_main.go b/comp/core/tagger/collectors/workloadmeta_main.go
index 527b35b0ece0f..c3dfc7cfd6d8c 100644
--- a/comp/core/tagger/collectors/workloadmeta_main.go
+++ b/comp/core/tagger/collectors/workloadmeta_main.go
@@ -35,6 +35,7 @@ const (
 	processSource      = workloadmetaCollectorName + "-" + string(workloadmeta.KindProcess)
 	kubeMetadataSource = workloadmetaCollectorName + "-" + string(workloadmeta.KindKubernetesMetadata)
 	deploymentSource   = workloadmetaCollectorName + "-" + string(workloadmeta.KindKubernetesDeployment)
+	gpuSource          = workloadmetaCollectorName + "-" + string(workloadmeta.KindGPU)
 
 	clusterTagNamePrefix = "kube_cluster_name"
 )
diff --git a/comp/core/tagger/collectors/workloadmeta_test.go b/comp/core/tagger/collectors/workloadmeta_test.go
index a15d7e674f41e..5837cbfbaf260 100644
--- a/comp/core/tagger/collectors/workloadmeta_test.go
+++ b/comp/core/tagger/collectors/workloadmeta_test.go
@@ -2253,6 +2253,61 @@ func TestHandleContainerImage(t *testing.T) {
 	}
 }
 
+func TestHandleGPU(t *testing.T) {
+	entityID := workloadmeta.EntityID{
+		Kind: workloadmeta.KindGPU,
+		ID:   "gpu-1234",
+	}
+
+	taggerEntityID := types.NewEntityID(types.GPU, entityID.ID)
+
+	tests := []struct {
+		name     string
+		gpu      workloadmeta.GPU
+		expected []*types.TagInfo
+	}{
+		{
+			name: "basic",
+			gpu: workloadmeta.GPU{
+				EntityID: entityID,
+				EntityMeta: workloadmeta.EntityMeta{
+					Name: entityID.ID,
+				},
+				Vendor: "nvidia",
+				Device: "tesla-v100",
+			},
+			expected: []*types.TagInfo{
+				{
+					Source:               gpuSource,
+					EntityID:             taggerEntityID,
+					HighCardTags:         []string{},
+					OrchestratorCardTags: []string{},
+					LowCardTags: []string{
+						"gpu_vendor:nvidia",
+						"gpu_device:tesla-v100",
+						"gpu_uuid:gpu-1234",
+					},
+					StandardTags: []string{},
+				},
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			cfg := configmock.New(t)
+			collector := NewWorkloadMetaCollector(context.Background(), cfg, nil, nil)
+
+			actual := collector.handleGPU(workloadmeta.Event{
+				Type:   workloadmeta.EventTypeSet,
+				Entity: &tt.gpu,
+			})
+
+			assertTagInfoListEqual(t, tt.expected, actual)
+		})
+	}
+}
+
 func TestHandleDelete(t *testing.T) {
 	const (
 		podName = "datadog-agent-foobar"
diff --git a/comp/core/tagger/common/entity_id_builder.go b/comp/core/tagger/common/entity_id_builder.go
index 054a0b4493ba6..108f440e0cf3f 100644
--- a/comp/core/tagger/common/entity_id_builder.go
+++ b/comp/core/tagger/common/entity_id_builder.go
@@ -29,6 +29,8 @@ func BuildTaggerEntityID(entityID workloadmeta.EntityID) types.EntityID {
 		return types.NewEntityID(types.KubernetesDeployment, entityID.ID)
 	case workloadmeta.KindKubernetesMetadata:
 		return types.NewEntityID(types.KubernetesMetadata, entityID.ID)
+	case workloadmeta.KindGPU:
+		return types.NewEntityID(types.GPU, entityID.ID)
 	default:
 		log.Errorf("can't recognize entity %q with kind %q; trying %s://%s as tagger entity",
 			entityID.ID, entityID.Kind, entityID.ID, entityID.Kind)
diff --git a/comp/core/tagger/tags/tags.go b/comp/core/tagger/tags/tags.go
index 818ab9a7576cb..8b4517f96e371 100644
--- a/comp/core/tagger/tags/tags.go
+++ b/comp/core/tagger/tags/tags.go
@@ -91,9 +91,18 @@ const (
 
 	// GPU related tags
 
-	// KubeGPUVendor the tag for the Kubernetes Resource GPU vendor
+	// KubeGPUVendor is the tag for the Kubernetes Resource GPU vendor (e.g., NVIDIA).
 	KubeGPUVendor = "gpu_vendor"
 
+	// KubeGPUDevice is the tag for the Kubernetes Resource GPU device. This is
+	// the commercial name of the device (e.g., Tesla T4). See
+	// comp/core/workloadmeta/def/types.go:GPU.Device for more detail on this
+	// field.
+	KubeGPUDevice = "gpu_device"
+
+	// KubeGPUUUID is the tag for the Kubernetes Resource GPU UUID
+	KubeGPUUUID = "gpu_uuid"
+
 	// OpenshiftDeploymentConfig is the tag for the OpenShift deployment config name
 	OpenshiftDeploymentConfig = "oshift_deployment_config"
 
diff --git a/comp/core/tagger/types/entity_id.go b/comp/core/tagger/types/entity_id.go
index c598c13fcf43d..8fb3969e007c2 100644
--- a/comp/core/tagger/types/entity_id.go
+++ b/comp/core/tagger/types/entity_id.go
@@ -71,6 +71,8 @@ const (
 	Process EntityIDPrefix = "process"
 	// InternalID is the prefix `internal`
 	InternalID EntityIDPrefix = "internal"
+	// GPU is the prefix `gpu`
+	GPU EntityIDPrefix = "gpu"
 )
 
 // AllPrefixesSet returns a set of all possible entity id prefixes that can be used in the tagger
@@ -85,6 +87,7 @@ func AllPrefixesSet() map[EntityIDPrefix]struct{} {
 		KubernetesPodUID: {},
 		Process:          {},
 		InternalID:       {},
+		GPU:              {},
 	}
 }
 
diff --git a/comp/core/tagger/types/filter_builder_test.go b/comp/core/tagger/types/filter_builder_test.go
index 72beadbf33910..4c05a49452ec1 100644
--- a/comp/core/tagger/types/filter_builder_test.go
+++ b/comp/core/tagger/types/filter_builder_test.go
@@ -59,6 +59,7 @@ func TestFilterBuilderOps(t *testing.T) {
 				KubernetesPodUID: {},
 				Process:          {},
 				InternalID:       {},
+				GPU:              {},
 			},
 			cardinality: HighCardinality,
 		},

From 172313619546f21abd48b97597dd854077638bcc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillermo=20Julia=CC=81n?=
Date: Mon, 13 Jan 2025 11:56:45 +0100
Subject: [PATCH 2/2] Add entity ID to README.md

---
 comp/core/tagger/README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/comp/core/tagger/README.md b/comp/core/tagger/README.md
index 774a2fcaeecf2..060a3e5680cf3 100644
--- a/comp/core/tagger/README.md
+++ b/comp/core/tagger/README.md
@@ -57,6 +57,7 @@ Tagger entities are identified by a string-typed ID, with one of the following f
 | workloadmeta.KindKubernetesMetadata | `kubernetes_metadata://<group>/<resourceType>/<namespace>/<name>` (`<namespace>` is empty in cluster-scoped objects) |
 | workloadmeta.KindKubernetesPod | `kubernetes_pod_uid://<uid>` |
 | workloadmeta.KindProcess | `process://<pid>` |
+| workloadmeta.KindGPU | `gpu://<uuid>` |
 
 ## Tagger
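
Taken together, the two patches make a GPU entity with UUID `gpu-1234` addressable as `gpu://gpu-1234`, carrying three low-cardinality tags. Below is a minimal standalone sketch of that mapping for review purposes; the `GPU` struct and the two helpers are hand-rolled stand-ins for the agent's real `workloadmeta` and `types` packages, not their actual API:

```go
package main

import "fmt"

// GPU mirrors the workloadmeta.GPU fields that handleGPU reads
// (a simplified stand-in for the real workloadmeta type).
type GPU struct {
	ID     string // GPU UUID, used as the tagger entity ID
	Vendor string // e.g. "nvidia"
	Device string // commercial device name, e.g. "tesla-v100"
}

// taggerEntityID mirrors what common.BuildTaggerEntityID produces for
// KindGPU: types.NewEntityID(types.GPU, id) renders as "gpu://<uuid>".
func taggerEntityID(gpu GPU) string {
	return "gpu://" + gpu.ID
}

// lowCardTags mirrors the three AddLow calls in handleGPU.
func lowCardTags(gpu GPU) []string {
	return []string{
		"gpu_vendor:" + gpu.Vendor,
		"gpu_device:" + gpu.Device,
		"gpu_uuid:" + gpu.ID,
	}
}

func main() {
	gpu := GPU{ID: "gpu-1234", Vendor: "nvidia", Device: "tesla-v100"}
	fmt.Println(taggerEntityID(gpu)) // gpu://gpu-1234
	fmt.Println(lowCardTags(gpu))    // [gpu_vendor:nvidia gpu_device:tesla-v100 gpu_uuid:gpu-1234]
}
```

Because `handleGPU` adds all three tags via `AddLow`, they surface at every tagger cardinality level, which is why `TestHandleGPU` expects them only in `LowCardTags` with the other tag sets empty.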