diff --git a/go.mod b/go.mod index 581d60ab..c6cac73c 100644 --- a/go.mod +++ b/go.mod @@ -29,7 +29,7 @@ replace ( require ( github.com/NVIDIA/go-dcgm v0.0.0-20240108230649-3c233ee2a242 - github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20211102125545-5a2c58442e48 + github.com/NVIDIA/go-nvml v0.12.0-1.0.20231031105836-a160364ba1cc github.com/avast/retry-go/v4 v4.5.1 github.com/bits-and-blooms/bitset v1.12.0 github.com/gorilla/mux v1.8.1 @@ -75,7 +75,7 @@ require ( github.com/prometheus/procfs v0.11.1 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect - golang.org/x/crypto v0.16.0 // indirect + golang.org/x/crypto v0.17.0 // indirect golang.org/x/net v0.17.0 // indirect golang.org/x/oauth2 v0.13.0 // indirect golang.org/x/sync v0.5.0 // indirect diff --git a/go.sum b/go.sum index 044ff305..35afbc10 100644 --- a/go.sum +++ b/go.sum @@ -58,9 +58,8 @@ github.com/Microsoft/go-winio v0.4.14/go.mod h1:qXqCSQ3Xa7+6tgxaGTIe4Kpcdsi+P8jB github.com/Microsoft/hcsshim v0.0.0-20190417211021-672e52e9209d/go.mod h1:Op3hHsoHPAvb6lceZHDtd9OkTew38wNoXnJs8iY7rUg= github.com/NVIDIA/go-dcgm v0.0.0-20240108230649-3c233ee2a242 h1:H+Md4NKlMvN/rTNCVMFqRGXAgag0dRs2NsEEIfTRReM= github.com/NVIDIA/go-dcgm v0.0.0-20240108230649-3c233ee2a242/go.mod h1:eAZdHcOerdg1hyVoWwJ6jGQ+bxl95PfreT1S7ukI7mY= -github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20211102125545-5a2c58442e48 h1:JO/JF5CBte9mvATbhoh32swu9erf07ZdLgwFj8u21UQ= -github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20211102125545-5a2c58442e48/go.mod h1:oKPJa5eOTkWvlT4/Y4D8Nds44Fzmww5HUK+xwO+DwTA= -github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm v0.0.0-20210325210537-29b4f1784f18/go.mod h1:8qXwltEzU3idjUcVpMOv3FNgxxbDeXZPGMLyc/khWiY= +github.com/NVIDIA/go-nvml v0.12.0-1.0.20231031105836-a160364ba1cc h1:cpPqTnfDcYPZyvc55pdf+3PnHYZRolqp95HH9ORa12o= +github.com/NVIDIA/go-nvml v0.12.0-1.0.20231031105836-a160364ba1cc/go.mod h1:7ruy85eOM73muOc/I37euONSwEyFqZsv5ED9AogD4G0= github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ= github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= github.com/OpenPeeDeeP/depguard v1.0.0/go.mod h1:7/4sitnI9YlQgTLLk734QlzXT8DuHVnAyztLplQjk+o= @@ -368,7 +367,6 @@ github.com/gophercloud/gophercloud v0.1.0/go.mod h1:vxM41WHh5uqHVBMZHzuwNOHh8XEo github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= github.com/gorilla/context v1.1.1/go.mod h1:kBGZzfjB9CEq2AlWe17Uuf7NDRt0dE0s8S51q0aT7Yg= github.com/gorilla/mux v1.7.0/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs= -github.com/gorilla/mux v1.7.4/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So= github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So= github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ= @@ -728,8 +726,8 @@ golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8U golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20201002170205-7f63de1d35b0/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.16.0 h1:mMMrFzRSCF0GvB7Ne27XVtVAaXLrPmgPC7/v0tkwHaY= -golang.org/x/crypto v0.16.0/go.mod h1:gCAAfMLgwOJRpTjQ2zCCt2OcSfYMTeZVSRtQlPC7Nq4= +golang.org/x/crypto v0.17.0 h1:r8bRNjWL3GshPW3gkd+RpvzWrZAwPS49OmTGZ/uhM4k= +golang.org/x/crypto v0.17.0/go.mod h1:gCAAfMLgwOJRpTjQ2zCCt2OcSfYMTeZVSRtQlPC7Nq4= golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= diff --git a/internal/README.md b/internal/README.md new file mode 100644 index 00000000..6ac0cf67 --- /dev/null +++ b/internal/README.md @@ -0,0 +1,4 @@ + `/internal` + +Code intended for private use only, not for external import. +Note that this layout pattern is enforced by the Go compiler itself. See the Go 1.4 [`release notes`](https://golang.org/doc/go1.4#internalpackages) for more details. \ No newline at end of file diff --git a/internal/pkg/nvmlprovider/provider.go b/internal/pkg/nvmlprovider/provider.go new file mode 100644 index 00000000..778af95e --- /dev/null +++ b/internal/pkg/nvmlprovider/provider.go @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package nvmlprovider + +import ( + "errors" + "fmt" + "strconv" + "strings" + "sync" + + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/sirupsen/logrus" +) + +var nvmlOnce *sync.Once = new(sync.Once) + +type MIGDeviceInfo struct { + ParentUUID string + GPUInstanceID int + ComputeInstanceID int +} + +// GetMIGDeviceInfoByID returns information about MIG DEVICE by ID +func GetMIGDeviceInfoByID(uuid string) (*MIGDeviceInfo, error) { + var err error + + nvmlOnce.Do(func() { + ret := nvml.Init() + if ret != nvml.SUCCESS { + err = errors.New(nvml.ErrorString(ret)) + logrus.Error("Can not init NVML library") + } + }) + if err != nil { + return nil, err + } + + // 1. With drivers >= R470 (470.42.01+), each MIG device is assigned a GPU UUID starting + // with MIG-. + + device, ret := nvml.DeviceGetHandleByUUID(uuid) + if ret == nvml.SUCCESS { + parentDevice, ret := device.GetDeviceHandleFromMigDeviceHandle() + if ret != nvml.SUCCESS { + return nil, errors.New(nvml.ErrorString(ret)) + } + + parentUUID, ret := parentDevice.GetUUID() + if ret != nvml.SUCCESS { + return nil, errors.New(nvml.ErrorString(ret)) + } + + gi, ret := device.GetGpuInstanceId() + if ret != nvml.SUCCESS { + return nil, errors.New(nvml.ErrorString(ret)) + } + + ci, ret := device.GetComputeInstanceId() + if ret != nvml.SUCCESS { + return nil, errors.New(nvml.ErrorString(ret)) + } + + return &MIGDeviceInfo{ + ParentUUID: parentUUID, + GPUInstanceID: gi, + ComputeInstanceID: ci, + }, nil + } + + // 2. With drivers < R470 (e.g. R450 and R460), each MIG device is enumerated by + // specifying the CI and the corresponding parent GI. The format follows this + // convention: MIG-//. + + tokens := strings.SplitN(uuid, "-", 2) + if len(tokens) != 2 || tokens[0] != "MIG" { + return nil, fmt.Errorf("Unable to parse UUID as MIG device") + } + + tokens = strings.SplitN(tokens[1], "/", 3) + if len(tokens) != 3 || !strings.HasPrefix(tokens[0], "GPU-") { + return nil, fmt.Errorf("Unable to parse UUID as MIG device") + } + + gi, err := strconv.Atoi(tokens[1]) + if err != nil { + return nil, fmt.Errorf("Unable to parse UUID as MIG device") + } + + ci, err := strconv.Atoi(tokens[2]) + if err != nil { + return nil, fmt.Errorf("Unable to parse UUID as MIG device") + } + + return &MIGDeviceInfo{ + ParentUUID: tokens[0], + GPUInstanceID: gi, + ComputeInstanceID: ci, + }, nil +} diff --git a/internal/pkg/nvmlprovider/provider_test.go b/internal/pkg/nvmlprovider/provider_test.go new file mode 100644 index 00000000..0b63a7f4 --- /dev/null +++ b/internal/pkg/nvmlprovider/provider_test.go @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package nvmlprovider + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestGetMIGDeviceInfoByID_When_DriverVersion_Below_R470(t *testing.T) { + tests := []struct { + name string + uuid string + expectedGPU string + expectedGi int + expectedCi int + expectedError bool + }{ + { + name: "Successfull Parsing", + uuid: "MIG-GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5/1/5", + expectedGPU: "GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5", + expectedGi: 1, + expectedCi: 5, + }, + { + name: "Fail, Missing MIG at the beginning of UUID", + uuid: "GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5/1/5", + expectedError: true, + }, + { + name: "Fail, Missing GPU at the beginning of GPU UUID", + uuid: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5/1/5", + expectedError: true, + }, + { + name: "Fail, GI not parsable", + uuid: "MIG-GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5/xx/5", + expectedError: true, + }, + { + name: "Fail, CI not a parsable", + uuid: "MIG-GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5/1/xx", + expectedError: true, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + deviceInfo, err := GetMIGDeviceInfoByID(tc.uuid) + if tc.expectedError && err != nil { + return + } + if tc.expectedError && err == nil { + t.Fatalf("Expected an error, but didn't get one: uuid: %v, (gpu: %v, gi: %v, ci: %v)", + tc.uuid, + deviceInfo.ParentUUID, + deviceInfo.GPUInstanceID, + deviceInfo.ComputeInstanceID) + } + if !tc.expectedError && err != nil { + t.Fatalf("Unexpected error: %v, uuid: %v, (gpu: %v, gi: %v, ci: %v)", + err, + tc.uuid, + deviceInfo.ParentUUID, + deviceInfo.GPUInstanceID, + deviceInfo.ComputeInstanceID) + } + + assert.Equal(t, tc.expectedGPU, deviceInfo.ParentUUID, "MIG UUID parsed incorrectly: uuid: %v, (gpu: %v, gi: %v, ci: %v)", + tc.uuid, + deviceInfo.ParentUUID, + deviceInfo.GPUInstanceID, + deviceInfo.ComputeInstanceID) + assert.Equal(t, tc.expectedGi, deviceInfo.GPUInstanceID, "MIG UUID parsed incorrectly: uuid: %v, (gpu: %v, gi: %v, ci: %v)", + tc.uuid, + deviceInfo.ParentUUID, + deviceInfo.GPUInstanceID, + deviceInfo.ComputeInstanceID) + assert.Equal(t, tc.expectedCi, deviceInfo.ComputeInstanceID, "MIG UUID parsed incorrectly: uuid: %v, (gpu: %v, gi: %v, ci: %v)", + tc.uuid, + deviceInfo.ParentUUID, + deviceInfo.GPUInstanceID, + deviceInfo.ComputeInstanceID) + }) + } +} diff --git a/internal/pkg/testutils/testutils.go b/internal/pkg/testutils/testutils.go new file mode 100644 index 00000000..050ad284 --- /dev/null +++ b/internal/pkg/testutils/testutils.go @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package testutils + +import ( + "runtime" + "testing" +) + +// RequireLinux checks if +func RequireLinux(t *testing.T) { + t.Helper() + if runtime.GOOS != "linux" { + t.Skipf("Test is not supported on %q", runtime.GOOS) + } +} diff --git a/pkg/dcgmexporter/kubernetes.go b/pkg/dcgmexporter/kubernetes.go index b745d585..357fd9a4 100644 --- a/pkg/dcgmexporter/kubernetes.go +++ b/pkg/dcgmexporter/kubernetes.go @@ -25,7 +25,7 @@ import ( "strings" "time" - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/nvmlprovider" "github.com/sirupsen/logrus" "google.golang.org/grpc" podresourcesapi "k8s.io/kubelet/pkg/apis/podresources/v1alpha1" @@ -37,19 +37,14 @@ var ( connectionTimeout = 10 * time.Second - gkeMigDeviceIdRegex = regexp.MustCompile(`^nvidia([0-9]+)/gi([0-9]+)$`) - gkeVirtualGPUDeviceIdSeparator = "/vgpu" + gkeMigDeviceIDRegex = regexp.MustCompile(`^nvidia([0-9]+)/gi([0-9]+)$`) + gkeVirtualGPUDeviceIDSeparator = "/vgpu" + nvmlGetMIGDeviceInfoByIDHook = nvmlprovider.GetMIGDeviceInfoByID ) func NewPodMapper(c *Config) (*PodMapper, error) { logrus.Infof("Kubernetes metrics collection enabled!") - ret := nvml.Init() - - if ret != nil { - return nil, ret - } - return &PodMapper{ Config: c, }, nil @@ -73,29 +68,29 @@ func (p *PodMapper) Process(metrics [][]Metric, sysInfo SystemInfo) error { } defer cleanup() - pods, err := ListPods(c) + pods, err := p.listPods(c) if err != nil { return err } - deviceToPod := ToDeviceToPod(pods, sysInfo) + deviceToPod := p.toDeviceToPod(pods, sysInfo) // Note: for loop are copies the value, if we want to change the value // and not the copy, we need to use the indexes for i, device := range metrics { for j, val := range device { - deviceId, err := val.getIDOfType(p.Config.KubernetesGPUIdType) + deviceID, err := val.getIDOfType(p.Config.KubernetesGPUIdType) if err != nil { return err } if !p.Config.UseOldNamespace { - metrics[i][j].Attributes[podAttribute] = deviceToPod[deviceId].Name - metrics[i][j].Attributes[namespaceAttribute] = deviceToPod[deviceId].Namespace - metrics[i][j].Attributes[containerAttribute] = deviceToPod[deviceId].Container + metrics[i][j].Attributes[podAttribute] = deviceToPod[deviceID].Name + metrics[i][j].Attributes[namespaceAttribute] = deviceToPod[deviceID].Namespace + metrics[i][j].Attributes[containerAttribute] = deviceToPod[deviceID].Container } else { - metrics[i][j].Attributes[oldPodAttribute] = deviceToPod[deviceId].Name - metrics[i][j].Attributes[oldNamespaceAttribute] = deviceToPod[deviceId].Namespace - metrics[i][j].Attributes[oldContainerAttribute] = deviceToPod[deviceId].Container + metrics[i][j].Attributes[oldPodAttribute] = deviceToPod[deviceID].Name + metrics[i][j].Attributes[oldNamespaceAttribute] = deviceToPod[deviceID].Namespace + metrics[i][j].Attributes[oldContainerAttribute] = deviceToPod[deviceID].Container } } } @@ -120,7 +115,7 @@ func connectToServer(socket string) (*grpc.ClientConn, func(), error) { return conn, func() { conn.Close() }, nil } -func ListPods(conn *grpc.ClientConn) (*podresourcesapi.ListPodResourcesResponse, error) { +func (p *PodMapper) listPods(conn *grpc.ClientConn) (*podresourcesapi.ListPodResourcesResponse, error) { client := podresourcesapi.NewPodResourcesListerClient(conn) ctx, cancel := context.WithTimeout(context.Background(), connectionTimeout) @@ -134,7 +129,7 @@ func ListPods(conn *grpc.ClientConn) (*podresourcesapi.ListPodResourcesResponse, return resp, nil } -func ToDeviceToPod(devicePods *podresourcesapi.ListPodResourcesResponse, sysInfo SystemInfo) map[string]PodInfo { +func (p *PodMapper) toDeviceToPod(devicePods *podresourcesapi.ListPodResourcesResponse, sysInfo SystemInfo) map[string]PodInfo { deviceToPodMap := make(map[string]PodInfo) for _, pod := range devicePods.GetPodResources() { @@ -144,7 +139,7 @@ func ToDeviceToPod(devicePods *podresourcesapi.ListPodResourcesResponse, sysInfo resourceName := device.GetResourceName() if resourceName != nvidiaResourceName { // Mig resources appear differently than GPU resources - if strings.HasPrefix(resourceName, nvidiaMigResourcePrefix) == false { + if !strings.HasPrefix(resourceName, nvidiaMigResourcePrefix) { continue } } @@ -155,37 +150,36 @@ func ToDeviceToPod(devicePods *podresourcesapi.ListPodResourcesResponse, sysInfo Container: container.GetName(), } - for _, deviceid := range device.GetDeviceIds() { - if strings.HasPrefix(deviceid, MIG_UUID_PREFIX) { - gpuUuid, gi, _, err := nvml.ParseMigDeviceUUID(deviceid) + for _, deviceID := range device.GetDeviceIds() { + if strings.HasPrefix(deviceID, MIG_UUID_PREFIX) { + migDevice, err := nvmlGetMIGDeviceInfoByIDHook(deviceID) if err == nil { - giIdentifier := GetGPUInstanceIdentifier(sysInfo, gpuUuid, gi) + giIdentifier := GetGPUInstanceIdentifier(sysInfo, migDevice.ParentUUID, uint(migDevice.GPUInstanceID)) deviceToPodMap[giIdentifier] = podInfo - } else { - gpuUuid = deviceid[len(MIG_UUID_PREFIX):] } - deviceToPodMap[gpuUuid] = podInfo - } else if gkeMigDeviceIdMatches := gkeMigDeviceIdRegex.FindStringSubmatch(deviceid); gkeMigDeviceIdMatches != nil { + gpuUUID := deviceID[len(MIG_UUID_PREFIX):] + deviceToPodMap[gpuUUID] = podInfo + } else if gkeMigDeviceIDMatches := gkeMigDeviceIDRegex.FindStringSubmatch(deviceID); gkeMigDeviceIDMatches != nil { var gpuIndex string - var gpuInstanceId string - for groupIdx, group := range gkeMigDeviceIdMatches { + var gpuInstanceID string + for groupIdx, group := range gkeMigDeviceIDMatches { switch groupIdx { case 1: gpuIndex = group case 2: - gpuInstanceId = group + gpuInstanceID = group } } - giIdentifier := fmt.Sprintf("%s-%s", gpuIndex, gpuInstanceId) + giIdentifier := fmt.Sprintf("%s-%s", gpuIndex, gpuInstanceID) deviceToPodMap[giIdentifier] = podInfo - } else if strings.Contains(deviceid, gkeVirtualGPUDeviceIdSeparator) { - deviceToPodMap[strings.Split(deviceid, gkeVirtualGPUDeviceIdSeparator)[0]] = podInfo - } else if strings.Contains(deviceid, "::") { - gpuInstanceId := strings.Split(deviceid, "::")[0] - deviceToPodMap[gpuInstanceId] = podInfo - } else { - deviceToPodMap[deviceid] = podInfo + } else if strings.Contains(deviceID, gkeVirtualGPUDeviceIDSeparator) { + deviceToPodMap[strings.Split(deviceID, gkeVirtualGPUDeviceIDSeparator)[0]] = podInfo + } else if strings.Contains(deviceID, "::") { + gpuInstanceID := strings.Split(deviceID, "::")[0] + deviceToPodMap[gpuInstanceID] = podInfo } + // Default mapping between deviceID and pod information + deviceToPodMap[deviceID] = podInfo } } } diff --git a/pkg/dcgmexporter/kubernetes_test.go b/pkg/dcgmexporter/kubernetes_test.go index 6c1b6129..3a8332a4 100644 --- a/pkg/dcgmexporter/kubernetes_test.go +++ b/pkg/dcgmexporter/kubernetes_test.go @@ -19,12 +19,14 @@ package dcgmexporter import ( "context" "fmt" - "io/ioutil" "os" "testing" "time" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/nvmlprovider" + "github.com/NVIDIA/dcgm-exporter/internal/pkg/testutils" "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "google.golang.org/grpc" podresourcesapi "k8s.io/kubernetes/pkg/kubelet/apis/podresources/v1alpha1" @@ -34,6 +36,9 @@ import ( var tmpDir string func TestProcessPodMapper(t *testing.T) { + + testutils.RequireLinux(t) + cleanup := CreateTmpDir(t) defer cleanup() @@ -110,7 +115,7 @@ func StartMockServer(t *testing.T, server *grpc.Server, socket string) func() { } func CreateTmpDir(t *testing.T) func() { - path, err := ioutil.TempDir("", "dcgm-exporter") + path, err := os.MkdirTemp("", "dcgm-exporter") require.NoError(t, err) tmpDir = path @@ -139,10 +144,10 @@ func (s *PodResourcesMockServer) List(ctx context.Context, req *podresourcesapi. Name: fmt.Sprintf("gpu-pod-%d", i), Namespace: "default", Containers: []*podresourcesapi.ContainerResources{ - &podresourcesapi.ContainerResources{ + { Name: "default", Devices: []*podresourcesapi.ContainerDevices{ - &podresourcesapi.ContainerDevices{ + { ResourceName: nvidiaResourceName, DeviceIds: []string{gpu}, }, @@ -157,3 +162,143 @@ func (s *PodResourcesMockServer) List(ctx context.Context, req *podresourcesapi. }, nil } + +func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) { + testutils.RequireLinux(t) + + type TestCase struct { + KubernetesGPUIDType KubernetesGPUIDType + GPUInstanceID uint + MetricGPUID string + MetricGPUDevice string + MetricMigProfile string + PODGPUID string + } + + testCases := []TestCase{ + { + KubernetesGPUIDType: GPUUID, + MetricGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5", + PODGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5", + }, + { + KubernetesGPUIDType: GPUUID, + MetricGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5", + PODGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5", + MetricMigProfile: "", + }, + { + KubernetesGPUIDType: GPUUID, + GPUInstanceID: 3, + MetricGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5", + MetricMigProfile: "", + PODGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5", + }, + { + KubernetesGPUIDType: DeviceName, + GPUInstanceID: 3, + MetricMigProfile: "mig", + PODGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5", + }, + { + KubernetesGPUIDType: DeviceName, + MetricMigProfile: "mig", + PODGPUID: "nvidia0/gi0", + }, + { + KubernetesGPUIDType: DeviceName, + MetricGPUDevice: "0", + PODGPUID: "0/vgpu", + }, + { + KubernetesGPUIDType: GPUUID, + MetricGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5", + PODGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5::", + }, + } + + for _, tc := range testCases { + t.Run(fmt.Sprintf("when type %s, pod device id %s metric device id %s and gpu device %s", + tc.KubernetesGPUIDType, + tc.PODGPUID, + tc.MetricGPUID, + tc.MetricGPUDevice, + ), + func(t *testing.T) { + cleanup := CreateTmpDir(t) + defer cleanup() + socketPath = tmpDir + "/kubelet.sock" + server := grpc.NewServer() + + cleanup, err := dcgm.Init(dcgm.Embedded) + require.NoError(t, err) + defer cleanup() + + gpus := []string{tc.PODGPUID} + podresourcesapi.RegisterPodResourcesListerServer(server, NewPodResourcesMockServer(gpus)) + + cleanup = StartMockServer(t, server, socketPath) + defer cleanup() + + nvmlGetMIGDeviceInfoByIDHook = func(uuid string) (*nvmlprovider.MIGDeviceInfo, error) { + return &nvmlprovider.MIGDeviceInfo{ + ParentUUID: "00000000-0000-0000-0000-000000000000", + GPUInstanceID: 3, + ComputeInstanceID: 0, + }, nil + } + + defer func() { + nvmlGetMIGDeviceInfoByIDHook = nvmlprovider.GetMIGDeviceInfoByID + }() + + podMapper, err := NewPodMapper(&Config{KubernetesGPUIdType: tc.KubernetesGPUIDType}) + require.NoError(t, err) + require.NotNil(t, podMapper) + metrics := [][]Metric{ + { + { + GPU: "0", + GPUUUID: tc.MetricGPUID, + GPUDevice: tc.MetricGPUDevice, + GPUInstanceID: fmt.Sprint(tc.GPUInstanceID), + Value: "42", + MigProfile: tc.MetricMigProfile, + Counter: &Counter{ + FieldID: 155, + FieldName: "DCGM_FI_DEV_POWER_USAGE", + PromType: "gauge", + }, + Attributes: map[string]string{}, + }, + }, + } + sysInfo := SystemInfo{ + GPUCount: 1, + GPUs: [32]GPUInfo{ + { + DeviceInfo: dcgm.Device{ + UUID: "00000000-0000-0000-0000-000000000000", + GPU: 0, + }, + MigEnabled: true, + }, + }, + } + err = podMapper.Process(metrics, sysInfo) + require.NoError(t, err) + assert.Len(t, metrics, 1) + for _, metric := range metrics[0] { + require.Contains(t, metric.Attributes, podAttribute) + require.Contains(t, metric.Attributes, namespaceAttribute) + require.Contains(t, metric.Attributes, containerAttribute) + + // TODO currently we rely on ordering and implicit expectations of the mock implementation + // This should be a table comparison + require.Equal(t, fmt.Sprintf("gpu-pod-%d", 0), metric.Attributes[podAttribute]) + require.Equal(t, "default", metric.Attributes[namespaceAttribute]) + require.Equal(t, "default", metric.Attributes[containerAttribute]) + } + }) + } +}