From b4552f0bb78fe5b2cd7a17d048ee49abc1c2d926 Mon Sep 17 00:00:00 2001
From: LiuXiang
Date: Tue, 16 Jul 2024 04:17:48 +0800
Subject: [PATCH] Make nvidia resource names configurable (#359)

* Make nvidia resource names configurable

Signed-off-by: lx1036

* added unit test

Signed-off-by: Vadym Fedorov

---------

Signed-off-by: lx1036
Signed-off-by: Vadym Fedorov
Co-authored-by: Vadym Fedorov
---
 pkg/cmd/app.go                      | 8 ++++++++
 pkg/dcgmexporter/config.go          | 1 +
 pkg/dcgmexporter/kubernetes.go      | 3 ++-
 pkg/dcgmexporter/kubernetes_test.go | 9 +++++++++
 4 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/pkg/cmd/app.go b/pkg/cmd/app.go
index 7bbd44c8..c78f8434 100644
--- a/pkg/cmd/app.go
+++ b/pkg/cmd/app.go
@@ -74,6 +74,7 @@ const (
 	CLIDCGMLogLevel              = "dcgm-log-level"
 	CLIPodResourcesKubeletSocket = "pod-resources-kubelet-socket"
 	CLIHPCJobMappingDir          = "hpc-job-mapping-dir"
+	CLINvidiaResourceNames       = "nvidia-resource-names"
 )
 
 func NewApp(buildVersion ...string) *cli.App {
@@ -237,6 +238,12 @@ func NewApp(buildVersion ...string) *cli.App {
 			Usage:   "Path to HPC job mapping file directory used for mapping GPUs to jobs.",
 			EnvVars: []string{"DCGM_HPC_JOB_MAPPING_DIR"},
 		},
+		&cli.StringSliceFlag{
+			Name:    CLINvidiaResourceNames,
+			Value:   cli.NewStringSlice(),
+			Usage:   "Nvidia resource names for specified GPU type like nvidia.com/a100, nvidia.com/a10.",
+			EnvVars: []string{"NVIDIA_RESOURCE_NAMES"},
+		},
 	}
 
 	if runtime.GOOS == "linux" {
@@ -631,5 +638,6 @@ func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) {
 		DCGMLogLevel:              dcgmLogLevel,
 		PodResourcesKubeletSocket: c.String(CLIPodResourcesKubeletSocket),
 		HPCJobMappingDir:          c.String(CLIHPCJobMappingDir),
+		NvidiaResourceNames:       c.StringSlice(CLINvidiaResourceNames),
 	}, nil
 }
diff --git a/pkg/dcgmexporter/config.go b/pkg/dcgmexporter/config.go
index a9405111..f13c91db 100644
--- a/pkg/dcgmexporter/config.go
+++ b/pkg/dcgmexporter/config.go
@@ -58,4 +58,5 @@ type Config struct {
 	DCGMLogLevel              string
 	PodResourcesKubeletSocket string
 	HPCJobMappingDir          string
+	NvidiaResourceNames       []string
 }
diff --git a/pkg/dcgmexporter/kubernetes.go b/pkg/dcgmexporter/kubernetes.go
index 1a04245b..8fb8d7d2 100644
--- a/pkg/dcgmexporter/kubernetes.go
+++ b/pkg/dcgmexporter/kubernetes.go
@@ -21,6 +21,7 @@ import (
 	"fmt"
 	"net"
 	"regexp"
+	"slices"
 	"strings"
 	"time"
 
@@ -147,7 +148,7 @@ func (p *PodMapper) toDeviceToPod(
 			for _, device := range container.GetDevices() {
 
 				resourceName := device.GetResourceName()
-				if resourceName != nvidiaResourceName {
+				if resourceName != nvidiaResourceName && !slices.Contains(p.Config.NvidiaResourceNames, resourceName) {
 					// Mig resources appear differently than GPU resources
 					if !strings.HasPrefix(resourceName, nvidiaMigResourcePrefix) {
 						continue
diff --git a/pkg/dcgmexporter/kubernetes_test.go b/pkg/dcgmexporter/kubernetes_test.go
index 7a9b2b86..3b48efe2 100644
--- a/pkg/dcgmexporter/kubernetes_test.go
+++ b/pkg/dcgmexporter/kubernetes_test.go
@@ -174,6 +174,7 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) {
 		MetricGPUDevice     string
 		MetricMigProfile    string
 		PODGPUID            string
+		NvidiaResourceNames []string
 	}
 
 	testCases := []TestCase{
@@ -232,6 +233,13 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) {
 			MetricGPUDevice:     "0",
 			GPUInstanceID:       3,
 		},
+		{
+			KubernetesGPUIDType: GPUUID,
+			ResourceName:        "nvidia.com/a100",
+			MetricGPUID:         "b8ea3855-276c-c9cb-b366-c6fa655957c5",
+			PODGPUID:            "b8ea3855-276c-c9cb-b366-c6fa655957c5",
+			NvidiaResourceNames: []string{"nvidia.com/a100"},
+		},
 	}
 
 	for _, tc := range testCases {
@@ -272,6 +280,7 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) {
 			podMapper, err := NewPodMapper(&Config{
 				KubernetesGPUIdType:       tc.KubernetesGPUIDType,
 				PodResourcesKubeletSocket: socketPath,
+				NvidiaResourceNames:       tc.NvidiaResourceNames,
 			})
 			require.NoError(t, err)
 			require.NotNil(t, podMapper)