Skip to content

Commit

Permalink
Merge pull request #567 from elezar/fix-namespace
Browse files Browse the repository at this point in the history
Use clientsets and downward API to construct NFD client
  • Loading branch information
elezar authored Mar 6, 2024
2 parents aa4b72b + cfcdcce commit c829653
Show file tree
Hide file tree
Showing 11 changed files with 511 additions and 272 deletions.
19 changes: 11 additions & 8 deletions api/config/v1/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,15 @@ type Flags struct {

// CommandLineFlags holds the list of command line flags used to configure the device plugin and GFD.
type CommandLineFlags struct {
MigStrategy *string `json:"migStrategy" yaml:"migStrategy"`
FailOnInitError *bool `json:"failOnInitError" yaml:"failOnInitError"`
MpsRoot *string `json:"mpsRoot,omitempty" yaml:"mpsRoot,omitempty"`
NvidiaDriverRoot *string `json:"nvidiaDriverRoot,omitempty" yaml:"nvidiaDriverRoot,omitempty"`
GDSEnabled *bool `json:"gdsEnabled" yaml:"gdsEnabled"`
MOFEDEnabled *bool `json:"mofedEnabled" yaml:"mofedEnabled"`
Plugin *PluginCommandLineFlags `json:"plugin,omitempty" yaml:"plugin,omitempty"`
GFD *GFDCommandLineFlags `json:"gfd,omitempty" yaml:"gfd,omitempty"`
MigStrategy *string `json:"migStrategy" yaml:"migStrategy"`
FailOnInitError *bool `json:"failOnInitError" yaml:"failOnInitError"`
MpsRoot *string `json:"mpsRoot,omitempty" yaml:"mpsRoot,omitempty"`
NvidiaDriverRoot *string `json:"nvidiaDriverRoot,omitempty" yaml:"nvidiaDriverRoot,omitempty"`
GDSEnabled *bool `json:"gdsEnabled" yaml:"gdsEnabled"`
MOFEDEnabled *bool `json:"mofedEnabled" yaml:"mofedEnabled"`
UseNodeFeatureAPI *bool `json:"useNodeFeatureAPI" yaml:"useNodeFeatureAPI"`
Plugin *PluginCommandLineFlags `json:"plugin,omitempty" yaml:"plugin,omitempty"`
GFD *GFDCommandLineFlags `json:"gfd,omitempty" yaml:"gfd,omitempty"`
}

// PluginCommandLineFlags holds the list of command line flags specific to the device plugin.
Expand Down Expand Up @@ -125,6 +126,8 @@ func (f *Flags) UpdateFromCLIFlags(c *cli.Context, flags []cli.Flag) {
updateFromCLIFlag(&f.GDSEnabled, c, n)
case "mofed-enabled":
updateFromCLIFlag(&f.MOFEDEnabled, c, n)
case "use-node-feature-api":
updateFromCLIFlag(&f.UseNodeFeatureAPI, c, n)
}
// Plugin specific flags
if f.Plugin == nil {
Expand Down
5 changes: 4 additions & 1 deletion api/config/v1/flags_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,8 @@ func TestMarshalFlags(t *testing.T) {
"migStrategy": null,
"failOnInitError": null,
"gdsEnabled": null,
"mofedEnabled": null
"mofedEnabled": null,
"useNodeFeatureAPI": null
}`,
},
{
Expand All @@ -177,6 +178,7 @@ func TestMarshalFlags(t *testing.T) {
"failOnInitError": null,
"gdsEnabled": null,
"mofedEnabled": null,
"useNodeFeatureAPI": null,
"gfd": {
"oneshot": null,
"noTimestamp": null,
Expand All @@ -199,6 +201,7 @@ func TestMarshalFlags(t *testing.T) {
"failOnInitError": null,
"gdsEnabled": null,
"mofedEnabled": null,
"useNodeFeatureAPI": null,
"gfd": {
"oneshot": null,
"noTimestamp": null,
Expand Down
100 changes: 72 additions & 28 deletions cmd/gpu-feature-discovery/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"k8s.io/klog/v2"

spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1"
"github.com/NVIDIA/k8s-device-plugin/internal/flags"
"github.com/NVIDIA/k8s-device-plugin/internal/info"
"github.com/NVIDIA/k8s-device-plugin/internal/lm"
"github.com/NVIDIA/k8s-device-plugin/internal/logger"
Expand All @@ -22,20 +23,29 @@ import (
"github.com/NVIDIA/k8s-device-plugin/internal/watch"
)

var nodeFeatureAPI bool
// Config represents a collection of config options for GFD.
type Config struct {
configFile string

kubeClientConfig flags.KubeClientConfig
nodeConfig flags.NodeConfig

// flags stores the CLI flags for later processing.
flags []cli.Flag
}

func main() {
var configFile string
config := &Config{}

c := cli.NewApp()
c.Name = "GPU Feature Discovery"
c.Usage = "generate labels for NVIDIA devices"
c.Version = info.GetVersionString()
c.Action = func(ctx *cli.Context) error {
return start(ctx, c.Flags)
return start(ctx, config)
}

c.Flags = []cli.Flag{
config.flags = []cli.Flag{
&cli.StringFlag{
Name: "mig-strategy",
Value: spec.MigStrategyNone,
Expand Down Expand Up @@ -81,18 +91,21 @@ func main() {
&cli.StringFlag{
Name: "config-file",
Usage: "the path to a config file as an alternative to command line options or environment variables",
Destination: &configFile,
Destination: &config.configFile,
EnvVars: []string{"GFD_CONFIG_FILE", "CONFIG_FILE"},
},
&cli.BoolFlag{
Name: "use-node-feature-api",
Value: false,
Destination: &nodeFeatureAPI,
Usage: "Use NFD NodeFeature API to publish labels",
EnvVars: []string{"GFD_USE_NODE_FEATURE_API"},
Name: "use-node-feature-api",
Usage: "Use NFD NodeFeature API to publish labels",
EnvVars: []string{"GFD_USE_NODE_FEATURE_API", "USE_NODE_FEATURE_API"},
},
}

config.flags = append(config.flags, config.kubeClientConfig.Flags()...)
config.flags = append(config.flags, config.nodeConfig.Flags()...)

c.Flags = config.flags

if err := c.Run(os.Args); err != nil {
klog.Error(err)
os.Exit(1)
Expand All @@ -103,8 +116,9 @@ func validateFlags(config *spec.Config) error {
return nil
}

func loadConfig(c *cli.Context, flags []cli.Flag) (*spec.Config, error) {
config, err := spec.NewConfig(c, flags)
// loadConfig loads the config from the spec file.
func (cfg *Config) loadConfig(c *cli.Context) (*spec.Config, error) {
config, err := spec.NewConfig(c, cfg.flags)
if err != nil {
return nil, fmt.Errorf("unable to finalize config: %v", err)
}
Expand All @@ -113,10 +127,11 @@ func loadConfig(c *cli.Context, flags []cli.Flag) (*spec.Config, error) {
return nil, fmt.Errorf("unable to validate flags: %v", err)
}
config.Flags.Plugin = nil

return config, nil
}

func start(c *cli.Context, flags []cli.Flag) error {
func start(c *cli.Context, cfg *Config) error {
defer func() {
klog.Info("Exiting")
}()
Expand All @@ -127,7 +142,7 @@ func start(c *cli.Context, flags []cli.Flag) error {
for {
// Load the configuration file
klog.Info("Loading configuration.")
config, err := loadConfig(c, flags)
config, err := cfg.loadConfig(c)
if err != nil {
return fmt.Errorf("unable to load config: %v", err)
}
Expand All @@ -143,8 +158,23 @@ func start(c *cli.Context, flags []cli.Flag) error {
manager := resource.NewManager(config)
vgpul := vgpu.NewVGPULib(vgpu.NewNvidiaPCILib())

clientSets, err := cfg.kubeClientConfig.NewClientSets()
if err != nil {
return fmt.Errorf("failed to create clientsets: %w", err)
}
klog.Info("Start running")
restart, err := run(manager, vgpul, config, sigs)
d := &gfd{
manager: manager,
vgpu: vgpul,
config: config,

labelOutputer: lm.NewOutputer(
config,
cfg.nodeConfig,
clientSets,
),
}
restart, err := d.run(sigs)
if err != nil {
return err
}
Expand All @@ -155,19 +185,34 @@ func start(c *cli.Context, flags []cli.Flag) error {
}
}

func run(manager resource.Manager, vgpu vgpu.Interface, config *spec.Config, sigs chan os.Signal) (bool, error) {
type gfd struct {
manager resource.Manager
vgpu vgpu.Interface
config *spec.Config

labelOutputer lm.Outputer
}

func (d *gfd) run(sigs chan os.Signal) (bool, error) {
defer func() {
if !nodeFeatureAPI && !*config.Flags.GFD.Oneshot && *config.Flags.GFD.OutputFile != "" {
err := removeOutputFile(*config.Flags.GFD.OutputFile)
if err != nil {
klog.Warningf("Error removing output file: %v", err)
}
if d.config.Flags.UseNodeFeatureAPI != nil && *d.config.Flags.UseNodeFeatureAPI {
return
}
if d.config.Flags.GFD.Oneshot != nil && *d.config.Flags.GFD.Oneshot {
return
}
if d.config.Flags.GFD.OutputFile != nil && *d.config.Flags.GFD.OutputFile == "" {
return
}
err := removeOutputFile(*d.config.Flags.GFD.OutputFile)
if err != nil {
klog.Warningf("Error removing output file: %v", err)
}
}()

timestampLabeler := lm.NewTimestampLabeler(config)
timestampLabeler := lm.NewTimestampLabeler(d.config)
rerun:
loopLabelers, err := lm.NewLabelers(manager, vgpu, config)
loopLabelers, err := lm.NewLabelers(d.manager, d.vgpu, d.config)
if err != nil {
return false, err
}
Expand All @@ -187,17 +232,16 @@ rerun:
}

klog.Info("Creating Labels")
err = labels.Output(*config.Flags.GFD.OutputFile, nodeFeatureAPI)
if err != nil {
if err := d.labelOutputer.Output(labels); err != nil {
return false, err
}

if *config.Flags.GFD.Oneshot {
if *d.config.Flags.GFD.Oneshot {
return false, nil
}

klog.Info("Sleeping for ", *config.Flags.GFD.SleepInterval)
rerunTimeout := time.After(time.Duration(*config.Flags.GFD.SleepInterval))
klog.Info("Sleeping for ", *d.config.Flags.GFD.SleepInterval)
rerunTimeout := time.After(time.Duration(*d.config.Flags.GFD.SleepInterval))

for {
select {
Expand Down
34 changes: 30 additions & 4 deletions cmd/gpu-feature-discovery/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ import (
"github.com/stretchr/testify/require"

spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1"
"github.com/NVIDIA/k8s-device-plugin/internal/flags"
"github.com/NVIDIA/k8s-device-plugin/internal/lm"
"github.com/NVIDIA/k8s-device-plugin/internal/resource"
rt "github.com/NVIDIA/k8s-device-plugin/internal/resource/testing"
"github.com/NVIDIA/k8s-device-plugin/internal/vgpu"
Expand Down Expand Up @@ -112,7 +114,13 @@ func TestRunOneshot(t *testing.T) {
setupMachineFile(t)
defer removeMachineFile(t)

restart, err := run(nvmlMock, vgpuMock, conf, nil)
d := gfd{
manager: nvmlMock,
vgpu: vgpuMock,
config: conf,
labelOutputer: lm.NewOutputer(conf, flags.NodeConfig{}, flags.ClientSets{}),
}
restart, err := d.run(nil)
require.NoError(t, err, "Error from run function")
require.False(t, restart)

Expand Down Expand Up @@ -158,7 +166,13 @@ func TestRunWithNoTimestamp(t *testing.T) {
setupMachineFile(t)
defer removeMachineFile(t)

restart, err := run(nvmlMock, vgpuMock, conf, nil)
d := gfd{
manager: nvmlMock,
vgpu: vgpuMock,
config: conf,
labelOutputer: lm.NewOutputer(conf, flags.NodeConfig{}, flags.ClientSets{}),
}
restart, err := d.run(nil)
require.NoError(t, err, "Error from run function")
require.False(t, restart)

Expand Down Expand Up @@ -216,7 +230,13 @@ func TestRunSleep(t *testing.T) {
var runRestart bool
var runError error
go func() {
runRestart, runError = run(nvmlMock, vgpuMock, conf, sigs)
d := gfd{
manager: nvmlMock,
vgpu: vgpuMock,
config: conf,
labelOutputer: lm.NewOutputer(conf, flags.NodeConfig{}, flags.ClientSets{}),
}
runRestart, runError = d.run(sigs)
}()

outFileModificationTime := make([]int64, 2)
Expand Down Expand Up @@ -370,7 +390,13 @@ func TestFailOnNVMLInitError(t *testing.T) {

nvmlMock := rt.NewManagerMockWithDevices(rt.NewFullGPU()).WithErrorOnInit(tc.errorOnInit)

restart, err := run(resource.WithConfig(nvmlMock, conf), vgpuMock, conf, nil)
d := gfd{
manager: resource.WithConfig(nvmlMock, conf),
vgpu: vgpuMock,
config: conf,
labelOutputer: lm.NewOutputer(conf, flags.NodeConfig{}, flags.ClientSets{}),
}
restart, err := d.run(nil)
if tc.expectError {
require.Error(t, err)
} else {
Expand Down
Loading

0 comments on commit c829653

Please sign in to comment.