From 2f8035457601f19eda657bb8c4598f48fcabdeab Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Fri, 8 Nov 2024 12:38:43 -0800 Subject: [PATCH] Abstract prometheus metrics into interfaces (#1801) --- go.mod | 2 +- go.sum | 4 +- pkg/cloudprovider/metrics/cloudprovider.go | 39 +++--- pkg/controllers/disruption/controller.go | 12 +- pkg/controllers/disruption/helpers.go | 4 +- pkg/controllers/disruption/metrics.go | 26 ++-- .../disruption/multinodeconsolidation.go | 2 +- .../disruption/orchestration/metrics.go | 8 +- .../disruption/orchestration/queue.go | 9 +- .../disruption/singlenodeconsolidation.go | 2 +- pkg/controllers/metrics/node/controller.go | 72 +++++----- .../metrics/nodepool/controller.go | 25 ++-- pkg/controllers/metrics/pod/controller.go | 54 ++++---- .../node/termination/controller.go | 17 ++- pkg/controllers/node/termination/metrics.go | 17 +-- .../node/termination/suite_test.go | 2 +- .../node/termination/terminator/eviction.go | 4 +- .../node/termination/terminator/metrics.go | 8 +- .../nodeclaim/expiration/controller.go | 5 +- .../nodeclaim/garbagecollection/controller.go | 5 +- .../nodeclaim/lifecycle/controller.go | 13 +- pkg/controllers/nodeclaim/lifecycle/launch.go | 9 +- .../nodeclaim/lifecycle/liveness.go | 5 +- .../nodeclaim/lifecycle/metrics.go | 11 +- .../nodeclaim/lifecycle/registration.go | 5 +- pkg/controllers/provisioning/provisioner.go | 15 +- .../provisioning/scheduling/metrics.go | 18 +-- .../provisioning/scheduling/scheduler.go | 13 +- .../provisioning/scheduling/suite_test.go | 2 +- pkg/controllers/provisioning/suite_test.go | 2 +- pkg/controllers/state/cluster.go | 14 +- pkg/controllers/state/metrics.go | 16 ++- pkg/metrics/constants.go | 6 +- pkg/metrics/metrics.go | 21 +-- pkg/metrics/store.go | 9 +- pkg/metrics/suite_test.go | 128 +++++++++--------- pkg/operator/operator.go | 11 +- pkg/test/expectations/expectations.go | 9 +- 38 files changed, 302 insertions(+), 322 deletions(-) diff --git a/go.mod b/go.mod index 30f208fa28..9128dc8c67 100644 --- a/go.mod +++ b/go.mod @@ -5,7 +5,7 @@ go 1.23.2 require ( github.com/Pallinder/go-randomdata v1.2.0 github.com/avast/retry-go v3.0.0+incompatible - github.com/awslabs/operatorpkg v0.0.0-20240920182301-771460b3160b + github.com/awslabs/operatorpkg v0.0.0-20241108183842-a2ebef231d52 github.com/docker/docker v27.3.1+incompatible github.com/go-logr/logr v1.4.2 github.com/imdario/mergo v0.3.16 diff --git a/go.sum b/go.sum index 3b1f1f669b..4b4b1f060c 100644 --- a/go.sum +++ b/go.sum @@ -2,8 +2,8 @@ github.com/Pallinder/go-randomdata v1.2.0 h1:DZ41wBchNRb/0GfsePLiSwb0PHZmT67XY00 github.com/Pallinder/go-randomdata v1.2.0/go.mod h1:yHmJgulpD2Nfrm0cR9tI/+oAgRqCQQixsA8HyRZfV9Y= github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHSxpiH9JdtuBj0= github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY= -github.com/awslabs/operatorpkg v0.0.0-20240920182301-771460b3160b h1:aG1+YRmKIf5nLTZJNhw1NmuxvjUprWYyluqJ2jmVqiU= -github.com/awslabs/operatorpkg v0.0.0-20240920182301-771460b3160b/go.mod h1:RI+iNDn57c3WX0tsZg4rvkmM58lWsEC5cc6E4vJJld8= +github.com/awslabs/operatorpkg v0.0.0-20241108183842-a2ebef231d52 h1:k8f1ukVs49+nC6JbN8r8bxs8g1TPE3Iki/dK/LGwf3A= +github.com/awslabs/operatorpkg v0.0.0-20241108183842-a2ebef231d52/go.mod h1:nq1PLBLCojzjfqSK8SG3ymxqwW6e/cHLJvddKOSFkfw= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= 
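The go.mod/go.sum bump above pulls in the operatorpkg metrics interfaces that the rest of this patch migrates to: instead of constructing a raw *prometheus.CounterVec/GaugeVec and registering it in an init func, each metric is now built through a constructor that takes the registry up front, and its methods take the label map directly. A minimal sketch of the two styles side by side; the counter names here are illustrative and not part of this patch:

package example

import (
	opmetrics "github.com/awslabs/operatorpkg/metrics"
	"github.com/prometheus/client_golang/prometheus"
	crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
)

// Old style: a *prometheus.CounterVec registered by hand in an init func.
var exampleOpsTotalOld = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Namespace: "karpenter",
		Name:      "example_ops_total_old",
		Help:      "Illustrative only.",
	},
	[]string{"method"},
)

func init() {
	crmetrics.Registry.MustRegister(exampleOpsTotalOld)
}

// New style: the constructor is handed the registry and the returned metric's
// methods accept the label map directly.
var exampleOpsTotal = opmetrics.NewPrometheusCounter(
	crmetrics.Registry,
	prometheus.CounterOpts{
		Namespace: "karpenter",
		Name:      "example_ops_total",
		Help:      "Illustrative only.",
	},
	[]string{"method"},
)

func record() {
	exampleOpsTotalOld.With(prometheus.Labels{"method": "Create"}).Inc()
	exampleOpsTotal.Inc(map[string]string{"method": "Create"})
}

Registering at construction time is what lets every init()-time MustRegister block be deleted throughout the rest of this patch.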
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= diff --git a/pkg/cloudprovider/metrics/cloudprovider.go b/pkg/cloudprovider/metrics/cloudprovider.go index c2d53b25c0..79ae60b028 100644 --- a/pkg/cloudprovider/metrics/cloudprovider.go +++ b/pkg/cloudprovider/metrics/cloudprovider.go @@ -19,6 +19,7 @@ package metrics import ( "context" + opmetrics "github.com/awslabs/operatorpkg/metrics" "github.com/prometheus/client_golang/prometheus" crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" @@ -44,7 +45,8 @@ const ( // decorator implements CloudProvider var _ cloudprovider.CloudProvider = (*decorator)(nil) -var methodDuration = prometheus.NewHistogramVec( +var MethodDuration = opmetrics.NewPrometheusHistogram( + crmetrics.Registry, prometheus.HistogramOpts{ Namespace: metrics.Namespace, Subsystem: "cloudprovider", @@ -59,7 +61,8 @@ var methodDuration = prometheus.NewHistogramVec( ) var ( - errorsTotal = prometheus.NewCounterVec( + ErrorsTotal = opmetrics.NewPrometheusCounter( + crmetrics.Registry, prometheus.CounterOpts{ Namespace: metrics.Namespace, Subsystem: "cloudprovider", @@ -75,10 +78,6 @@ var ( ) ) -func init() { - crmetrics.Registry.MustRegister(methodDuration, errorsTotal) -} - type decorator struct { cloudprovider.CloudProvider } @@ -96,60 +95,60 @@ func Decorate(cloudProvider cloudprovider.CloudProvider) cloudprovider.CloudProv func (d *decorator) Create(ctx context.Context, nodeClaim *v1.NodeClaim) (*v1.NodeClaim, error) { method := "Create" - defer metrics.Measure(methodDuration.With(getLabelsMapForDuration(ctx, d, method)))() + defer metrics.Measure(MethodDuration, getLabelsMapForDuration(ctx, d, method))() nodeClaim, err := d.CloudProvider.Create(ctx, nodeClaim) if err != nil { - errorsTotal.With(getLabelsMapForError(ctx, d, method, err)).Inc() + ErrorsTotal.Inc(getLabelsMapForError(ctx, d, method, err)) } return nodeClaim, err } func (d *decorator) Delete(ctx context.Context, nodeClaim *v1.NodeClaim) error { method := "Delete" - defer metrics.Measure(methodDuration.With(getLabelsMapForDuration(ctx, d, method)))() + defer metrics.Measure(MethodDuration, getLabelsMapForDuration(ctx, d, method))() err := d.CloudProvider.Delete(ctx, nodeClaim) if err != nil { - errorsTotal.With(getLabelsMapForError(ctx, d, method, err)).Inc() + ErrorsTotal.Inc(getLabelsMapForError(ctx, d, method, err)) } return err } func (d *decorator) Get(ctx context.Context, id string) (*v1.NodeClaim, error) { method := "Get" - defer metrics.Measure(methodDuration.With(getLabelsMapForDuration(ctx, d, method)))() + defer metrics.Measure(MethodDuration, getLabelsMapForDuration(ctx, d, method))() nodeClaim, err := d.CloudProvider.Get(ctx, id) if err != nil { - errorsTotal.With(getLabelsMapForError(ctx, d, method, err)).Inc() + ErrorsTotal.Inc(getLabelsMapForError(ctx, d, method, err)) } return nodeClaim, err } func (d *decorator) List(ctx context.Context) ([]*v1.NodeClaim, error) { method := "List" - defer metrics.Measure(methodDuration.With(getLabelsMapForDuration(ctx, d, method)))() + defer metrics.Measure(MethodDuration, getLabelsMapForDuration(ctx, d, method))() nodeClaims, err := d.CloudProvider.List(ctx) if err != nil { - errorsTotal.With(getLabelsMapForError(ctx, d, method, err)).Inc() + ErrorsTotal.Inc(getLabelsMapForError(ctx, d, method, err)) } return nodeClaims, err } func (d *decorator) GetInstanceTypes(ctx context.Context, nodePool *v1.NodePool) ([]*cloudprovider.InstanceType, error) { method := "GetInstanceTypes" - defer 
metrics.Measure(methodDuration.With(getLabelsMapForDuration(ctx, d, method)))() + defer metrics.Measure(MethodDuration, getLabelsMapForDuration(ctx, d, method))() instanceType, err := d.CloudProvider.GetInstanceTypes(ctx, nodePool) if err != nil { - errorsTotal.With(getLabelsMapForError(ctx, d, method, err)).Inc() + ErrorsTotal.Inc(getLabelsMapForError(ctx, d, method, err)) } return instanceType, err } func (d *decorator) IsDrifted(ctx context.Context, nodeClaim *v1.NodeClaim) (cloudprovider.DriftReason, error) { method := "IsDrifted" - defer metrics.Measure(methodDuration.With(getLabelsMapForDuration(ctx, d, method)))() + defer metrics.Measure(MethodDuration, getLabelsMapForDuration(ctx, d, method))() isDrifted, err := d.CloudProvider.IsDrifted(ctx, nodeClaim) if err != nil { - errorsTotal.With(getLabelsMapForError(ctx, d, method, err)).Inc() + ErrorsTotal.Inc(getLabelsMapForError(ctx, d, method, err)) } return isDrifted, err } @@ -157,7 +156,7 @@ func (d *decorator) IsDrifted(ctx context.Context, nodeClaim *v1.NodeClaim) (clo // getLabelsMapForDuration is a convenience func that constructs a map[string]string // for a prometheus Label map used to compose a duration metric spec func getLabelsMapForDuration(ctx context.Context, d *decorator, method string) map[string]string { - return prometheus.Labels{ + return map[string]string{ metricLabelController: injection.GetControllerName(ctx), metricLabelMethod: method, metricLabelProvider: d.Name(), @@ -167,7 +166,7 @@ func getLabelsMapForDuration(ctx context.Context, d *decorator, method string) m // getLabelsMapForError is a convenience func that constructs a map[string]string // for a prometheus Label map used to compose a counter metric spec func getLabelsMapForError(ctx context.Context, d *decorator, method string, err error) map[string]string { - return prometheus.Labels{ + return map[string]string{ metricLabelController: injection.GetControllerName(ctx), metricLabelMethod: method, metricLabelProvider: d.Name(), diff --git a/pkg/controllers/disruption/controller.go b/pkg/controllers/disruption/controller.go index b809ccad73..aa77c63cad 100644 --- a/pkg/controllers/disruption/controller.go +++ b/pkg/controllers/disruption/controller.go @@ -160,17 +160,17 @@ func (c *Controller) Reconcile(ctx context.Context) (reconcile.Result, error) { } func (c *Controller) disrupt(ctx context.Context, disruption Method) (bool, error) { - defer metrics.Measure(EvaluationDurationSeconds.With(map[string]string{ + defer metrics.Measure(EvaluationDurationSeconds, map[string]string{ metrics.ReasonLabel: strings.ToLower(string(disruption.Reason())), consolidationTypeLabel: disruption.ConsolidationType(), - }))() + })() candidates, err := GetCandidates(ctx, c.cluster, c.kubeClient, c.recorder, c.clock, c.cloudProvider, disruption.ShouldDisrupt, disruption.Class(), c.queue) if err != nil { return false, fmt.Errorf("determining candidates, %w", err) } - EligibleNodes.With(map[string]string{ + EligibleNodes.Set(float64(len(candidates)), map[string]string{ metrics.ReasonLabel: strings.ToLower(string(disruption.Reason())), - }).Set(float64(len(candidates))) + }) // If there are no candidates, move to the next disruption if len(candidates) == 0 { @@ -244,11 +244,11 @@ func (c *Controller) executeCommand(ctx context.Context, m Method, cmd Command, } // An action is only performed and pods/nodes are only disrupted after a successful add to the queue - DecisionsPerformedTotal.With(map[string]string{ + DecisionsPerformedTotal.Inc(map[string]string{ decisionLabel: 
string(cmd.Decision()), metrics.ReasonLabel: strings.ToLower(string(m.Reason())), consolidationTypeLabel: m.ConsolidationType(), - }).Inc() + }) return nil } diff --git a/pkg/controllers/disruption/helpers.go b/pkg/controllers/disruption/helpers.go index 84836639f0..d6fe1243ef 100644 --- a/pkg/controllers/disruption/helpers.go +++ b/pkg/controllers/disruption/helpers.go @@ -234,9 +234,9 @@ func BuildDisruptionBudgetMapping(ctx context.Context, cluster *state.Cluster, c allowedDisruptions := nodePool.MustGetAllowedDisruptions(clk, numNodes[nodePool.Name], reason) disruptionBudgetMapping[nodePool.Name] = lo.Max([]int{allowedDisruptions - disrupting[nodePool.Name], 0}) - NodePoolAllowedDisruptions.With(map[string]string{ + NodePoolAllowedDisruptions.Set(float64(allowedDisruptions), map[string]string{ metrics.NodePoolLabel: nodePool.Name, metrics.ReasonLabel: string(reason), - }).Set(float64(allowedDisruptions)) + }) if allowedDisruptions == 0 { recorder.Publish(disruptionevents.NodePoolBlockedForDisruptionReason(lo.ToPtr(nodePool), reason)) } diff --git a/pkg/controllers/disruption/metrics.go b/pkg/controllers/disruption/metrics.go index 98ee231c0c..a288f05d66 100644 --- a/pkg/controllers/disruption/metrics.go +++ b/pkg/controllers/disruption/metrics.go @@ -17,22 +17,13 @@ limitations under the License. package disruption import ( + opmetrics "github.com/awslabs/operatorpkg/metrics" "github.com/prometheus/client_golang/prometheus" crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" "sigs.k8s.io/karpenter/pkg/metrics" ) -func init() { - crmetrics.Registry.MustRegister( - EvaluationDurationSeconds, - DecisionsPerformedTotal, - EligibleNodes, - ConsolidationTimeoutsTotal, - NodePoolAllowedDisruptions, - ) -} - const ( voluntaryDisruptionSubsystem = "voluntary_disruption" decisionLabel = "decision" @@ -40,7 +31,8 @@ const ( ) var ( - EvaluationDurationSeconds = prometheus.NewHistogramVec( + EvaluationDurationSeconds = opmetrics.NewPrometheusHistogram( + crmetrics.Registry, prometheus.HistogramOpts{ Namespace: metrics.Namespace, Subsystem: voluntaryDisruptionSubsystem, @@ -50,7 +42,8 @@ var ( }, []string{metrics.ReasonLabel, consolidationTypeLabel}, ) - DecisionsPerformedTotal = prometheus.NewCounterVec( + DecisionsPerformedTotal = opmetrics.NewPrometheusCounter( + crmetrics.Registry, prometheus.CounterOpts{ Namespace: metrics.Namespace, Subsystem: voluntaryDisruptionSubsystem, @@ -59,7 +52,8 @@ var ( }, []string{decisionLabel, metrics.ReasonLabel, consolidationTypeLabel}, ) - EligibleNodes = prometheus.NewGaugeVec( + EligibleNodes = opmetrics.NewPrometheusGauge( + crmetrics.Registry, prometheus.GaugeOpts{ Namespace: metrics.Namespace, Subsystem: voluntaryDisruptionSubsystem, @@ -68,7 +62,8 @@ var ( }, []string{metrics.ReasonLabel}, ) - ConsolidationTimeoutsTotal = prometheus.NewCounterVec( + ConsolidationTimeoutsTotal = opmetrics.NewPrometheusCounter( + crmetrics.Registry, prometheus.CounterOpts{ Namespace: metrics.Namespace, Subsystem: voluntaryDisruptionSubsystem, @@ -77,7 +72,8 @@ var ( }, []string{consolidationTypeLabel}, ) - NodePoolAllowedDisruptions = prometheus.NewGaugeVec( + NodePoolAllowedDisruptions = opmetrics.NewPrometheusGauge( + crmetrics.Registry, prometheus.GaugeOpts{ Namespace: metrics.Namespace, Subsystem: metrics.NodePoolSubsystem, diff --git a/pkg/controllers/disruption/multinodeconsolidation.go b/pkg/controllers/disruption/multinodeconsolidation.go index 2e2039c1a0..e9b74fa731 100644 --- a/pkg/controllers/disruption/multinodeconsolidation.go +++ 
b/pkg/controllers/disruption/multinodeconsolidation.go @@ -123,7 +123,7 @@ func (m *MultiNodeConsolidation) firstNConsolidationOption(ctx context.Context, // binary search to find the maximum number of NodeClaims we can terminate for min <= max { if m.clock.Now().After(timeout) { - ConsolidationTimeoutsTotal.WithLabelValues(m.ConsolidationType()).Inc() + ConsolidationTimeoutsTotal.Inc(map[string]string{consolidationTypeLabel: m.ConsolidationType()}) if lastSavedCommand.candidates == nil { log.FromContext(ctx).V(1).Info(fmt.Sprintf("failed to find a multi-node consolidation after timeout, last considered batch had %d", (min+max)/2)) } else { diff --git a/pkg/controllers/disruption/orchestration/metrics.go b/pkg/controllers/disruption/orchestration/metrics.go index a3aeed478e..fd82969e1e 100644 --- a/pkg/controllers/disruption/orchestration/metrics.go +++ b/pkg/controllers/disruption/orchestration/metrics.go @@ -17,16 +17,13 @@ limitations under the License. package orchestration import ( + opmetrics "github.com/awslabs/operatorpkg/metrics" "github.com/prometheus/client_golang/prometheus" crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" "sigs.k8s.io/karpenter/pkg/metrics" ) -func init() { - crmetrics.Registry.MustRegister(disruptionQueueFailuresTotal) -} - const ( voluntaryDisruptionSubsystem = "voluntary_disruption" consolidationTypeLabel = "consolidation_type" @@ -34,7 +31,8 @@ const ( ) var ( - disruptionQueueFailuresTotal = prometheus.NewCounterVec( + DisruptionQueueFailuresTotal = opmetrics.NewPrometheusCounter( + crmetrics.Registry, prometheus.CounterOpts{ Namespace: metrics.Namespace, Subsystem: voluntaryDisruptionSubsystem, diff --git a/pkg/controllers/disruption/orchestration/queue.go b/pkg/controllers/disruption/orchestration/queue.go index e9a24ff4b7..1cf800c143 100644 --- a/pkg/controllers/disruption/orchestration/queue.go +++ b/pkg/controllers/disruption/orchestration/queue.go @@ -25,7 +25,6 @@ import ( "time" "github.com/awslabs/operatorpkg/singleton" - "github.com/prometheus/client_golang/prometheus" "github.com/samber/lo" "go.uber.org/multierr" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -195,11 +194,11 @@ func (q *Queue) Reconcile(ctx context.Context) (reconcile.Result, error) { failedLaunches := lo.Filter(cmd.Replacements, func(r Replacement, _ int) bool { return !r.Initialized }) - disruptionQueueFailuresTotal.With(map[string]string{ + DisruptionQueueFailuresTotal.Add(float64(len(failedLaunches)), map[string]string{ decisionLabel: cmd.Decision(), metrics.ReasonLabel: string(cmd.reason), consolidationTypeLabel: cmd.consolidationType, - }).Add(float64(len(failedLaunches))) + }) multiErr := multierr.Combine(err, cmd.lastError, state.RequireNoScheduleTaint(ctx, q.kubeClient, false, cmd.candidates...)) // Log the error log.FromContext(ctx).WithValues("nodes", strings.Join(lo.Map(cmd.candidates, func(s *state.StateNode, _ int) string { @@ -265,11 +264,11 @@ func (q *Queue) waitOrTerminate(ctx context.Context, cmd *Command) error { if err := q.kubeClient.Delete(ctx, candidate.NodeClaim); err != nil { multiErr = multierr.Append(multiErr, client.IgnoreNotFound(err)) } else { - metrics.NodeClaimsDisruptedTotal.With(prometheus.Labels{ + metrics.NodeClaimsDisruptedTotal.Inc(map[string]string{ metrics.ReasonLabel: string(cmd.reason), metrics.NodePoolLabel: cmd.candidates[i].NodeClaim.Labels[v1.NodePoolLabelKey], metrics.CapacityTypeLabel: cmd.candidates[i].NodeClaim.Labels[v1.CapacityTypeLabelKey], - }).Inc() + }) } } // If there were any deletion failures, we should 
requeue. diff --git a/pkg/controllers/disruption/singlenodeconsolidation.go b/pkg/controllers/disruption/singlenodeconsolidation.go index 41af43d41f..a50bd0f5de 100644 --- a/pkg/controllers/disruption/singlenodeconsolidation.go +++ b/pkg/controllers/disruption/singlenodeconsolidation.go @@ -68,7 +68,7 @@ func (s *SingleNodeConsolidation) ComputeCommand(ctx context.Context, disruption continue } if s.clock.Now().After(timeout) { - ConsolidationTimeoutsTotal.WithLabelValues(s.ConsolidationType()).Inc() + ConsolidationTimeoutsTotal.Inc(map[string]string{consolidationTypeLabel: s.ConsolidationType()}) log.FromContext(ctx).V(1).Info(fmt.Sprintf("abandoning single-node consolidation due to timeout after evaluating %d candidates", i)) return Command{}, scheduling.Results{}, nil } diff --git a/pkg/controllers/metrics/node/controller.go b/pkg/controllers/metrics/node/controller.go index fc054e776d..beaa0e3c6d 100644 --- a/pkg/controllers/metrics/node/controller.go +++ b/pkg/controllers/metrics/node/controller.go @@ -21,6 +21,7 @@ import ( "strings" "time" + opmetrics "github.com/awslabs/operatorpkg/metrics" "github.com/awslabs/operatorpkg/singleton" "github.com/prometheus/client_golang/prometheus" "github.com/samber/lo" @@ -46,7 +47,8 @@ const ( ) var ( - allocatable = prometheus.NewGaugeVec( + Allocatable = opmetrics.NewPrometheusGauge( + crmetrics.Registry, prometheus.GaugeOpts{ Namespace: metrics.Namespace, Subsystem: metrics.NodeSubsystem, @@ -55,7 +57,8 @@ var ( }, nodeLabelNamesWithResourceType(), ) - totalPodRequests = prometheus.NewGaugeVec( + TotalPodRequests = opmetrics.NewPrometheusGauge( + crmetrics.Registry, prometheus.GaugeOpts{ Namespace: metrics.Namespace, Subsystem: metrics.NodeSubsystem, @@ -64,7 +67,8 @@ var ( }, nodeLabelNamesWithResourceType(), ) - totalPodLimits = prometheus.NewGaugeVec( + TotalPodLimits = opmetrics.NewPrometheusGauge( + crmetrics.Registry, prometheus.GaugeOpts{ Namespace: metrics.Namespace, Subsystem: metrics.NodeSubsystem, @@ -73,7 +77,8 @@ var ( }, nodeLabelNamesWithResourceType(), ) - totalDaemonRequests = prometheus.NewGaugeVec( + TotalDaemonRequests = opmetrics.NewPrometheusGauge( + crmetrics.Registry, prometheus.GaugeOpts{ Namespace: metrics.Namespace, Subsystem: metrics.NodeSubsystem, @@ -82,7 +87,8 @@ var ( }, nodeLabelNamesWithResourceType(), ) - totalDaemonLimits = prometheus.NewGaugeVec( + TotalDaemonLimits = opmetrics.NewPrometheusGauge( + crmetrics.Registry, prometheus.GaugeOpts{ Namespace: metrics.Namespace, Subsystem: metrics.NodeSubsystem, @@ -91,7 +97,8 @@ var ( }, nodeLabelNamesWithResourceType(), ) - systemOverhead = prometheus.NewGaugeVec( + SystemOverhead = opmetrics.NewPrometheusGauge( + crmetrics.Registry, prometheus.GaugeOpts{ Namespace: metrics.Namespace, Subsystem: metrics.NodeSubsystem, @@ -100,7 +107,8 @@ var ( }, nodeLabelNamesWithResourceType(), ) - lifetimeGaugeVec = prometheus.NewGaugeVec( + Lifetime = opmetrics.NewPrometheusGauge( + crmetrics.Registry, prometheus.GaugeOpts{ Namespace: metrics.Namespace, Subsystem: metrics.NodeSubsystem, @@ -109,7 +117,8 @@ var ( }, nodeLabelNames(), ) - clusterUtilizationGaugeVec = prometheus.NewGaugeVec( + ClusterUtilization = opmetrics.NewPrometheusGauge( + crmetrics.Registry, prometheus.GaugeOpts{ Namespace: metrics.Namespace, Subsystem: "cluster", @@ -138,19 +147,6 @@ func nodeLabelNames() []string { ) } -func init() { - crmetrics.Registry.MustRegister( - allocatable, - totalPodRequests, - totalPodLimits, - totalDaemonRequests, - totalDaemonLimits, - systemOverhead, - lifetimeGaugeVec, - 
clusterUtilizationGaugeVec, - ) -} - type Controller struct { cluster *state.Cluster metricStore *metrics.Store @@ -218,9 +214,9 @@ func buildClusterUtilizationMetric(nodes state.StateNodes) []*metrics.StoreMetri float64(utilizedResource.Value())/float64(allocatableResource.Value())) res = append(res, &metrics.StoreMetric{ - GaugeVec: clusterUtilizationGaugeVec, - Value: utilizationPercentage, - Labels: prometheus.Labels{resourceType: resourceNameToString(resourceName)}, + GaugeMetric: ClusterUtilization, + Value: utilizationPercentage, + Labels: map[string]string{resourceType: resourceNameToString(resourceName)}, }) } @@ -228,27 +224,27 @@ func buildClusterUtilizationMetric(nodes state.StateNodes) []*metrics.StoreMetri } func buildMetrics(n *state.StateNode) (res []*metrics.StoreMetric) { - for gaugeVec, resourceList := range map[*prometheus.GaugeVec]corev1.ResourceList{ - systemOverhead: resources.Subtract(n.Node.Status.Capacity, n.Node.Status.Allocatable), - totalPodRequests: n.PodRequests(), - totalPodLimits: n.PodLimits(), - totalDaemonRequests: n.DaemonSetRequests(), - totalDaemonLimits: n.DaemonSetLimits(), - allocatable: n.Node.Status.Allocatable, + for gaugeMetric, resourceList := range map[opmetrics.GaugeMetric]corev1.ResourceList{ + SystemOverhead: resources.Subtract(n.Node.Status.Capacity, n.Node.Status.Allocatable), + TotalPodRequests: n.PodRequests(), + TotalPodLimits: n.PodLimits(), + TotalDaemonRequests: n.DaemonSetRequests(), + TotalDaemonLimits: n.DaemonSetLimits(), + Allocatable: n.Node.Status.Allocatable, } { for resourceName, quantity := range resourceList { res = append(res, &metrics.StoreMetric{ - GaugeVec: gaugeVec, - Value: lo.Ternary(resourceName == corev1.ResourceCPU, float64(quantity.MilliValue())/float64(1000), float64(quantity.Value())), - Labels: getNodeLabelsWithResourceType(n.Node, resourceNameToString(resourceName)), + GaugeMetric: gaugeMetric, + Value: lo.Ternary(resourceName == corev1.ResourceCPU, float64(quantity.MilliValue())/float64(1000), float64(quantity.Value())), + Labels: getNodeLabelsWithResourceType(n.Node, resourceNameToString(resourceName)), }) } } return append(res, &metrics.StoreMetric{ - GaugeVec: lifetimeGaugeVec, - Value: time.Since(n.Node.GetCreationTimestamp().Time).Seconds(), - Labels: getNodeLabels(n.Node), + GaugeMetric: Lifetime, + Value: time.Since(n.Node.GetCreationTimestamp().Time).Seconds(), + Labels: getNodeLabels(n.Node), }) } @@ -259,7 +255,7 @@ func getNodeLabelsWithResourceType(node *corev1.Node, resourceTypeName string) p } func getNodeLabels(node *corev1.Node) prometheus.Labels { - metricLabels := prometheus.Labels{} + metricLabels := map[string]string{} metricLabels[nodeName] = node.Name metricLabels[nodePhase] = string(node.Status.Phase) diff --git a/pkg/controllers/metrics/nodepool/controller.go b/pkg/controllers/metrics/nodepool/controller.go index 1b0984c828..63e14ee59f 100644 --- a/pkg/controllers/metrics/nodepool/controller.go +++ b/pkg/controllers/metrics/nodepool/controller.go @@ -21,6 +21,7 @@ import ( "strings" "time" + opmetrics "github.com/awslabs/operatorpkg/metrics" "github.com/prometheus/client_golang/prometheus" "github.com/samber/lo" corev1 "k8s.io/api/core/v1" @@ -42,7 +43,8 @@ const ( ) var ( - limit = prometheus.NewGaugeVec( + Limit = opmetrics.NewPrometheusGauge( + crmetrics.Registry, prometheus.GaugeOpts{ Namespace: metrics.Namespace, Subsystem: metrics.NodePoolSubsystem, @@ -54,7 +56,8 @@ var ( nodePoolNameLabel, }, ) - usage = prometheus.NewGaugeVec( + Usage = opmetrics.NewPrometheusGauge( + 
crmetrics.Registry, prometheus.GaugeOpts{ Namespace: metrics.Namespace, Subsystem: metrics.NodePoolSubsystem, @@ -68,10 +71,6 @@ var ( ) ) -func init() { - crmetrics.Registry.MustRegister(limit, usage) -} - type Controller struct { kubeClient client.Client metricStore *metrics.Store @@ -102,15 +101,15 @@ func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reco } func buildMetrics(nodePool *v1.NodePool) (res []*metrics.StoreMetric) { - for gaugeVec, resourceList := range map[*prometheus.GaugeVec]corev1.ResourceList{ - usage: nodePool.Status.Resources, - limit: getLimits(nodePool), + for gaugeVec, resourceList := range map[opmetrics.GaugeMetric]corev1.ResourceList{ + Usage: nodePool.Status.Resources, + Limit: getLimits(nodePool), } { for k, v := range resourceList { res = append(res, &metrics.StoreMetric{ - GaugeVec: gaugeVec, - Labels: makeLabels(nodePool, strings.ReplaceAll(strings.ToLower(string(k)), "-", "_")), - Value: lo.Ternary(k == corev1.ResourceCPU, float64(v.MilliValue())/float64(1000), float64(v.Value())), + GaugeMetric: gaugeVec, + Labels: makeLabels(nodePool, strings.ReplaceAll(strings.ToLower(string(k)), "-", "_")), + Value: lo.Ternary(k == corev1.ResourceCPU, float64(v.MilliValue())/float64(1000), float64(v.Value())), }) } } @@ -125,7 +124,7 @@ func getLimits(nodePool *v1.NodePool) corev1.ResourceList { } func makeLabels(nodePool *v1.NodePool, resourceTypeName string) prometheus.Labels { - return prometheus.Labels{ + return map[string]string{ resourceTypeLabel: resourceTypeName, nodePoolNameLabel: nodePool.Name, } diff --git a/pkg/controllers/metrics/pod/controller.go b/pkg/controllers/metrics/pod/controller.go index 81a63ef587..1c1cdcf2ab 100644 --- a/pkg/controllers/metrics/pod/controller.go +++ b/pkg/controllers/metrics/pod/controller.go @@ -22,6 +22,7 @@ import ( "strings" "time" + opmetrics "github.com/awslabs/operatorpkg/metrics" "github.com/prometheus/client_golang/prometheus" "github.com/samber/lo" corev1 "k8s.io/api/core/v1" @@ -54,7 +55,8 @@ const ( ) var ( - podState = prometheus.NewGaugeVec( + PodState = opmetrics.NewPrometheusGauge( + crmetrics.Registry, prometheus.GaugeOpts{ Namespace: "karpenter", Subsystem: metrics.PodSubsystem, @@ -63,7 +65,8 @@ var ( }, labelNames(), ) - podStartupDurationSeconds = prometheus.NewSummary( + PodStartupDurationSeconds = opmetrics.NewPrometheusSummary( + crmetrics.Registry, prometheus.SummaryOpts{ Namespace: "karpenter", Subsystem: metrics.PodSubsystem, @@ -71,8 +74,10 @@ var ( Help: "The time from pod creation until the pod is running.", Objectives: metrics.SummaryObjectives(), }, + []string{}, ) - podBoundDurationSeconds = prometheus.NewHistogram( + PodBoundDurationSeconds = opmetrics.NewPrometheusHistogram( + crmetrics.Registry, prometheus.HistogramOpts{ Namespace: "karpenter", Subsystem: metrics.PodSubsystem, @@ -80,8 +85,10 @@ var ( Help: "The time from pod creation until the pod is bound.", Buckets: metrics.DurationBuckets(), }, + []string{}, ) - podCurrentUnboundTimeSeconds = prometheus.NewGaugeVec( + PodCurrentUnboundTimeSeconds = opmetrics.NewPrometheusGauge( + crmetrics.Registry, prometheus.GaugeOpts{ Namespace: "karpenter", Subsystem: metrics.PodSubsystem, @@ -90,7 +97,8 @@ var ( }, []string{podName, podNamespace}, ) - podUnstartedTimeSeconds = prometheus.NewGaugeVec( + PodUnstartedTimeSeconds = opmetrics.NewPrometheusGauge( + crmetrics.Registry, prometheus.GaugeOpts{ Namespace: "karpenter", Subsystem: metrics.PodSubsystem, @@ -110,10 +118,6 @@ type Controller struct { unscheduledPods 
sets.Set[string] } -func init() { - crmetrics.Registry.MustRegister(podState, podStartupDurationSeconds, podBoundDurationSeconds, podCurrentUnboundTimeSeconds, podUnstartedTimeSeconds) -} - func labelNames() []string { return []string{ podName, @@ -148,13 +152,13 @@ func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reco if errors.IsNotFound(err) { c.pendingPods.Delete(req.NamespacedName.String()) // Delete the unstarted metric since the pod is deleted - podUnstartedTimeSeconds.Delete(map[string]string{ + PodUnstartedTimeSeconds.Delete(map[string]string{ podName: req.Name, podNamespace: req.Namespace, }) c.unscheduledPods.Delete(req.NamespacedName.String()) // Delete the unbound metric since the pod is deleted - podCurrentUnboundTimeSeconds.Delete(map[string]string{ + PodCurrentUnboundTimeSeconds.Delete(map[string]string{ podName: req.Name, podNamespace: req.Namespace, }) @@ -168,9 +172,9 @@ func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reco } c.metricStore.Update(client.ObjectKeyFromObject(pod).String(), []*metrics.StoreMetric{ { - GaugeVec: podState, - Value: 1, - Labels: labels, + GaugeMetric: PodState, + Value: 1, + Labels: labels, }, }) c.recordPodStartupMetric(pod) @@ -181,10 +185,10 @@ func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reco func (c *Controller) recordPodStartupMetric(pod *corev1.Pod) { key := client.ObjectKeyFromObject(pod).String() if pod.Status.Phase == phasePending { - podUnstartedTimeSeconds.With(map[string]string{ + PodUnstartedTimeSeconds.Set(time.Since(pod.CreationTimestamp.Time).Seconds(), map[string]string{ podName: pod.Name, podNamespace: pod.Namespace, - }).Set(time.Since(pod.CreationTimestamp.Time).Seconds()) + }) c.pendingPods.Insert(key) return } @@ -193,17 +197,17 @@ func (c *Controller) recordPodStartupMetric(pod *corev1.Pod) { }) if c.pendingPods.Has(key) { if !ok || cond.Status != corev1.ConditionTrue { - podUnstartedTimeSeconds.With(map[string]string{ + PodUnstartedTimeSeconds.Set(time.Since(pod.CreationTimestamp.Time).Seconds(), map[string]string{ podName: pod.Name, podNamespace: pod.Namespace, - }).Set(time.Since(pod.CreationTimestamp.Time).Seconds()) + }) } else { // Delete the unstarted metric since the pod is now started - podUnstartedTimeSeconds.Delete(map[string]string{ + PodUnstartedTimeSeconds.Delete(map[string]string{ podName: pod.Name, podNamespace: pod.Namespace, }) - podStartupDurationSeconds.Observe(cond.LastTransitionTime.Sub(pod.CreationTimestamp.Time).Seconds()) + PodStartupDurationSeconds.Observe(cond.LastTransitionTime.Sub(pod.CreationTimestamp.Time).Seconds(), nil) c.pendingPods.Delete(key) } } @@ -216,28 +220,28 @@ func (c *Controller) recordPodBoundMetric(pod *corev1.Pod) { if pod.Status.Phase == phasePending { // If the podScheduled condition does not exist, or it exists and is not set to true, we emit pod_current_unbound_time_seconds metric. 
if !ok || condScheduled.Status != corev1.ConditionTrue { - podCurrentUnboundTimeSeconds.With(map[string]string{ + PodCurrentUnboundTimeSeconds.Set(time.Since(pod.CreationTimestamp.Time).Seconds(), map[string]string{ podName: pod.Name, podNamespace: pod.Namespace, - }).Set(time.Since(pod.CreationTimestamp.Time).Seconds()) + }) } c.unscheduledPods.Insert(key) return } if c.unscheduledPods.Has(key) && ok && condScheduled.Status == corev1.ConditionTrue { // Delete the unbound metric since the pod is now bound - podCurrentUnboundTimeSeconds.Delete(map[string]string{ + PodCurrentUnboundTimeSeconds.Delete(map[string]string{ podName: pod.Name, podNamespace: pod.Namespace, }) - podBoundDurationSeconds.Observe(condScheduled.LastTransitionTime.Sub(pod.CreationTimestamp.Time).Seconds()) + PodBoundDurationSeconds.Observe(condScheduled.LastTransitionTime.Sub(pod.CreationTimestamp.Time).Seconds(), nil) c.unscheduledPods.Delete(key) } } // makeLabels creates the makeLabels using the current state of the pod func (c *Controller) makeLabels(ctx context.Context, pod *corev1.Pod) (prometheus.Labels, error) { - metricLabels := prometheus.Labels{} + metricLabels := map[string]string{} metricLabels[podName] = pod.Name metricLabels[podNamespace] = pod.Namespace // Selflink has been deprecated after v.1.20 diff --git a/pkg/controllers/node/termination/controller.go b/pkg/controllers/node/termination/controller.go index bd2d5cf3ff..6249d1b27e 100644 --- a/pkg/controllers/node/termination/controller.go +++ b/pkg/controllers/node/termination/controller.go @@ -21,7 +21,6 @@ import ( "fmt" "time" - "github.com/prometheus/client_golang/prometheus" "github.com/samber/lo" "golang.org/x/time/rate" corev1 "k8s.io/api/core/v1" @@ -129,9 +128,9 @@ func (c *Controller) finalize(ctx context.Context, node *corev1.Node) (reconcile return reconcile.Result{RequeueAfter: 1 * time.Second}, nil } - NodesDrainedTotal.With(prometheus.Labels{ + NodesDrainedTotal.Inc(map[string]string{ metrics.NodePoolLabel: node.Labels[v1.NodePoolLabelKey], - }).Inc() + }) // In order for Pods associated with PersistentVolumes to smoothly migrate from the terminating Node, we wait // for VolumeAttachments of drain-able Pods to be cleaned up before terminating Node and removing its finalizer. // However, if TerminationGracePeriod is configured for Node, and we are past that period, we will skip waiting. 
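The pod metrics controller changes above keep per-pod gauges alive only while they are meaningful: the value is Set with the pod's name/namespace labels while the pod is still pending, and the series is removed with Delete once the pod starts or is deleted. A condensed sketch of that lifecycle against the gauge interface used in this patch; the metric and helper names here are illustrative:

package example

import (
	"time"

	opmetrics "github.com/awslabs/operatorpkg/metrics"
	"github.com/prometheus/client_golang/prometheus"
	corev1 "k8s.io/api/core/v1"
	crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
)

// podWaitSeconds is illustrative; the patch's real per-pod gauges are
// PodUnstartedTimeSeconds and PodCurrentUnboundTimeSeconds.
var podWaitSeconds = opmetrics.NewPrometheusGauge(
	crmetrics.Registry,
	prometheus.GaugeOpts{
		Namespace: "karpenter",
		Subsystem: "pods",
		Name:      "example_wait_time_seconds",
		Help:      "Illustrative only.",
	},
	[]string{"name", "namespace"},
)

func trackPodWait(pod *corev1.Pod, started bool) {
	labels := map[string]string{"name": pod.Name, "namespace": pod.Namespace}
	if !started {
		// Keep the gauge current while the pod is still waiting.
		podWaitSeconds.Set(time.Since(pod.CreationTimestamp.Time).Seconds(), labels)
		return
	}
	// Remove the series once the pod has started so it does not linger at a stale value.
	podWaitSeconds.Delete(labels)
}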
@@ -247,18 +246,18 @@ func (c *Controller) removeFinalizer(ctx context.Context, n *corev1.Node) error return client.IgnoreNotFound(fmt.Errorf("removing finalizer, %w", err)) } - metrics.NodesTerminatedTotal.With(prometheus.Labels{ + metrics.NodesTerminatedTotal.Inc(map[string]string{ metrics.NodePoolLabel: n.Labels[v1.NodePoolLabelKey], - }).Inc() + }) // We use stored.DeletionTimestamp since the api-server may give back a node after the patch without a deletionTimestamp - TerminationDurationSeconds.With(prometheus.Labels{ + DurationSeconds.Observe(time.Since(stored.DeletionTimestamp.Time).Seconds(), map[string]string{ metrics.NodePoolLabel: n.Labels[v1.NodePoolLabelKey], - }).Observe(time.Since(stored.DeletionTimestamp.Time).Seconds()) + }) - NodeLifetimeDurationSeconds.With(map[string]string{ + NodeLifetimeDurationSeconds.Observe(time.Since(n.CreationTimestamp.Time).Seconds(), map[string]string{ metrics.NodePoolLabel: n.Labels[v1.NodePoolLabelKey], - }).Observe(time.Since(n.CreationTimestamp.Time).Seconds()) + }) log.FromContext(ctx).Info("deleted node") } diff --git a/pkg/controllers/node/termination/metrics.go b/pkg/controllers/node/termination/metrics.go index 13808c9e38..32d829c6d4 100644 --- a/pkg/controllers/node/termination/metrics.go +++ b/pkg/controllers/node/termination/metrics.go @@ -19,23 +19,18 @@ package termination import ( "time" + opmetrics "github.com/awslabs/operatorpkg/metrics" "github.com/prometheus/client_golang/prometheus" crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" "sigs.k8s.io/karpenter/pkg/metrics" ) -func init() { - crmetrics.Registry.MustRegister( - TerminationDurationSeconds, - NodeLifetimeDurationSeconds, - NodesDrainedTotal) -} - const dayDuration = time.Hour * 24 var ( - TerminationDurationSeconds = prometheus.NewSummaryVec( + DurationSeconds = opmetrics.NewPrometheusSummary( + crmetrics.Registry, prometheus.SummaryOpts{ Namespace: metrics.Namespace, Subsystem: metrics.NodeSubsystem, @@ -45,7 +40,8 @@ var ( }, []string{metrics.NodePoolLabel}, ) - NodesDrainedTotal = prometheus.NewCounterVec( + NodesDrainedTotal = opmetrics.NewPrometheusCounter( + crmetrics.Registry, prometheus.CounterOpts{ Namespace: metrics.Namespace, Subsystem: metrics.NodeSubsystem, @@ -54,7 +50,8 @@ var ( }, []string{metrics.NodePoolLabel}, ) - NodeLifetimeDurationSeconds = prometheus.NewHistogramVec( + NodeLifetimeDurationSeconds = opmetrics.NewPrometheusHistogram( + crmetrics.Registry, prometheus.HistogramOpts{ Namespace: metrics.Namespace, Subsystem: metrics.NodeSubsystem, diff --git a/pkg/controllers/node/termination/suite_test.go b/pkg/controllers/node/termination/suite_test.go index da6daf60ad..d7727c48d9 100644 --- a/pkg/controllers/node/termination/suite_test.go +++ b/pkg/controllers/node/termination/suite_test.go @@ -93,7 +93,7 @@ var _ = Describe("Termination", func() { // Reset the metrics collectors metrics.NodesTerminatedTotal.Reset() - termination.TerminationDurationSeconds.Reset() + termination.DurationSeconds.Reset() termination.NodeLifetimeDurationSeconds.Reset() termination.NodesDrainedTotal.Reset() }) diff --git a/pkg/controllers/node/termination/terminator/eviction.go b/pkg/controllers/node/termination/terminator/eviction.go index 6df9f656e1..4401837067 100644 --- a/pkg/controllers/node/termination/terminator/eviction.go +++ b/pkg/controllers/node/termination/terminator/eviction.go @@ -183,7 +183,7 @@ func (q *Queue) Evict(ctx context.Context, key QueueKey) bool { var apiStatus apierrors.APIStatus if errors.As(err, &apiStatus) { code := 
apiStatus.Status().Code - NodesEvictionRequestsTotal.With(map[string]string{CodeLabel: fmt.Sprint(code)}).Inc() + NodesEvictionRequestsTotal.Inc(map[string]string{CodeLabel: fmt.Sprint(code)}) } // status codes for the eviction API are defined here: // https://kubernetes.io/docs/concepts/scheduling-eviction/api-eviction/#how-api-initiated-eviction-works @@ -204,7 +204,7 @@ func (q *Queue) Evict(ctx context.Context, key QueueKey) bool { log.FromContext(ctx).Error(err, "failed evicting pod") return false } - NodesEvictionRequestsTotal.With(map[string]string{CodeLabel: "200"}).Inc() + NodesEvictionRequestsTotal.Inc(map[string]string{CodeLabel: "200"}) q.recorder.Publish(terminatorevents.EvictPod(&corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: key.Name, Namespace: key.Namespace}})) return true } diff --git a/pkg/controllers/node/termination/terminator/metrics.go b/pkg/controllers/node/termination/terminator/metrics.go index d7c591d9bb..c2c2fb381b 100644 --- a/pkg/controllers/node/termination/terminator/metrics.go +++ b/pkg/controllers/node/termination/terminator/metrics.go @@ -17,6 +17,7 @@ limitations under the License. package terminator import ( + opmetrics "github.com/awslabs/operatorpkg/metrics" "github.com/prometheus/client_golang/prometheus" crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" @@ -28,7 +29,8 @@ const ( CodeLabel = "code" ) -var NodesEvictionRequestsTotal = prometheus.NewCounterVec( +var NodesEvictionRequestsTotal = opmetrics.NewPrometheusCounter( + crmetrics.Registry, prometheus.CounterOpts{ Namespace: metrics.Namespace, Subsystem: metrics.NodeSubsystem, @@ -37,7 +39,3 @@ var NodesEvictionRequestsTotal = prometheus.NewCounterVec( }, []string{CodeLabel}, ) - -func init() { - crmetrics.Registry.MustRegister(NodesEvictionRequestsTotal) -} diff --git a/pkg/controllers/nodeclaim/expiration/controller.go b/pkg/controllers/nodeclaim/expiration/controller.go index 5681475253..13c7cfd1bb 100644 --- a/pkg/controllers/nodeclaim/expiration/controller.go +++ b/pkg/controllers/nodeclaim/expiration/controller.go @@ -20,7 +20,6 @@ import ( "context" "time" - "github.com/prometheus/client_golang/prometheus" "k8s.io/utils/clock" controllerruntime "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -67,11 +66,11 @@ func (c *Controller) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (re } // 4. The deletion timestamp has successfully been set for the NodeClaim, update relevant metrics. log.FromContext(ctx).V(1).Info("deleting expired nodeclaim") - metrics.NodeClaimsDisruptedTotal.With(prometheus.Labels{ + metrics.NodeClaimsDisruptedTotal.Inc(map[string]string{ metrics.ReasonLabel: metrics.ExpiredReason, metrics.NodePoolLabel: nodeClaim.Labels[v1.NodePoolLabelKey], metrics.CapacityTypeLabel: nodeClaim.Labels[v1.CapacityTypeLabelKey], - }).Inc() + }) // We sleep here after the delete operation since we want to ensure that we are able to read our own writes so that // we avoid duplicating metrics and log lines due to quick re-queues. 
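The eviction queue changes above label the requests counter with the HTTP status code returned by the eviction API, extracting it from the apierrors.APIStatus on error and hard-coding 200 on success. A small sketch of that labeling pattern with the counter interface used in this patch; the metric name below is illustrative:

package example

import (
	"errors"
	"fmt"

	opmetrics "github.com/awslabs/operatorpkg/metrics"
	"github.com/prometheus/client_golang/prometheus"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
)

// evictionRequestsTotal is illustrative; the patch's real counter is
// NodesEvictionRequestsTotal in the terminator package.
var evictionRequestsTotal = opmetrics.NewPrometheusCounter(
	crmetrics.Registry,
	prometheus.CounterOpts{
		Namespace: "karpenter",
		Name:      "example_eviction_requests_total",
		Help:      "Illustrative only.",
	},
	[]string{"code"},
)

// recordEviction counts one eviction attempt, labeled with the API status code
// returned by the eviction call (or 200 on success).
func recordEviction(err error) {
	if err == nil {
		evictionRequestsTotal.Inc(map[string]string{"code": "200"})
		return
	}
	var apiStatus apierrors.APIStatus
	if errors.As(err, &apiStatus) {
		evictionRequestsTotal.Inc(map[string]string{"code": fmt.Sprint(apiStatus.Status().Code)})
	}
}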
// USE CAUTION when determining whether to increase this timeout or remove this line diff --git a/pkg/controllers/nodeclaim/garbagecollection/controller.go b/pkg/controllers/nodeclaim/garbagecollection/controller.go index e3a97d1ee6..5a4d510682 100644 --- a/pkg/controllers/nodeclaim/garbagecollection/controller.go +++ b/pkg/controllers/nodeclaim/garbagecollection/controller.go @@ -21,7 +21,6 @@ import ( "time" "github.com/awslabs/operatorpkg/singleton" - "github.com/prometheus/client_golang/prometheus" "github.com/samber/lo" "go.uber.org/multierr" corev1 "k8s.io/api/core/v1" @@ -105,11 +104,11 @@ func (c *Controller) Reconcile(ctx context.Context) (reconcile.Result, error) { "provider-id", nodeClaims[i].Status.ProviderID, "nodepool", nodeClaims[i].Labels[v1.NodePoolLabelKey], ).V(1).Info("garbage collecting nodeclaim with no cloudprovider representation") - metrics.NodeClaimsDisruptedTotal.With(prometheus.Labels{ + metrics.NodeClaimsDisruptedTotal.Inc(map[string]string{ metrics.ReasonLabel: "garbage_collected", metrics.NodePoolLabel: nodeClaims[i].Labels[v1.NodePoolLabelKey], metrics.CapacityTypeLabel: nodeClaims[i].Labels[v1.CapacityTypeLabelKey], - }).Inc() + }) }) if err = multierr.Combine(errs...); err != nil { return reconcile.Result{}, err diff --git a/pkg/controllers/nodeclaim/lifecycle/controller.go b/pkg/controllers/nodeclaim/lifecycle/controller.go index 290e7fbb2f..d91f6a9360 100644 --- a/pkg/controllers/nodeclaim/lifecycle/controller.go +++ b/pkg/controllers/nodeclaim/lifecycle/controller.go @@ -22,7 +22,6 @@ import ( "time" "github.com/patrickmn/go-cache" - "github.com/prometheus/client_golang/prometheus" "github.com/samber/lo" "go.uber.org/multierr" "golang.org/x/time/rate" @@ -210,9 +209,9 @@ func (c *Controller) finalize(ctx context.Context, nodeClaim *v1.NodeClaim) (rec if !isInstanceTerminated { return reconcile.Result{RequeueAfter: 5 * time.Second}, nil } - InstanceTerminationDurationSeconds.With(map[string]string{ + InstanceTerminationDurationSeconds.Observe(time.Since(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).LastTransitionTime.Time).Seconds(), map[string]string{ metrics.NodePoolLabel: nodeClaim.Labels[v1.NodePoolLabelKey], - }).Observe(time.Since(nodeClaim.StatusConditions().Get(v1.ConditionTypeInstanceTerminating).LastTransitionTime.Time).Seconds()) + }) } stored := nodeClaim.DeepCopy() // The NodeClaim may have been modified in the EnsureTerminated function controllerutil.RemoveFinalizer(nodeClaim, v1.TerminationFinalizer) @@ -228,13 +227,13 @@ func (c *Controller) finalize(ctx context.Context, nodeClaim *v1.NodeClaim) (rec return reconcile.Result{}, client.IgnoreNotFound(fmt.Errorf("removing termination finalizer, %w", err)) } log.FromContext(ctx).Info("deleted nodeclaim") - NodeClaimTerminationDurationSeconds.With(map[string]string{ + NodeClaimTerminationDurationSeconds.Observe(time.Since(stored.DeletionTimestamp.Time).Seconds(), map[string]string{ metrics.NodePoolLabel: nodeClaim.Labels[v1.NodePoolLabelKey], - }).Observe(time.Since(stored.DeletionTimestamp.Time).Seconds()) - metrics.NodeClaimsTerminatedTotal.With(prometheus.Labels{ + }) + metrics.NodeClaimsTerminatedTotal.Inc(map[string]string{ metrics.NodePoolLabel: nodeClaim.Labels[v1.NodePoolLabelKey], metrics.CapacityTypeLabel: nodeClaim.Labels[v1.CapacityTypeLabelKey], - }).Inc() + }) } return reconcile.Result{}, nil diff --git a/pkg/controllers/nodeclaim/lifecycle/launch.go b/pkg/controllers/nodeclaim/lifecycle/launch.go index 55878542f5..da93b5cc18 100644 --- 
a/pkg/controllers/nodeclaim/lifecycle/launch.go +++ b/pkg/controllers/nodeclaim/lifecycle/launch.go @@ -21,7 +21,6 @@ import ( "fmt" "github.com/patrickmn/go-cache" - "github.com/prometheus/client_golang/prometheus" "github.com/samber/lo" corev1 "k8s.io/api/core/v1" "sigs.k8s.io/controller-runtime/pkg/client" @@ -81,22 +80,22 @@ func (l *Launch) launchNodeClaim(ctx context.Context, nodeClaim *v1.NodeClaim) ( if err = l.kubeClient.Delete(ctx, nodeClaim); err != nil { return nil, client.IgnoreNotFound(err) } - metrics.NodeClaimsDisruptedTotal.With(prometheus.Labels{ + metrics.NodeClaimsDisruptedTotal.Inc(map[string]string{ metrics.ReasonLabel: "insufficient_capacity", metrics.NodePoolLabel: nodeClaim.Labels[v1.NodePoolLabelKey], metrics.CapacityTypeLabel: nodeClaim.Labels[v1.CapacityTypeLabelKey], - }).Inc() + }) return nil, nil case cloudprovider.IsNodeClassNotReadyError(err): log.FromContext(ctx).Error(err, "failed launching nodeclaim") if err = l.kubeClient.Delete(ctx, nodeClaim); err != nil { return nil, client.IgnoreNotFound(err) } - metrics.NodeClaimsDisruptedTotal.With(prometheus.Labels{ + metrics.NodeClaimsDisruptedTotal.Inc(map[string]string{ metrics.ReasonLabel: "nodeclass_not_ready", metrics.NodePoolLabel: nodeClaim.Labels[v1.NodePoolLabelKey], metrics.CapacityTypeLabel: nodeClaim.Labels[v1.CapacityTypeLabelKey], - }).Inc() + }) return nil, nil default: nodeClaim.StatusConditions().SetUnknownWithReason(v1.ConditionTypeLaunched, "LaunchFailed", truncateMessage(err.Error())) diff --git a/pkg/controllers/nodeclaim/lifecycle/liveness.go b/pkg/controllers/nodeclaim/lifecycle/liveness.go index f1181c95ac..fc1a272752 100644 --- a/pkg/controllers/nodeclaim/lifecycle/liveness.go +++ b/pkg/controllers/nodeclaim/lifecycle/liveness.go @@ -20,7 +20,6 @@ import ( "context" "time" - "github.com/prometheus/client_golang/prometheus" "k8s.io/utils/clock" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" @@ -57,11 +56,11 @@ func (l *Liveness) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (reco return reconcile.Result{}, client.IgnoreNotFound(err) } log.FromContext(ctx).V(1).WithValues("ttl", registrationTTL).Info("terminating due to registration ttl") - metrics.NodeClaimsDisruptedTotal.With(prometheus.Labels{ + metrics.NodeClaimsDisruptedTotal.Inc(map[string]string{ metrics.ReasonLabel: "liveness", metrics.NodePoolLabel: nodeClaim.Labels[v1.NodePoolLabelKey], metrics.CapacityTypeLabel: nodeClaim.Labels[v1.CapacityTypeLabelKey], - }).Inc() + }) return reconcile.Result{}, nil } diff --git a/pkg/controllers/nodeclaim/lifecycle/metrics.go b/pkg/controllers/nodeclaim/lifecycle/metrics.go index 0343f231a7..808999e652 100644 --- a/pkg/controllers/nodeclaim/lifecycle/metrics.go +++ b/pkg/controllers/nodeclaim/lifecycle/metrics.go @@ -17,17 +17,15 @@ limitations under the License. 
package lifecycle import ( + opmetrics "github.com/awslabs/operatorpkg/metrics" "github.com/prometheus/client_golang/prometheus" crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" "sigs.k8s.io/karpenter/pkg/metrics" ) -func init() { - crmetrics.Registry.MustRegister(InstanceTerminationDurationSeconds, NodeClaimTerminationDurationSeconds) -} - -var InstanceTerminationDurationSeconds = prometheus.NewHistogramVec( +var InstanceTerminationDurationSeconds = opmetrics.NewPrometheusHistogram( + crmetrics.Registry, prometheus.HistogramOpts{ Namespace: metrics.Namespace, Subsystem: metrics.NodeClaimSubsystem, @@ -38,7 +36,8 @@ var InstanceTerminationDurationSeconds = prometheus.NewHistogramVec( []string{metrics.NodePoolLabel}, ) -var NodeClaimTerminationDurationSeconds = prometheus.NewHistogramVec( +var NodeClaimTerminationDurationSeconds = opmetrics.NewPrometheusHistogram( + crmetrics.Registry, prometheus.HistogramOpts{ Namespace: metrics.Namespace, Subsystem: metrics.NodeClaimSubsystem, diff --git a/pkg/controllers/nodeclaim/lifecycle/registration.go b/pkg/controllers/nodeclaim/lifecycle/registration.go index 77e0e22160..3f29c64a58 100644 --- a/pkg/controllers/nodeclaim/lifecycle/registration.go +++ b/pkg/controllers/nodeclaim/lifecycle/registration.go @@ -20,7 +20,6 @@ import ( "context" "fmt" - "github.com/prometheus/client_golang/prometheus" "github.com/samber/lo" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/equality" @@ -78,9 +77,9 @@ func (r *Registration) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) ( nodeClaim.StatusConditions().SetTrue(v1.ConditionTypeRegistered) nodeClaim.Status.NodeName = node.Name - metrics.NodesCreatedTotal.With(prometheus.Labels{ + metrics.NodesCreatedTotal.Inc(map[string]string{ metrics.NodePoolLabel: nodeClaim.Labels[v1.NodePoolLabelKey], - }).Inc() + }) return reconcile.Result{}, nil } diff --git a/pkg/controllers/provisioning/provisioner.go b/pkg/controllers/provisioning/provisioner.go index 79eb8da63a..447fb1ab9b 100644 --- a/pkg/controllers/provisioning/provisioner.go +++ b/pkg/controllers/provisioning/provisioner.go @@ -26,7 +26,6 @@ import ( "github.com/awslabs/operatorpkg/option" "github.com/awslabs/operatorpkg/singleton" "github.com/awslabs/operatorpkg/status" - "github.com/prometheus/client_golang/prometheus" "github.com/samber/lo" "go.uber.org/multierr" appsv1 "k8s.io/api/apps/v1" @@ -170,7 +169,7 @@ func (p *Provisioner) GetPendingPods(ctx context.Context) ([]*corev1.Pod, error) } return false }) - scheduler.IgnoredPodCount.Set(float64(len(rejectedPods))) + scheduler.IgnoredPodCount.Set(float64(len(rejectedPods)), nil) p.consolidationWarnings(ctx, pods) return pods, nil } @@ -306,9 +305,7 @@ func (p *Provisioner) NewScheduler(ctx context.Context, pods []*corev1.Pod, stat } func (p *Provisioner) Schedule(ctx context.Context) (scheduler.Results, error) { - defer metrics.Measure(scheduler.SchedulingDurationSeconds.With( - prometheus.Labels{scheduler.ControllerLabel: injection.GetControllerName(ctx)}, - ))() + defer metrics.Measure(scheduler.DurationSeconds, map[string]string{scheduler.ControllerLabel: injection.GetControllerName(ctx)})() start := time.Now() // We collect the nodes with their used capacities before we get the list of pending pods. 
This ensures that @@ -349,9 +346,7 @@ func (p *Provisioner) Schedule(ctx context.Context) (scheduler.Results, error) { return scheduler.Results{}, fmt.Errorf("creating scheduler, %w", err) } results := s.Solve(ctx, pods).TruncateInstanceTypes(scheduler.MaxInstanceTypes) - scheduler.UnschedulablePodsCount.With( - prometheus.Labels{scheduler.ControllerLabel: injection.GetControllerName(ctx)}, - ).Set(float64(len(results.PodErrors))) + scheduler.UnschedulablePodsCount.Set(float64(len(results.PodErrors)), map[string]string{scheduler.ControllerLabel: injection.GetControllerName(ctx)}) if len(results.NewNodeClaims) > 0 { log.FromContext(ctx).WithValues("Pods", pretty.Slice(lo.Map(pods, func(p *corev1.Pod, _ int) string { return klog.KRef(p.Namespace, p.Name).String() }), 5), "duration", time.Since(start)).Info("found provisionable pod(s)") } @@ -380,11 +375,11 @@ func (p *Provisioner) Create(ctx context.Context, n *scheduler.NodeClaim, opts . log.FromContext(ctx).WithValues("NodeClaim", klog.KRef("", nodeClaim.Name), "requests", nodeClaim.Spec.Resources.Requests, "instance-types", instanceTypeList(instanceTypeRequirement.Values)). Info("created nodeclaim") - metrics.NodeClaimsCreatedTotal.With(prometheus.Labels{ + metrics.NodeClaimsCreatedTotal.Inc(map[string]string{ metrics.ReasonLabel: options.Reason, metrics.NodePoolLabel: nodeClaim.Labels[v1.NodePoolLabelKey], metrics.CapacityTypeLabel: nodeClaim.Labels[v1.CapacityTypeLabelKey], - }).Inc() + }) // Update the nodeclaim manually in state to avoid evenutal consistency delay races with our watcher. // This is essential to avoiding races where disruption can create a replacement node, then immediately // requeue. This can race with controller-runtime's internal cache as it watches events on the cluster diff --git a/pkg/controllers/provisioning/scheduling/metrics.go b/pkg/controllers/provisioning/scheduling/metrics.go index 9f6d9cfd1b..43d341bb0c 100644 --- a/pkg/controllers/provisioning/scheduling/metrics.go +++ b/pkg/controllers/provisioning/scheduling/metrics.go @@ -17,16 +17,13 @@ limitations under the License. 
package scheduling import ( + opmetrics "github.com/awslabs/operatorpkg/metrics" "github.com/prometheus/client_golang/prometheus" crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" "sigs.k8s.io/karpenter/pkg/metrics" ) -func init() { - crmetrics.Registry.MustRegister(SchedulingDurationSeconds, QueueDepth, IgnoredPodCount, UnschedulablePodsCount) -} - const ( ControllerLabel = "controller" schedulingIDLabel = "scheduling_id" @@ -34,7 +31,8 @@ const ( ) var ( - SchedulingDurationSeconds = prometheus.NewHistogramVec( + DurationSeconds = opmetrics.NewPrometheusHistogram( + crmetrics.Registry, prometheus.HistogramOpts{ Namespace: metrics.Namespace, Subsystem: schedulerSubsystem, @@ -46,7 +44,8 @@ var ( ControllerLabel, }, ) - QueueDepth = prometheus.NewGaugeVec( + QueueDepth = opmetrics.NewPrometheusGauge( + crmetrics.Registry, prometheus.GaugeOpts{ Namespace: metrics.Namespace, Subsystem: schedulerSubsystem, @@ -58,14 +57,17 @@ var ( schedulingIDLabel, }, ) - IgnoredPodCount = prometheus.NewGauge( + IgnoredPodCount = opmetrics.NewPrometheusGauge( + crmetrics.Registry, prometheus.GaugeOpts{ Namespace: metrics.Namespace, Name: "ignored_pod_count", Help: "Number of pods ignored during scheduling by Karpenter", }, + []string{}, ) - UnschedulablePodsCount = prometheus.NewGaugeVec( + UnschedulablePodsCount = opmetrics.NewPrometheusGauge( + crmetrics.Registry, prometheus.GaugeOpts{ Namespace: metrics.Namespace, Subsystem: schedulerSubsystem, diff --git a/pkg/controllers/provisioning/scheduling/scheduler.go b/pkg/controllers/provisioning/scheduling/scheduler.go index 65a65d6079..fb4b7f55e3 100644 --- a/pkg/controllers/provisioning/scheduling/scheduler.go +++ b/pkg/controllers/provisioning/scheduling/scheduler.go @@ -23,7 +23,6 @@ import ( "sort" "time" - "github.com/prometheus/client_golang/prometheus" "github.com/samber/lo" "go.uber.org/multierr" corev1 "k8s.io/api/core/v1" @@ -198,9 +197,7 @@ func (r Results) TruncateInstanceTypes(maxInstanceTypes int) Results { } func (s *Scheduler) Solve(ctx context.Context, pods []*corev1.Pod) Results { - defer metrics.Measure(SchedulingDurationSeconds.With( - prometheus.Labels{ControllerLabel: injection.GetControllerName(ctx)}, - ))() + defer metrics.Measure(DurationSeconds, map[string]string{ControllerLabel: injection.GetControllerName(ctx)})() // We loop trying to schedule unschedulable pods as long as we are making progress. This solves a few // issues including pods with affinity to another pod in the batch. We could topo-sort to solve this, but it wouldn't // solve the problem of scheduling pods where a particular order is needed to prevent a max-skew violation. E.g. if we @@ -208,17 +205,15 @@ func (s *Scheduler) Solve(ctx context.Context, pods []*corev1.Pod) Results { // We need to schedule them alternating, A, B, A, B, .... and this solution also solves that as well. errors := map[*corev1.Pod]error{} // Reset the metric for the controller, so we don't keep old ids around - UnschedulablePodsCount.DeletePartialMatch(prometheus.Labels{ControllerLabel: injection.GetControllerName(ctx)}) - QueueDepth.DeletePartialMatch(prometheus.Labels{ControllerLabel: injection.GetControllerName(ctx)}) + UnschedulablePodsCount.DeletePartialMatch(map[string]string{ControllerLabel: injection.GetControllerName(ctx)}) + QueueDepth.DeletePartialMatch(map[string]string{ControllerLabel: injection.GetControllerName(ctx)}) q := NewQueue(pods...) 
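Solve above (like the provisioner's Schedule) times itself with the reworked metrics.Measure helper, which now takes the observation metric plus a label map instead of a pre-bound prometheus.Observer. A compact sketch of how that deferral works with a histogram built through the new constructor; the histogram name and the timed function are illustrative:

package example

import (
	"time"

	opmetrics "github.com/awslabs/operatorpkg/metrics"
	"github.com/prometheus/client_golang/prometheus"
	crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
)

// stepDurationSeconds is illustrative; the patch's real histograms include the
// scheduler's DurationSeconds and the cloudprovider MethodDuration.
var stepDurationSeconds = opmetrics.NewPrometheusHistogram(
	crmetrics.Registry,
	prometheus.HistogramOpts{
		Namespace: "karpenter",
		Name:      "example_step_duration_seconds",
		Help:      "Illustrative only.",
	},
	[]string{"controller"},
)

// measure mirrors pkg/metrics.Measure after this patch: capture the start time
// now, then observe the elapsed seconds when the returned func runs.
func measure(o opmetrics.ObservationMetric, labels map[string]string) func() {
	start := time.Now()
	return func() { o.Observe(time.Since(start).Seconds(), labels) }
}

func doStep() {
	defer measure(stepDurationSeconds, map[string]string{"controller": "example"})()
	// ... the work being timed ...
}

Because the labels are supplied at observation time, one package-level histogram can serve every controller without pre-binding a child observer.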
startTime := s.clock.Now() lastLogTime := s.clock.Now() batchSize := len(q.pods) for { - QueueDepth.With( - prometheus.Labels{ControllerLabel: injection.GetControllerName(ctx), schedulingIDLabel: string(s.id)}, - ).Set(float64(len(q.pods))) + QueueDepth.Set(float64(len(q.pods)), map[string]string{ControllerLabel: injection.GetControllerName(ctx), schedulingIDLabel: string(s.id)}) if s.clock.Since(lastLogTime) > time.Minute { log.FromContext(ctx).WithValues("pods-scheduled", batchSize-len(q.pods), "pods-remaining", len(q.pods), "duration", s.clock.Since(startTime).Truncate(time.Second), "scheduling-id", string(s.id)).Info("computing pod scheduling...") diff --git a/pkg/controllers/provisioning/scheduling/suite_test.go b/pkg/controllers/provisioning/scheduling/suite_test.go index bcb65dc963..f548ed6ecf 100644 --- a/pkg/controllers/provisioning/scheduling/suite_test.go +++ b/pkg/controllers/provisioning/scheduling/suite_test.go @@ -111,7 +111,7 @@ var _ = AfterEach(func() { ExpectCleanedUp(ctx, env.Client) cluster.Reset() scheduling.QueueDepth.Reset() - scheduling.SchedulingDurationSeconds.Reset() + scheduling.DurationSeconds.Reset() scheduling.UnschedulablePodsCount.Reset() }) diff --git a/pkg/controllers/provisioning/suite_test.go b/pkg/controllers/provisioning/suite_test.go index b24f93c444..acee66e8f3 100644 --- a/pkg/controllers/provisioning/suite_test.go +++ b/pkg/controllers/provisioning/suite_test.go @@ -100,7 +100,7 @@ var _ = AfterEach(func() { ExpectCleanedUp(ctx, env.Client) cloudProvider.Reset() cluster.Reset() - pscheduling.IgnoredPodCount.Set(0) + pscheduling.IgnoredPodCount.Set(0, nil) }) var _ = Describe("Provisioning", func() { diff --git a/pkg/controllers/state/cluster.go b/pkg/controllers/state/cluster.go index aeec1ee06b..28b51ea3fb 100644 --- a/pkg/controllers/state/cluster.go +++ b/pkg/controllers/state/cluster.go @@ -87,17 +87,17 @@ func (c *Cluster) Synced(ctx context.Context) (synced bool) { defer func() { if synced { c.unsyncedStartTime = time.Time{} - ClusterStateUnsyncedTimeSeconds.With(map[string]string{}).Set(0) + ClusterStateUnsyncedTimeSeconds.Set(0, nil) } else { if c.unsyncedStartTime.IsZero() { c.unsyncedStartTime = c.clock.Now() } - ClusterStateUnsyncedTimeSeconds.With(map[string]string{}).Set(c.clock.Since(c.unsyncedStartTime).Seconds()) + ClusterStateUnsyncedTimeSeconds.Set(c.clock.Since(c.unsyncedStartTime).Seconds(), nil) } }() // Set the metric to whatever the result of the Synced() call is defer func() { - ClusterStateSynced.Set(lo.Ternary[float64](synced, 1, 0)) + ClusterStateSynced.Set(lo.Ternary[float64](synced, 1, 0), nil) }() nodeClaimList := &v1.NodeClaimList{} if err := c.kubeClient.List(ctx, nodeClaimList); err != nil { @@ -247,7 +247,7 @@ func (c *Cluster) UpdateNodeClaim(nodeClaim *v1.NodeClaim) { // If the nodeclaim hasn't launched yet, we want to add it into cluster state to ensure // that we're not racing with the internal cache for the cluster, assuming the node doesn't exist. 
 	c.nodeClaimNameToProviderID[nodeClaim.Name] = nodeClaim.Status.ProviderID
-	ClusterStateNodesCount.Set(float64(len(c.nodes)))
+	ClusterStateNodesCount.Set(float64(len(c.nodes)), nil)
 }
 
 func (c *Cluster) DeleteNodeClaim(name string) {
@@ -255,7 +255,7 @@ func (c *Cluster) DeleteNodeClaim(name string) {
 	defer c.mu.Unlock()
 
 	c.cleanupNodeClaim(name)
-	ClusterStateNodesCount.Set(float64(len(c.nodes)))
+	ClusterStateNodesCount.Set(float64(len(c.nodes)), nil)
 }
 
 func (c *Cluster) UpdateNode(ctx context.Context, node *corev1.Node) error {
@@ -282,7 +282,7 @@ func (c *Cluster) UpdateNode(ctx context.Context, node *corev1.Node) error {
 	}
 	c.nodes[node.Spec.ProviderID] = n
 	c.nodeNameToProviderID[node.Name] = node.Spec.ProviderID
-	ClusterStateNodesCount.Set(float64(len(c.nodes)))
+	ClusterStateNodesCount.Set(float64(len(c.nodes)), nil)
 	return nil
 }
 
@@ -290,7 +290,7 @@ func (c *Cluster) DeleteNode(name string) {
 	c.mu.Lock()
 	defer c.mu.Unlock()
 	c.cleanupNode(name)
-	ClusterStateNodesCount.Set(float64(len(c.nodes)))
+	ClusterStateNodesCount.Set(float64(len(c.nodes)), nil)
 }
 
 func (c *Cluster) UpdatePod(ctx context.Context, pod *corev1.Pod) error {
diff --git a/pkg/controllers/state/metrics.go b/pkg/controllers/state/metrics.go
index a344c59de8..172378b601 100644
--- a/pkg/controllers/state/metrics.go
+++ b/pkg/controllers/state/metrics.go
@@ -17,6 +17,7 @@ limitations under the License.
 package state
 
 import (
+	opmetrics "github.com/awslabs/operatorpkg/metrics"
 	"github.com/prometheus/client_golang/prometheus"
 	crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
 
 	"sigs.k8s.io/karpenter/pkg/metrics"
 )
@@ -28,24 +29,29 @@ const (
 )
 
 var (
-	ClusterStateNodesCount = prometheus.NewGauge(
+	ClusterStateNodesCount = opmetrics.NewPrometheusGauge(
+		crmetrics.Registry,
 		prometheus.GaugeOpts{
 			Namespace: metrics.Namespace,
 			Subsystem: stateSubsystem,
 			Name: "node_count",
 			Help: "Current count of nodes in cluster state",
 		},
+		[]string{},
 	)
-	ClusterStateSynced = prometheus.NewGauge(
+	ClusterStateSynced = opmetrics.NewPrometheusGauge(
+		crmetrics.Registry,
 		prometheus.GaugeOpts{
 			Namespace: metrics.Namespace,
 			Subsystem: stateSubsystem,
 			Name: "synced",
 			Help: "Returns 1 if cluster state is synced and 0 otherwise. Synced checks that nodeclaims and nodes that are stored in the APIServer have the same representation as Karpenter's cluster state",
 		},
+		[]string{},
 	)
-	ClusterStateUnsyncedTimeSeconds = prometheus.NewGaugeVec(
+	ClusterStateUnsyncedTimeSeconds = opmetrics.NewPrometheusGauge(
+		crmetrics.Registry,
 		prometheus.GaugeOpts{
 			Namespace: metrics.Namespace,
 			Subsystem: stateSubsystem,
@@ -55,7 +61,3 @@ var (
 		[]string{},
 	)
 )
-
-func init() {
-	crmetrics.Registry.MustRegister(ClusterStateNodesCount, ClusterStateSynced, ClusterStateUnsyncedTimeSeconds)
-}
diff --git a/pkg/metrics/constants.go b/pkg/metrics/constants.go
index 25559e9a3e..32d8bdcac8 100644
--- a/pkg/metrics/constants.go
+++ b/pkg/metrics/constants.go
@@ -19,7 +19,7 @@ package metrics
 import (
 	"time"
 
-	"github.com/prometheus/client_golang/prometheus"
+	opmetrics "github.com/awslabs/operatorpkg/metrics"
 )
 
 const (
@@ -58,7 +58,7 @@ func SummaryObjectives() map[float64]float64 {
 // Measure returns a deferrable function that observes the duration between the
 // defer statement and the end of the function.
-func Measure(observer prometheus.Observer) func() {
+func Measure(observer opmetrics.ObservationMetric, labels map[string]string) func() {
 	start := time.Now()
-	return func() { observer.Observe(time.Since(start).Seconds()) }
+	return func() { observer.Observe(time.Since(start).Seconds(), labels) }
 }
diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go
index fd21137cba..e05ec1c031 100644
--- a/pkg/metrics/metrics.go
+++ b/pkg/metrics/metrics.go
@@ -17,6 +17,7 @@ limitations under the License.
 package metrics
 
 import (
+	opmetrics "github.com/awslabs/operatorpkg/metrics"
 	"github.com/prometheus/client_golang/prometheus"
 	crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
 )
@@ -29,7 +30,8 @@ const (
 )
 
 var (
-	NodeClaimsCreatedTotal = prometheus.NewCounterVec(
+	NodeClaimsCreatedTotal = opmetrics.NewPrometheusCounter(
+		crmetrics.Registry,
 		prometheus.CounterOpts{
 			Namespace: Namespace,
 			Subsystem: NodeClaimSubsystem,
@@ -42,7 +44,8 @@ var (
 			CapacityTypeLabel,
 		},
 	)
-	NodeClaimsTerminatedTotal = prometheus.NewCounterVec(
+	NodeClaimsTerminatedTotal = opmetrics.NewPrometheusCounter(
+		crmetrics.Registry,
 		prometheus.CounterOpts{
 			Namespace: Namespace,
 			Subsystem: NodeClaimSubsystem,
@@ -54,7 +57,8 @@ var (
 			CapacityTypeLabel,
 		},
 	)
-	NodeClaimsDisruptedTotal = prometheus.NewCounterVec(
+	NodeClaimsDisruptedTotal = opmetrics.NewPrometheusCounter(
+		crmetrics.Registry,
 		prometheus.CounterOpts{
 			Namespace: Namespace,
 			Subsystem: NodeClaimSubsystem,
@@ -67,7 +71,8 @@ var (
 			CapacityTypeLabel,
 		},
 	)
-	NodesCreatedTotal = prometheus.NewCounterVec(
+	NodesCreatedTotal = opmetrics.NewPrometheusCounter(
+		crmetrics.Registry,
 		prometheus.CounterOpts{
 			Namespace: Namespace,
 			Subsystem: NodeSubsystem,
@@ -78,7 +83,8 @@ var (
 			NodePoolLabel,
 		},
 	)
-	NodesTerminatedTotal = prometheus.NewCounterVec(
+	NodesTerminatedTotal = opmetrics.NewPrometheusCounter(
+		crmetrics.Registry,
 		prometheus.CounterOpts{
 			Namespace: Namespace,
 			Subsystem: NodeSubsystem,
@@ -90,8 +96,3 @@ var (
 		},
 	)
 )
-
-func init() {
-	crmetrics.Registry.MustRegister(NodeClaimsCreatedTotal, NodeClaimsTerminatedTotal, NodeClaimsDisruptedTotal,
-		NodesCreatedTotal, NodesTerminatedTotal)
-}
diff --git a/pkg/metrics/store.go b/pkg/metrics/store.go
index d429959301..93eb323807 100644
--- a/pkg/metrics/store.go
+++ b/pkg/metrics/store.go
@@ -19,6 +19,7 @@ package metrics
 import (
 	"sync"
 
+	opmetrics "github.com/awslabs/operatorpkg/metrics"
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/samber/lo"
 	"k8s.io/apimachinery/pkg/api/equality"
@@ -38,9 +39,9 @@ func NewStore() *Store {
 	return &Store{store: map[string][]*StoreMetric{}}
 }
 
-// StoreMetric is a single state metric associated with a prometheus.GaugeVec
+// StoreMetric is a single state metric associated with a metrics.Gauge
 type StoreMetric struct {
-	*prometheus.GaugeVec
+	opmetrics.GaugeMetric
 	Value float64
 	Labels prometheus.Labels
 }
@@ -48,13 +49,13 @@ type StoreMetric struct {
 // update is an internal non-thread-safe method for updating metrics given a key in the Store
 func (s *Store) update(key string, metrics []*StoreMetric) {
 	for _, metric := range metrics {
-		metric.With(metric.Labels).Set(metric.Value)
+		metric.Set(metric.Value, metric.Labels)
 	}
 	// Cleanup old metrics if the old metric family has metrics that weren't updated by this round of metrics
 	if oldMetrics, ok := s.store[key]; ok {
 		for _, oldMetric := range oldMetrics {
 			if _, ok = lo.Find(metrics, func(m *StoreMetric) bool {
-				return oldMetric.GaugeVec == m.GaugeVec && equality.Semantic.DeepEqual(oldMetric.Labels, m.Labels)
+				return oldMetric.GaugeMetric == m.GaugeMetric && equality.Semantic.DeepEqual(oldMetric.Labels, m.Labels)
 			}); !ok {
 				oldMetric.Delete(oldMetric.Labels)
 			}
diff --git a/pkg/metrics/suite_test.go b/pkg/metrics/suite_test.go
index 1db5d29faf..791dfefa70 100644
--- a/pkg/metrics/suite_test.go
+++ b/pkg/metrics/suite_test.go
@@ -19,6 +19,7 @@ package metrics_test
 import (
 	"testing"
 
+	opmetrics "github.com/awslabs/operatorpkg/metrics"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 	"github.com/prometheus/client_golang/prometheus"
@@ -30,7 +31,7 @@ import (
 	. "sigs.k8s.io/karpenter/pkg/test/expectations"
 )
 
-var testGauge1, testGauge2 *prometheus.GaugeVec
+var testGauge1, testGauge2 opmetrics.GaugeMetric
 
 func TestAPIs(t *testing.T) {
 	RegisterFailHandler(Fail)
 }
 
 var _ = BeforeSuite(func() {
-	testGauge1 = prometheus.NewGaugeVec(prometheus.GaugeOpts{Name: "test_gauge_1"}, []string{"label_1", "label_2"})
-	testGauge2 = prometheus.NewGaugeVec(prometheus.GaugeOpts{Name: "test_gauge_2"}, []string{"label_1", "label_2"})
-	crmetrics.Registry.MustRegister(testGauge1, testGauge2)
+	testGauge1 = opmetrics.NewPrometheusGauge(crmetrics.Registry, prometheus.GaugeOpts{Name: "test_gauge_1"}, []string{"label_1", "label_2"})
+	testGauge2 = opmetrics.NewPrometheusGauge(crmetrics.Registry, prometheus.GaugeOpts{Name: "test_gauge_2"}, []string{"label_1", "label_2"})
 })
 
 var _ = Describe("Store", func() {
@@ -55,17 +55,17 @@ var _ = Describe("Store", func() {
 	It("should create metrics when calling update", func() {
 		storeMetrics := []*metrics.StoreMetric{
 			{
-				GaugeVec: testGauge1,
-				Value: 3.65,
-				Labels: prometheus.Labels{
+				GaugeMetric: testGauge1,
+				Value: 3.65,
+				Labels: map[string]string{
 					"label_1": "test",
 					"label_2": "test",
 				},
 			},
 			{
-				GaugeVec: testGauge2,
-				Value: 5.3,
-				Labels: prometheus.Labels{
+				GaugeMetric: testGauge2,
+				Value: 5.3,
+				Labels: map[string]string{
 					"label_1": "test",
 					"label_2": "test",
 				},
@@ -86,17 +86,17 @@ var _ = Describe("Store", func() {
 	It("should delete metrics when calling update", func() {
 		storeMetrics := []*metrics.StoreMetric{
 			{
-				GaugeVec: testGauge1,
-				Value: 3.65,
-				Labels: prometheus.Labels{
+				GaugeMetric: testGauge1,
+				Value: 3.65,
+				Labels: map[string]string{
 					"label_1": "test",
 					"label_2": "test",
 				},
 			},
 			{
-				GaugeVec: testGauge2,
-				Value: 5.3,
-				Labels: prometheus.Labels{
+				GaugeMetric: testGauge2,
+				Value: 5.3,
+				Labels: map[string]string{
 					"label_1": "test",
 					"label_2": "test",
 				},
@@ -106,17 +106,17 @@
 		newStoreMetrics := []*metrics.StoreMetric{
 			{
-				GaugeVec: testGauge1,
-				Value: 3.65,
-				Labels: prometheus.Labels{
+				GaugeMetric: testGauge1,
+				Value: 3.65,
+				Labels: map[string]string{
 					"label_1": "test_2",
 					"label_2": "test_2",
 				},
 			},
 			{
-				GaugeVec: testGauge2,
-				Value: 5.3,
-				Labels: prometheus.Labels{
+				GaugeMetric: testGauge2,
+				Value: 5.3,
+				Labels: map[string]string{
 					"label_1": "test_2",
 					"label_2": "test_2",
 				},
@@ -141,17 +141,17 @@ var _ = Describe("Store", func() {
 	It("should consider metrics equal with the same labels", func() {
 		storeMetrics := []*metrics.StoreMetric{
 			{
-				GaugeVec: testGauge1,
-				Value: 3.65,
-				Labels: prometheus.Labels{
+				GaugeMetric: testGauge1,
+				Value: 3.65,
+				Labels: map[string]string{
 					"label_1": "test",
 					"label_2": "test",
 				},
 			},
 			{
-				GaugeVec: testGauge2,
-				Value: 5.3,
-				Labels: prometheus.Labels{
+				GaugeMetric: testGauge2,
+				Value: 5.3,
+				Labels: map[string]string{
 					"label_1": "test",
 					"label_2": "test",
 				},
@@ -171,17 +171,17 @@ var _ = Describe("Store", func() {
 		// Flip around the labels in the map
 		newStoreMetrics := []*metrics.StoreMetric{
 			{
-				GaugeVec: testGauge1,
-				Value: 4.5,
-				Labels: prometheus.Labels{
+				GaugeMetric: testGauge1,
+				Value: 4.5,
+				Labels: map[string]string{
 					"label_2": "test",
 					"label_1": "test",
 				},
 			},
 			{
-				GaugeVec: testGauge2,
-				Value: 6.9,
-				Labels: prometheus.Labels{
+				GaugeMetric: testGauge2,
+				Value: 6.9,
+				Labels: map[string]string{
 					"label_2": "test",
 					"label_1": "test",
 				},
@@ -203,17 +203,17 @@ var _ = Describe("Store", func() {
 	It("should delete metrics by key", func() {
 		storeMetrics := []*metrics.StoreMetric{
 			{
-				GaugeVec: testGauge1,
-				Value: 3.65,
-				Labels: prometheus.Labels{
+				GaugeMetric: testGauge1,
+				Value: 3.65,
+				Labels: map[string]string{
 					"label_1": "test",
 					"label_2": "test",
 				},
 			},
 			{
-				GaugeVec: testGauge2,
-				Value: 5.3,
-				Labels: prometheus.Labels{
+				GaugeMetric: testGauge2,
+				Value: 5.3,
+				Labels: map[string]string{
 					"label_1": "test",
 					"label_2": "test",
 				},
@@ -243,17 +243,17 @@ var _ = Describe("Store", func() {
 	It("should replace all metrics", func() {
 		storeMetrics := []*metrics.StoreMetric{
 			{
-				GaugeVec: testGauge1,
-				Value: 3.65,
-				Labels: prometheus.Labels{
+				GaugeMetric: testGauge1,
+				Value: 3.65,
+				Labels: map[string]string{
 					"label_1": "test",
 					"label_2": "test",
 				},
 			},
 			{
-				GaugeVec: testGauge2,
-				Value: 5.3,
-				Labels: prometheus.Labels{
+				GaugeMetric: testGauge2,
+				Value: 5.3,
+				Labels: map[string]string{
 					"label_1": "test",
 					"label_2": "test",
 				},
@@ -273,17 +273,17 @@
 		newStore := map[string][]*metrics.StoreMetric{
 			key2.String(): {
 				{
-					GaugeVec: testGauge1,
-					Value: 3.65,
-					Labels: prometheus.Labels{
+					GaugeMetric: testGauge1,
+					Value: 3.65,
+					Labels: map[string]string{
 						"label_1": "test2",
 						"label_2": "test2",
 					},
 				},
 				{
-					GaugeVec: testGauge2,
-					Value: 4.3,
-					Labels: prometheus.Labels{
+					GaugeMetric: testGauge2,
+					Value: 4.3,
+					Labels: map[string]string{
 						"label_1": "test2",
 						"label_2": "test2",
 					},
@@ -291,17 +291,17 @@
 			},
 			key3.String(): {
 				{
-					GaugeVec: testGauge1,
-					Value: 2.1,
-					Labels: prometheus.Labels{
+					GaugeMetric: testGauge1,
+					Value: 2.1,
+					Labels: map[string]string{
 						"label_1": "test3",
 						"label_2": "test3",
 					},
 				},
 				{
-					GaugeVec: testGauge2,
-					Value: 8.9,
-					Labels: prometheus.Labels{
+					GaugeMetric: testGauge2,
+					Value: 8.9,
+					Labels: map[string]string{
 						"label_1": "test3",
 						"label_2": "test3",
 					},
@@ -331,17 +331,17 @@ var _ = Describe("Store", func() {
 	It("should replace with an empty store", func() {
 		storeMetrics := []*metrics.StoreMetric{
 			{
-				GaugeVec: testGauge1,
-				Value: 3.65,
-				Labels: prometheus.Labels{
+				GaugeMetric: testGauge1,
+				Value: 3.65,
+				Labels: map[string]string{
 					"label_1": "test",
 					"label_2": "test",
 				},
 			},
 			{
-				GaugeVec: testGauge2,
-				Value: 5.3,
-				Labels: prometheus.Labels{
+				GaugeMetric: testGauge2,
+				Value: 5.3,
+				Labels: map[string]string{
 					"label_1": "test",
 					"label_2": "test",
 				},
diff --git a/pkg/operator/operator.go b/pkg/operator/operator.go
index a6ba3a9f36..3793302d4d 100644
--- a/pkg/operator/operator.go
+++ b/pkg/operator/operator.go
@@ -67,7 +67,8 @@
 )
 
 var (
-	BuildInfo = prometheus.NewGaugeVec(
+	BuildInfo = opmetrics.NewPrometheusGauge(
+		crmetrics.Registry,
 		prometheus.GaugeOpts{
 			Namespace: metrics.Namespace,
 			Name: "build_info",
@@ -82,10 +83,14 @@ var (
 var Version = "unspecified"
 
 func init() {
-	crmetrics.Registry.MustRegister(BuildInfo)
 	opmetrics.RegisterClientMetrics(crmetrics.Registry)
-	BuildInfo.WithLabelValues(Version, runtime.Version(), runtime.GOARCH, env.GetRevision()).Set(1)
+	BuildInfo.Set(1, map[string]string{
+		"version": Version,
+		"goversion": runtime.Version(),
+		"goarch": runtime.GOARCH,
+		"commit": env.GetRevision(),
+	})
 }
 
 type Operator struct {
diff --git a/pkg/test/expectations/expectations.go b/pkg/test/expectations/expectations.go
index e600aa8de6..590699e18d 100644
--- a/pkg/test/expectations/expectations.go
+++ b/pkg/test/expectations/expectations.go
@@ -26,6 +26,7 @@ import (
 	"sync"
 	"time"
 
+	opmetrics "github.com/awslabs/operatorpkg/metrics"
 	"github.com/awslabs/operatorpkg/singleton"
 	"github.com/awslabs/operatorpkg/status"
 	. "github.com/onsi/ginkgo/v2" //nolint:revive,stylecheck
@@ -543,17 +544,17 @@ func FindMetricWithLabelValues(name string, labelValues map[string]string) (*pro
 	return nil, false
 }
 
-func ExpectMetricGaugeValue(collector prometheus.Collector, expectedValue float64, labels map[string]string) {
+func ExpectMetricGaugeValue(collector opmetrics.GaugeMetric, expectedValue float64, labels map[string]string) {
 	GinkgoHelper()
-	metricName := ExpectMetricName(collector)
+	metricName := ExpectMetricName(collector.(*opmetrics.PrometheusGauge))
 	metric, ok := FindMetricWithLabelValues(metricName, labels)
 	Expect(ok).To(BeTrue(), "Metric "+metricName+" should be available")
 	Expect(lo.FromPtr(metric.Gauge.Value)).To(Equal(expectedValue), "Metric "+metricName+" should have the expected value")
 }
 
-func ExpectMetricCounterValue(collector prometheus.Collector, expectedValue float64, labels map[string]string) {
+func ExpectMetricCounterValue(collector opmetrics.CounterMetric, expectedValue float64, labels map[string]string) {
 	GinkgoHelper()
-	metricName := ExpectMetricName(collector)
+	metricName := ExpectMetricName(collector.(*opmetrics.PrometheusCounter))
 	metric, ok := FindMetricWithLabelValues(metricName, labels)
 	Expect(ok).To(BeTrue(), "Metric "+metricName+" should be available")
 	Expect(lo.FromPtr(metric.Counter.Value)).To(Equal(expectedValue), "Metric "+metricName+" should have the expected value")
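
Reviewer note: every hunk above applies the same mechanical rewrite, so a single minimal sketch of the before/after call pattern may help when scanning the diff. The metric name, label, variable, and helper function below are illustrative only and are not taken from this patch; the constructor and Set signatures mirror the calls already shown in the hunks above.

package example

import (
	opmetrics "github.com/awslabs/operatorpkg/metrics"
	"github.com/prometheus/client_golang/prometheus"
	crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
)

// Old pattern: declare a raw *prometheus.GaugeVec, register it in an init()
// block, and bind labels with With() before writing a value:
//
//	gauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{Name: "example_gauge"}, []string{"nodepool"})
//	crmetrics.Registry.MustRegister(gauge)
//	gauge.With(prometheus.Labels{"nodepool": "default"}).Set(1)

// New pattern: the opmetrics constructor takes the registry up front (which is
// why the init() registration blocks disappear in this change), and every
// write passes its labels as a plain map[string]string.
var ExampleGauge = opmetrics.NewPrometheusGauge(
	crmetrics.Registry,
	prometheus.GaugeOpts{Name: "example_gauge"},
	[]string{"nodepool"},
)

func recordExample() {
	ExampleGauge.Set(1, map[string]string{"nodepool": "default"})
}

Counters and histograms follow the same shape in this patch, which is also why metrics.Measure now receives the label map alongside the observation metric instead of a pre-labeled observer.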