Skip to content

Commit

Permalink
Merge pull request #346 from wy-lucky/main
Browse files Browse the repository at this point in the history
optimize: add probe for cluster status
  • Loading branch information
mrlihanbo authored Dec 17, 2024
2 parents 26a5bcb + ee09ca5 commit 92754f8
Show file tree
Hide file tree
Showing 6 changed files with 122 additions and 1 deletion.
8 changes: 8 additions & 0 deletions cmd/controller-manager/app/options/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ type Options struct {

NSAutoPropExcludeRegexp string
ClusterJoinTimeout time.Duration
ClusterStatusThreshold time.Duration
MemberObjectEnqueueDelay time.Duration

MaxPodListers int64
Expand Down Expand Up @@ -174,6 +175,13 @@ func (o *Options) AddFlags(flags *pflag.FlagSet, allControllers []string, disabl
time.Second*30,
"The period of health check for member clusters. The minimum value is "+MinClusterHealthCheckPeriod.String()+".",
)

flags.DurationVar(
&o.ClusterStatusThreshold,
"cluster-status-threshold",
time.Second*100,
"The threshold of member clusters status change.",
)
}

func (o *Options) addKlogFlags(flags *pflag.FlagSet) {
Expand Down
1 change: 1 addition & 0 deletions cmd/controller-manager/app/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ func getComponentConfig(opts *options.Options) (*controllercontext.ComponentConf
MemberObjectEnqueueDelay: opts.MemberObjectEnqueueDelay,
EnableKatalystSupport: opts.EnableKatalystSupport,
ClusterHealthCheckPeriod: opts.ClusterHealthCheckPeriod,
ClusterStatusThreshold: opts.ClusterStatusThreshold,
}

if opts.ClusterHealthCheckPeriod < options.MinClusterHealthCheckPeriod {
Expand Down
1 change: 1 addition & 0 deletions pkg/controllers/context/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,4 +85,5 @@ type ComponentConfig struct {
ResourceAggregationNodeFilter []labels.Selector
EnableKatalystSupport bool
ClusterHealthCheckPeriod time.Duration
ClusterStatusThreshold time.Duration
}
103 changes: 103 additions & 0 deletions pkg/controllers/federatedcluster/cluster_status_cache.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
This file may have been modified by The KubeAdmiral Authors
("KubeAdmiral Modifications"). All KubeAdmiral Modifications
are Copyright 2023 The KubeAdmiral Authors.
*/

package federatedcluster

import (
"context"
"sync"
"time"

"k8s.io/klog/v2"

fedcorev1a1 "github.com/kubewharf/kubeadmiral/pkg/apis/core/v1alpha1"
)

// clusterStatusStore caches the most recently observed status conditions per
// cluster, together with the threshold used to delay reporting a changed status.
type clusterStatusStore struct {
// clusterStatusData maps a cluster name (string) to its
// *clusterStatusConditionData entry; it is read by get and written by update.
clusterStatusData sync.Map
// clusterStatusThreshold is the duration added to an entry's probeTimestamp;
// until that instant has passed, a changed observation is not returned in
// place of the conditions currently recorded on the cluster object.
clusterStatusThreshold time.Duration
}

// clusterStatusConditionData is a single cache entry: the last observed
// offline/ready conditions of a cluster and the time their status last changed.
type clusterStatusConditionData struct {
// offlineCondition is the most recently observed offline condition.
offlineCondition fedcorev1a1.ClusterCondition
// readyCondition is the most recently observed ready condition.
readyCondition fedcorev1a1.ClusterCondition
// probeTimestamp is set to the current time whenever the observed status
// differs from the previously cached one. It stays at the zero value for the
// first entry recorded for a cluster.
probeTimestamp time.Time
}

// thresholdAdjustedStatusCondition decides which offline/ready conditions
// should be written for the cluster, damping short-lived status changes.
//
// The freshly observed conditions are returned unchanged when:
//   - no cache entry exists for the cluster yet (one is created from the
//     observation, with a zero probe timestamp), or
//   - the cluster object carries no offline or no ready condition, or
//   - the observed statuses equal the statuses already on the cluster object, or
//   - the observation differs from the cluster object and the threshold,
//     counted from the cached probe timestamp, has elapsed.
//
// Otherwise (the observation differs and the threshold has not elapsed) the
// conditions currently stored on the cluster object are returned instead.
//
// Whenever the observed statuses differ from the cached ones, the cache entry
// is replaced and its probe timestamp reset to the current time.
func (c *clusterStatusStore) thresholdAdjustedStatusCondition(
	ctx context.Context,
	cluster *fedcorev1a1.FederatedCluster,
	observedOfflineCondition fedcorev1a1.ClusterCondition,
	observedReadyCondition fedcorev1a1.ClusterCondition,
) (fedcorev1a1.ClusterCondition, fedcorev1a1.ClusterCondition) {
	lg := klog.FromContext(ctx)
	name := cluster.Name

	cached := c.get(name)
	if cached == nil {
		// First observation for this cluster: remember it and report it as-is.
		c.update(name, &clusterStatusConditionData{
			offlineCondition: observedOfflineCondition,
			readyCondition:   observedReadyCondition,
		})
		return observedOfflineCondition, observedReadyCondition
	}

	reportedOffline := getClusterCondition(&cluster.Status, fedcorev1a1.ClusterOffline)
	reportedReady := getClusterCondition(&cluster.Status, fedcorev1a1.ClusterReady)
	if reportedOffline == nil || reportedReady == nil {
		// Nothing recorded on the cluster object to fall back to.
		return observedOfflineCondition, observedReadyCondition
	}

	probeTime := time.Now()

	sameAsCached := cached.offlineCondition.Status == observedOfflineCondition.Status &&
		cached.readyCondition.Status == observedReadyCondition.Status
	if !sameAsCached {
		// The observation flipped since the last probe: restart the threshold window.
		cached = &clusterStatusConditionData{
			offlineCondition: observedOfflineCondition,
			readyCondition:   observedReadyCondition,
			probeTimestamp:   probeTime,
		}
		c.update(name, cached)
	}

	sameAsReported := reportedOffline.Status == observedOfflineCondition.Status &&
		reportedReady.Status == observedReadyCondition.Status
	if sameAsReported {
		return observedOfflineCondition, observedReadyCondition
	}

	deadline := cached.probeTimestamp.Add(c.clusterStatusThreshold)
	if probeTime.Before(deadline) {
		// The change has not persisted long enough yet; keep the reported status.
		lg.V(3).WithValues("offline", reportedOffline.Status, "ready", reportedReady.Status).
			Info("Threshold not exceeded, return the old status condition")
		return *reportedOffline, *reportedReady
	}

	lg.V(3).WithValues("offline", observedOfflineCondition.Status, "ready", observedReadyCondition.Status).
		Info("Cluster status condition changed")
	return observedOfflineCondition, observedReadyCondition
}

// get returns the cached entry stored under the given cluster name, or nil
// when no entry has been stored for it.
func (c *clusterStatusStore) get(cluster string) *clusterStatusConditionData {
	if entry, found := c.clusterStatusData.Load(cluster); found {
		return entry.(*clusterStatusConditionData)
	}
	return nil
}

// update stores data as the cache entry for the given cluster name, replacing
// any entry previously stored under that name.
func (c *clusterStatusStore) update(cluster string, data *clusterStatusConditionData) {
	entries := &c.clusterStatusData
	entries.Store(cluster, data)
}
5 changes: 4 additions & 1 deletion pkg/controllers/federatedcluster/clusterstatus.go
Original file line number Diff line number Diff line change
Expand Up @@ -298,8 +298,11 @@ func (c *FederatedClusterController) collectIndividualClusterStatus(
}

offlineCondition := getNewClusterOfflineCondition(offlineStatus, conditionTime)
setClusterCondition(&cluster.Status, &offlineCondition)
readyCondition := getNewClusterReadyCondition(readyStatus, readyReason, readyMessage, conditionTime)

offlineCondition, readyCondition = c.clusterStatusCache.thresholdAdjustedStatusCondition(ctx, cluster, offlineCondition, readyCondition)

setClusterCondition(&cluster.Status, &offlineCondition)
setClusterCondition(&cluster.Status, &readyCondition)

if err := retry.RetryOnConflict(retry.DefaultBackoff, func() error {
Expand Down
5 changes: 5 additions & 0 deletions pkg/controllers/federatedcluster/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ type FederatedClusterController struct {
clusterHealthCheckConfig *ClusterHealthCheckConfig
clusterJoinTimeout time.Duration
resourceAggregationNodeFilter []labels.Selector
clusterStatusCache clusterStatusStore

lock sync.Mutex
clusterConnectionHashes map[string]string
Expand Down Expand Up @@ -120,6 +121,10 @@ func NewFederatedClusterController(
clusterHealthCheckConfig: &ClusterHealthCheckConfig{
Period: componentConfig.ClusterHealthCheckPeriod,
},
clusterStatusCache: clusterStatusStore{
clusterStatusData: sync.Map{},
clusterStatusThreshold: componentConfig.ClusterStatusThreshold,
},

lock: sync.Mutex{},
clusterConnectionHashes: map[string]string{},
Expand Down

0 comments on commit 92754f8

Please sign in to comment.