Skip to content

Commit

Permalink
feat: expose controller leader election duration and renew opts (#1657)
Browse files Browse the repository at this point in the history
Signed-off-by: Derek Wang <[email protected]>
  • Loading branch information
whynowy authored Apr 10, 2024
1 parent 4b52756 commit e9c3731
Show file tree
Hide file tree
Showing 9 changed files with 180 additions and 39 deletions.
20 changes: 19 additions & 1 deletion config/advanced-install/namespaced-controller-wo-crds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,25 @@ spec:
- name: NUMAFLOW_LEADER_ELECTION_DISABLED
valueFrom:
configMapKeyRef:
key: controller.disable.leader.election
key: controller.leader.election.disabled
name: numaflow-cmd-params-config
optional: true
- name: NUMAFLOW_LEADER_ELECTION_LEASE_DURATION
valueFrom:
configMapKeyRef:
key: controller.leader.election.lease.duration
name: numaflow-cmd-params-config
optional: true
- name: NUMAFLOW_LEADER_ELECTION_LEASE_RENEW_DEADLINE
valueFrom:
configMapKeyRef:
key: controller.leader.election.lease.renew.deadline
name: numaflow-cmd-params-config
optional: true
- name: NUMAFLOW_LEADER_ELECTION_LEASE_RENEW_PERIOD
valueFrom:
configMapKeyRef:
key: controller.leader.election.lease.renew.period
name: numaflow-cmd-params-config
optional: true
image: quay.io/numaproj/numaflow:latest
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,25 @@ spec:
valueFrom:
configMapKeyRef:
name: numaflow-cmd-params-config
key: controller.disable.leader.election
key: controller.leader.election.disabled
optional: true
- name: NUMAFLOW_LEADER_ELECTION_LEASE_DURATION
valueFrom:
configMapKeyRef:
name: numaflow-cmd-params-config
key: controller.leader.election.lease.duration
optional: true
- name: NUMAFLOW_LEADER_ELECTION_LEASE_RENEW_DEADLINE
valueFrom:
configMapKeyRef:
name: numaflow-cmd-params-config
key: controller.leader.election.lease.renew.deadline
optional: true
- name: NUMAFLOW_LEADER_ELECTION_LEASE_RENEW_PERIOD
valueFrom:
configMapKeyRef:
name: numaflow-cmd-params-config
key: controller.leader.election.lease.renew.period
optional: true
volumeMounts:
- mountPath: /etc/numaflow
Expand Down
17 changes: 16 additions & 1 deletion config/base/shared-config/numaflow-cmd-params-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,22 @@ data:
# managed.namespace: numaflow-system
#
### Whether to disable leader election for the controller, defaults to false
# controller.disable.leader.election: "false"
# controller.leader.election.disabled: "false"
#
### The duration that non-leader candidates will wait to force acquire leadership.
# This is measured against time of last observed ack. Default is 15 seconds.
# The configuration has to be: lease.duration > lease.renew.deadline > lease.renew.period
# controller.leader.election.lease.duration: 15s
#
### The duration that the acting controlplane will retry refreshing leadership before giving up.
# Default is 10 seconds.
# The configuration has to be: lease.duration > lease.renew.deadline > lease.renew.period
# controller.leader.election.lease.renew.deadline: 10s
#
### The duration the LeaderElector clients should wait between tries of actions, which means
# the controller tries to renew the lease once every such period. Default is 2 seconds.
# The configuration has to be: lease.duration > lease.renew.deadline > lease.renew.period
# controller.leader.election.lease.renew.period: 2s
#
### Whether to disable TLS for UX server.
# server.insecure: "false"
Expand Down
20 changes: 19 additions & 1 deletion config/install.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17382,7 +17382,25 @@ spec:
- name: NUMAFLOW_LEADER_ELECTION_DISABLED
valueFrom:
configMapKeyRef:
key: controller.disable.leader.election
key: controller.leader.election.disabled
name: numaflow-cmd-params-config
optional: true
- name: NUMAFLOW_LEADER_ELECTION_LEASE_DURATION
valueFrom:
configMapKeyRef:
key: controller.leader.election.lease.duration
name: numaflow-cmd-params-config
optional: true
- name: NUMAFLOW_LEADER_ELECTION_LEASE_RENEW_DEADLINE
valueFrom:
configMapKeyRef:
key: controller.leader.election.lease.renew.deadline
name: numaflow-cmd-params-config
optional: true
- name: NUMAFLOW_LEADER_ELECTION_LEASE_RENEW_PERIOD
valueFrom:
configMapKeyRef:
key: controller.leader.election.lease.renew.period
name: numaflow-cmd-params-config
optional: true
image: quay.io/numaproj/numaflow:latest
Expand Down
20 changes: 19 additions & 1 deletion config/namespace-install.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17276,7 +17276,25 @@ spec:
- name: NUMAFLOW_LEADER_ELECTION_DISABLED
valueFrom:
configMapKeyRef:
key: controller.disable.leader.election
key: controller.leader.election.disabled
name: numaflow-cmd-params-config
optional: true
- name: NUMAFLOW_LEADER_ELECTION_LEASE_DURATION
valueFrom:
configMapKeyRef:
key: controller.leader.election.lease.duration
name: numaflow-cmd-params-config
optional: true
- name: NUMAFLOW_LEADER_ELECTION_LEASE_RENEW_DEADLINE
valueFrom:
configMapKeyRef:
key: controller.leader.election.lease.renew.deadline
name: numaflow-cmd-params-config
optional: true
- name: NUMAFLOW_LEADER_ELECTION_LEASE_RENEW_PERIOD
valueFrom:
configMapKeyRef:
key: controller.leader.election.lease.renew.period
name: numaflow-cmd-params-config
optional: true
image: quay.io/numaproj/numaflow:latest
Expand Down
27 changes: 26 additions & 1 deletion docs/operations/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,31 @@ Similarly, another approach is to add `--managed-namespace` and the specific nam
By default, the Numaflow controller is installed with `Active-Passive` HA strategy enabled, which means you can run the controller with multiple replicas (defaults to 1 in the manifests).
There are some parameters that can be tuned for the leader election mechanism of HA.
```yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: numaflow-cmd-params-config
data:
### The duration that non-leader candidates will wait to force acquire leadership.
# This is measured against time of last observed ack. Default is 15 seconds.
# The configuration has to be: lease.duration > lease.renew.deadline > lease.renew.period
controller.leader.election.lease.duration: 15s
#
### The duration that the acting controlplane will retry refreshing leadership before giving up.
# Default is 10 seconds.
# The configuration has to be: lease.duration > lease.renew.deadline > lease.renew.period
controller.leader.election.lease.renew.deadline: 10s
### The duration the LeaderElector clients should wait between tries of actions, which means
# the controller tries to renew the lease once every such period. Default is 2 seconds.
# The configuration has to be: lease.duration > lease.renew.deadline > lease.renew.period
controller.leader.election.lease.renew.period: 2s
```

These parameters are useful when you want to tune the frequency of leader election renewal calls to the K8s API server, which are usually configured at a high priority level of [API Priority and Fairness](https://kubernetes.io/docs/concepts/cluster-administration/flow-control/).

To turn off HA, configure the ConfigMap `numaflow-cmd-params-config` as follows.

```yaml
Expand All @@ -109,7 +134,7 @@ metadata:
name: numaflow-cmd-params-config
data:
# Whether to disable leader election for the controller, defaults to false
controller.disable.leader.election: "true"
controller.leader.election.disabled: "true"
```
If HA is turned off, the controller deployment should not run with multiple replicas.
67 changes: 35 additions & 32 deletions pkg/apis/numaflow/v1alpha1/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,38 +87,41 @@ const (
ControllerVertex = "vertex-controller"

// ENV vars
EnvNamespace = "NUMAFLOW_NAMESPACE"
EnvPipelineName = "NUMAFLOW_PIPELINE_NAME"
EnvVertexName = "NUMAFLOW_VERTEX_NAME"
EnvPod = "NUMAFLOW_POD"
EnvReplica = "NUMAFLOW_REPLICA"
EnvVertexObject = "NUMAFLOW_VERTEX_OBJECT"
EnvPipelineObject = "NUMAFLOW_PIPELINE_OBJECT"
EnvSideInputObject = "NUMAFLOW_SIDE_INPUT_OBJECT"
EnvImage = "NUMAFLOW_IMAGE"
EnvImagePullPolicy = "NUMAFLOW_IMAGE_PULL_POLICY"
EnvISBSvcRedisSentinelURL = "NUMAFLOW_ISBSVC_REDIS_SENTINEL_URL"
EnvISBSvcSentinelMaster = "NUMAFLOW_ISBSVC_REDIS_SENTINEL_MASTER"
EnvISBSvcRedisURL = "NUMAFLOW_ISBSVC_REDIS_URL"
EnvISBSvcRedisUser = "NUMAFLOW_ISBSVC_REDIS_USER"
EnvISBSvcRedisPassword = "NUMAFLOW_ISBSVC_REDIS_PASSWORD"
EnvISBSvcRedisSentinelPassword = "NUMAFLOW_ISBSVC_REDIS_SENTINEL_PASSWORD"
EnvISBSvcRedisClusterMaxRedirects = "NUMAFLOW_ISBSVC_REDIS_CLUSTER_MAX_REDIRECTS"
EnvISBSvcJetStreamUser = "NUMAFLOW_ISBSVC_JETSTREAM_USER"
EnvISBSvcJetStreamPassword = "NUMAFLOW_ISBSVC_JETSTREAM_PASSWORD"
EnvISBSvcJetStreamURL = "NUMAFLOW_ISBSVC_JETSTREAM_URL"
EnvISBSvcJetStreamTLSEnabled = "NUMAFLOW_ISBSVC_JETSTREAM_TLS_ENABLED"
EnvISBSvcConfig = "NUMAFLOW_ISBSVC_CONFIG"
EnvLeaderElectionDisabled = "NUMAFLOW_LEADER_ELECTION_DISABLED"
EnvDebug = "NUMAFLOW_DEBUG"
EnvPPROF = "NUMAFLOW_PPROF"
EnvHealthCheckDisabled = "NUMAFLOW_HEALTH_CHECK_DISABLED"
EnvGRPCMaxMessageSize = "NUMAFLOW_GRPC_MAX_MESSAGE_SIZE"
EnvCPURequest = "NUMAFLOW_CPU_REQUEST"
EnvCPULimit = "NUMAFLOW_CPU_LIMIT"
EnvMemoryRequest = "NUMAFLOW_MEMORY_REQUEST"
EnvMemoryLimit = "NUMAFLOW_MEMORY_LIMIT"
EnvGoDebug = "GODEBUG"
EnvNamespace = "NUMAFLOW_NAMESPACE"
EnvPipelineName = "NUMAFLOW_PIPELINE_NAME"
EnvVertexName = "NUMAFLOW_VERTEX_NAME"
EnvPod = "NUMAFLOW_POD"
EnvReplica = "NUMAFLOW_REPLICA"
EnvVertexObject = "NUMAFLOW_VERTEX_OBJECT"
EnvPipelineObject = "NUMAFLOW_PIPELINE_OBJECT"
EnvSideInputObject = "NUMAFLOW_SIDE_INPUT_OBJECT"
EnvImage = "NUMAFLOW_IMAGE"
EnvImagePullPolicy = "NUMAFLOW_IMAGE_PULL_POLICY"
EnvISBSvcRedisSentinelURL = "NUMAFLOW_ISBSVC_REDIS_SENTINEL_URL"
EnvISBSvcSentinelMaster = "NUMAFLOW_ISBSVC_REDIS_SENTINEL_MASTER"
EnvISBSvcRedisURL = "NUMAFLOW_ISBSVC_REDIS_URL"
EnvISBSvcRedisUser = "NUMAFLOW_ISBSVC_REDIS_USER"
EnvISBSvcRedisPassword = "NUMAFLOW_ISBSVC_REDIS_PASSWORD"
EnvISBSvcRedisSentinelPassword = "NUMAFLOW_ISBSVC_REDIS_SENTINEL_PASSWORD"
EnvISBSvcRedisClusterMaxRedirects = "NUMAFLOW_ISBSVC_REDIS_CLUSTER_MAX_REDIRECTS"
EnvISBSvcJetStreamUser = "NUMAFLOW_ISBSVC_JETSTREAM_USER"
EnvISBSvcJetStreamPassword = "NUMAFLOW_ISBSVC_JETSTREAM_PASSWORD"
EnvISBSvcJetStreamURL = "NUMAFLOW_ISBSVC_JETSTREAM_URL"
EnvISBSvcJetStreamTLSEnabled = "NUMAFLOW_ISBSVC_JETSTREAM_TLS_ENABLED"
EnvISBSvcConfig = "NUMAFLOW_ISBSVC_CONFIG"
EnvLeaderElectionDisabled = "NUMAFLOW_LEADER_ELECTION_DISABLED"
EnvLeaderElectionLeaseDuration = "NUMAFLOW_LEADER_ELECTION_LEASE_DURATION"
EnvLeaderElectionLeaseRenewDeadline = "NUMAFLOW_LEADER_ELECTION_LEASE_RENEW_DEADLINE"
EnvLeaderElectionLeaseRenewPeriod = "NUMAFLOW_LEADER_ELECTION_LEASE_RENEW_PERIOD"
EnvDebug = "NUMAFLOW_DEBUG"
EnvPPROF = "NUMAFLOW_PPROF"
EnvHealthCheckDisabled = "NUMAFLOW_HEALTH_CHECK_DISABLED"
EnvGRPCMaxMessageSize = "NUMAFLOW_GRPC_MAX_MESSAGE_SIZE"
EnvCPURequest = "NUMAFLOW_CPU_REQUEST"
EnvCPULimit = "NUMAFLOW_CPU_LIMIT"
EnvMemoryRequest = "NUMAFLOW_MEMORY_REQUEST"
EnvMemoryLimit = "NUMAFLOW_MEMORY_LIMIT"
EnvGoDebug = "GODEBUG"

PathVarRun = "/var/run/numaflow"
VertexMetricsPort = 2469
Expand Down
26 changes: 26 additions & 0 deletions pkg/reconciler/cmd/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package cmd
import (
"context"
"reflect"
"time"

"go.uber.org/zap"
appv1 "k8s.io/api/apps/v1"
Expand Down Expand Up @@ -71,6 +72,31 @@ func Start(namespaced bool, managedNamespace string) {

if sharedutil.LookupEnvStringOr(dfv1.EnvLeaderElectionDisabled, "false") == "true" {
opts.LeaderElection = false
} else {
leaseDurationStr := sharedutil.LookupEnvStringOr(dfv1.EnvLeaderElectionLeaseDuration, "15s") // Defaults to 15s
leaseDuration, err := time.ParseDuration(leaseDurationStr)
if err != nil {
logger.Fatalf("Invalid ENV %s value: %s", dfv1.EnvLeaderElectionLeaseDuration, leaseDurationStr)
}
opts.LeaseDuration = &leaseDuration
leaseRenewDeadlineStr := sharedutil.LookupEnvStringOr(dfv1.EnvLeaderElectionLeaseRenewDeadline, "10s") // Defaults to 10s
leaseRenewDeadline, err := time.ParseDuration(leaseRenewDeadlineStr)
if err != nil {
logger.Fatalf("Invalid ENV %s value: %s", dfv1.EnvLeaderElectionLeaseRenewDeadline, leaseRenewDeadlineStr)
}
if leaseDuration <= leaseRenewDeadline {
logger.Fatalf("Invalid config: %s should always be greater than %s", dfv1.EnvLeaderElectionLeaseDuration, dfv1.EnvLeaderElectionLeaseRenewDeadline)
}
opts.RenewDeadline = &leaseRenewDeadline
leaseRenewPeriodStr := sharedutil.LookupEnvStringOr(dfv1.EnvLeaderElectionLeaseRenewPeriod, "2s") // Defaults to 2s
leaseRenewPeriod, err := time.ParseDuration(leaseRenewPeriodStr)
if err != nil {
logger.Fatalf("Invalid ENV %s value: %s", dfv1.EnvLeaderElectionLeaseRenewPeriod, leaseRenewPeriodStr)
}
if leaseRenewDeadline <= leaseRenewPeriod {
logger.Fatalf("Invalid config: %s should always be greater than %s", dfv1.EnvLeaderElectionLeaseRenewDeadline, dfv1.EnvLeaderElectionLeaseRenewPeriod)
}
opts.RetryPeriod = &leaseRenewPeriod
}

if namespaced {
Expand Down
2 changes: 1 addition & 1 deletion test/manifests/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ patchesStrategicMerge:
metadata:
name: numaflow-cmd-params-config
data:
controller.disable.leader.election: "true"
controller.leader.election.disabled: "true"
namespace: numaflow-system

Expand Down

0 comments on commit e9c3731

Please sign in to comment.