Add ability to configure client-go's QPS and Burst settings

## Problem and Symptoms

When a very large number of proxies request identity in a short period of time (e.g. during large node scaling events), the identity controller will attempt to validate the tokens sent by the proxies at a rate surpassing client-go's default request rate threshold, triggering client-side throttling, which will delay the proxies' initialization and can even cause their startup to fail (after a 2m timeout). The identity controller will surface this through log entries like this:

```
time="2023-11-08T19:50:45Z" level=error msg="error validating token for web.emojivoto.serviceaccount.identity.linkerd.cluster.local: client rate limiter Wait returned an error: rate: Wait(n=1) would exceed context deadline"
```

## Solution

Client-go's default `QPS` is 5 and `Burst` is 10. This PR exposes those settings as entries in `values.yaml` with defaults of 100 and 200 respectively. Note this only applies to the identity controller, as it's the only controller performing direct requests to the `kube-apiserver` in a hot path. The other controllers mostly rely on informers, and direct calls are sporadic.

## Observability

The `QPS` and `Burst` settings used are exposed both as a log entry as soon as the controller starts, and in the new metric gauges `http_client_qps` and `http_client_burst`.

## Testing

You can use the following K6 script, which simulates 6k calls to the `Certify` service during one minute from emojivoto's web pod. Before running this you need to:

- Put the identity.proto and [all the other proto files](https://github.com/linkerd/linkerd2-proxy-api/tree/v0.11.0/proto) in the same directory.
- Edit the [checkRequest](https://github.com/linkerd/linkerd2/blob/edge-23.11.3/pkg/identity/service.go#L266) function and add logging statements to figure out the `token` and `csr` entries you can use here, that will be shown as soon as a web pod starts.

```javascript
import { Client, Stream } from 'k6/experimental/grpc';
import { sleep } from 'k6';

const client = new Client();
client.load(['.'], 'identity.proto');

// This always holds:
// req_num = (1 / req_duration ) * duration * VUs
// Given req_duration (0.5s) test duration (1m) and the target req_num (6k), we
// can solve for the required VUs:
// VUs = req_num * req_duration / duration
// VUs = 6000 * 0.5 / 60 = 50

export const options = {
  scenarios: {
    identity: {
      executor: 'constant-vus',
      vus: 50,
      duration: '1m',
    },
  },
};

export default () => {
  client.connect('localhost:8080', {
    plaintext: true,
  });

  const stream = new Stream(client, 'io.linkerd.proxy.identity.Identity/Certify');

  // Replace with your own token
  let token = "ZXlKaGJHY2lPaUpTVXpJMU5pSXNJbXRwWkNJNkluQjBaV1pUZWtaNWQyVm5OMmxmTTBkV2VUTlhWSFpqTmxwSmJYRmtNMWRSVEhwNVNHWllhUzFaZDNNaWZRLmV5SmhkV1FpT2xzaWFXUmxiblJwZEhrdWJEVmtMbWx2SWwwc0ltVjRjQ0k2TVRjd01EWTRPVFk1TUN3aWFXRjBJam94TnpBd05qQXpNamt3TENKcGMzTWlPaUpvZEhSd2N6b3ZMMnQxWW1WeWJtVjBaWE11WkdWbVlYVnNkQzV6ZG1NdVkyeDFjM1JsY2k1c2IyTmhiQ0lzSW10MVltVnlibVYwWlhNdWFXOGlPbnNpYm1GdFpYTndZV05sSWpvaVpXMXZhbWwyYjNSdklpd2ljRzlrSWpwN0ltNWhiV1VpT2lKM1pXSXRPRFUxT1dJNU4yWTNZeTEwYldJNU5TSXNJblZwWkNJNklqaGlZbUV5WWpsbExXTXdOVGN0TkRnMk1TMWhNalZsTFRjelpEY3dOV1EzWmpoaU1TSjlMQ0p6WlhKMmFXTmxZV05qYjNWdWRDSTZleUp1WVcxbElqb2lkMlZpSWl3aWRXbGtJam9pWm1JelpUQXlNRE10TmpZMU55MDBOMk0xTFRoa09EUXRORGt6WXpBM1lXUTJaak0zSW4xOUxDSnVZbVlpT2pFM01EQTJNRE15T1RBc0luTjFZaUk2SW5ONWMzUmxiVHB6WlhKMmFXTmxZV05qYjNWdWREcGxiVzlxYVhadmRHODZkMlZpSW4wLnlwMzAzZVZkeHhpamxBOG1wVjFObGZKUDB3SC03RmpUQl9PcWJ3NTNPeGU1cnNTcDNNNk96VWR6OFdhYS1hcjNkVVhQR2x2QXRDRVU2RjJUN1lKUFoxVmxxOUFZZTNvV2YwOXUzOWRodUU1ZDhEX21JUl9rWDUxY193am9UcVlORHA5ZzZ4ZFJNcW9reGg3NE9GNXFjaEFhRGtENUJNZVZ6a25kUWZtVVZwME5BdTdDMTZ3UFZWSlFmNlVXRGtnYkI1SW9UQXpxSmcyWlpyNXBBY3F5enJ0WE1rRkhSWmdvYUxVam5sN1FwX0ljWm8yYzJWWk03T2QzRjIwcFZaVzJvejlOdGt3THZoSEhSMkc5WlNJQ3RHRjdhTkYwNVR5ZC1UeU1BVnZMYnM0ZFl1clRYaHNORjhQMVk4RmFuNjE4d0x6ZUVMOUkzS1BJLUctUXRUNHhWdw==";
  // Replace with your own CSR
  let csr = "MIIBWjCCAQECAQAwRjFEMEIGA1UEAxM7d2ViLmVtb2ppdm90by5zZXJ2aWNlYWNjb3VudC5pZGVudGl0eS5saW5rZXJkLmNsdXN0ZXIubG9jYWwwWTATBgcqhkjOPQIBBggqhkjOPQMBBwNCAATKjgVXu6F+WCda3Bbq2ue6m3z6OTMfQ4Vnmekmvirip/XGyi2HbzRzjARnIzGlG8wo4EfeYBtd2MBCb50kP8F8oFkwVwYJKoZIhvcNAQkOMUowSDBGBgNVHREEPzA9gjt3ZWIuZW1vaml2b3RvLnNlcnZpY2VhY2NvdW50LmlkZW50aXR5LmxpbmtlcmQuY2x1c3Rlci5sb2NhbDAKBggqhkjOPQQDAgNHADBEAiAM7aXY8MRs/EOhtPo4+PRHuiNOV+nsmNDv5lvtJt8T+QIgFP5JAq0iq7M6ShRNkRG99ZquJ3L3TtLWMNVTPvqvvUE=";

  const data = {
    identity: "web.emojivoto.serviceaccount.identity.linkerd.cluster.local",
    token: token,
    certificate_signing_request: csr,
  };
  stream.write(data);

  // This request takes around 2ms, so this sleep will mostly determine its final duration
  sleep(0.5);
};
```

This results in the following report:

```
scenarios: (100.00%) 1 scenario, 50 max VUs, 1m30s max duration (incl. graceful stop):
         * identity: 50 looping VUs for 1m0s (gracefulStop: 30s)

data_received................: 6.3 MB 104 kB/s
data_sent....................: 9.4 MB 156 kB/s
grpc_req_duration............: avg=2.14ms min=873.93µs med=1.9ms max=12.89ms p(90)=3.13ms p(95)=3.86ms
grpc_streams.................: 6000 99.355331/s
grpc_streams_msgs_received...: 6000 99.355331/s
grpc_streams_msgs_sent.......: 6000 99.355331/s
iteration_duration...........: avg=503.16ms min=500.8ms med=502.64ms max=532.36ms p(90)=504.05ms p(95)=505.72ms
iterations...................: 6000 99.355331/s
vus..........................: 50 min=50 max=50
vus_max......................: 50 min=50 max=50

running (1m00.4s), 00/50 VUs, 6000 complete and 0 interrupted iterations
```

With the old defaults (QPS=5 and Burst=10), the latencies would be much higher and the number of completed requests much lower.
linkerd · Nov 22, 2023 · 5f6a703 · 5f6a703
1 parent e849b1a
commit 5f6a703
Show file tree

Hide file tree

Showing 27 changed files with 138 additions and 10 deletions.
diff --git a/charts/linkerd-control-plane/README.md b/charts/linkerd-control-plane/README.md
@@ -175,6 +175,8 @@ Kubernetes: `>=1.21.0-0`
 | identity.issuer.tls | object | `{"crtPEM":"","keyPEM":""}` | Which scheme is used for the identity issuer secret format |
 | identity.issuer.tls.crtPEM | string | `""` | Issuer certificate (ECDSA). It must be provided during install. |
 | identity.issuer.tls.keyPEM | string | `""` | Key for the issuer certificate (ECDSA). It must be provided during install |
+| identity.kubeAPIClientBurst | int | `200` | Burst value over kubeAPIClientQPS |
+| identity.kubeAPIClientQPS | int | `100` | Maximum QPS sent to the kube-apiserver before throttling. See https://github.com/kubernetes/client-go/blob/v12.0.0/util/flowcontrol/throttle.go |
 | identity.serviceAccountTokenProjection | bool | `true` | Use [Service Account token Volume projection](https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/#service-account-token-volume-projection) for pod validation instead of the default token |
 | identityTrustAnchorsPEM | string | `""` | Trust root certificate (ECDSA). It must be provided during install. |
 | identityTrustDomain | string | clusterDomain | Trust domain used for identity |

diff --git a/charts/linkerd-control-plane/templates/identity.yaml b/charts/linkerd-control-plane/templates/identity.yaml
@@ -159,6 +159,8 @@ spec:
         - -identity-clock-skew-allowance={{.Values.identity.issuer.clockSkewAllowance}}
         - -identity-scheme={{.Values.identity.issuer.scheme}}
         - -enable-pprof={{.Values.enablePprof | default false}}
+        - -kube-apiclient-qps={{.Values.identity.kubeAPIClientQPS}}
+        - -kube-apiclient-burst={{.Values.identity.kubeAPIClientBurst}}
         {{- include "partials.linkerd.trace" . | nindent 8 -}}
         env:
         - name: LINKERD_DISABLED

diff --git a/charts/linkerd-control-plane/values.yaml b/charts/linkerd-control-plane/values.yaml
@@ -326,6 +326,14 @@ identity:
 
   # -- Use [Service Account token Volume projection](https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/#service-account-token-volume-projection) for pod validation instead of the default token
   serviceAccountTokenProjection: true
+
+  # -- Maximum QPS sent to the kube-apiserver before throttling.
+  # See https://github.com/kubernetes/client-go/blob/v12.0.0/util/flowcontrol/throttle.go
+  kubeAPIClientQPS: 100
+
+  # -- Burst value over kubeAPIClientQPS
+  kubeAPIClientBurst: 200
+
   issuer:
     scheme: linkerd.io/tls
 

diff --git a/cli/cmd/testdata/install_controlplane_tracing_output.golden b/cli/cmd/testdata/install_controlplane_tracing_output.golden
diff --git a/cli/cmd/testdata/install_custom_domain.golden b/cli/cmd/testdata/install_custom_domain.golden
diff --git a/cli/cmd/testdata/install_custom_registry.golden b/cli/cmd/testdata/install_custom_registry.golden
diff --git a/cli/cmd/testdata/install_default.golden b/cli/cmd/testdata/install_default.golden
diff --git a/cli/cmd/testdata/install_default_override_dst_get_nets.golden b/cli/cmd/testdata/install_default_override_dst_get_nets.golden
diff --git a/cli/cmd/testdata/install_default_token.golden b/cli/cmd/testdata/install_default_token.golden
diff --git a/cli/cmd/testdata/install_ha_output.golden b/cli/cmd/testdata/install_ha_output.golden
diff --git a/cli/cmd/testdata/install_ha_with_overrides_output.golden b/cli/cmd/testdata/install_ha_with_overrides_output.golden
diff --git a/cli/cmd/testdata/install_heartbeat_disabled_output.golden b/cli/cmd/testdata/install_heartbeat_disabled_output.golden
diff --git a/cli/cmd/testdata/install_helm_control_plane_output.golden b/cli/cmd/testdata/install_helm_control_plane_output.golden
diff --git a/cli/cmd/testdata/install_helm_control_plane_output_ha.golden b/cli/cmd/testdata/install_helm_control_plane_output_ha.golden
diff --git a/cli/cmd/testdata/install_helm_output_ha_labels.golden b/cli/cmd/testdata/install_helm_output_ha_labels.golden
diff --git a/cli/cmd/testdata/install_helm_output_ha_namespace_selector.golden b/cli/cmd/testdata/install_helm_output_ha_namespace_selector.golden
diff --git a/cli/cmd/testdata/install_no_init_container.golden b/cli/cmd/testdata/install_no_init_container.golden
diff --git a/cli/cmd/testdata/install_output.golden b/cli/cmd/testdata/install_output.golden
diff --git a/cli/cmd/testdata/install_proxy_ignores.golden b/cli/cmd/testdata/install_proxy_ignores.golden
diff --git a/cli/cmd/testdata/install_values_file.golden b/cli/cmd/testdata/install_values_file.golden
diff --git a/controller/cmd/identity/main.go b/controller/cmd/identity/main.go
@@ -40,6 +40,8 @@ func Main(args []string) {
 	identityIssuanceLifeTime := cmd.String("identity-issuance-lifetime", "", "the amount of time for which the Identity issuer should certify identity")
 	identityClockSkewAllowance := cmd.String("identity-clock-skew-allowance", "", "the amount of time to allow for clock skew within a Linkerd cluster")
 	enablePprof := cmd.Bool("enable-pprof", false, "Enable pprof endpoints on the admin server")
+	qps := cmd.Float64("kube-apiclient-qps", 100, "Maximum QPS sent to the kube-apiserver before throttling")
+	burst := cmd.Int("kube-apiclient-burst", 200, "Burst value over kube-apiclient-qps")
 
 	issuerPath := cmd.String("issuer",
 		"/var/run/linkerd/identity/issuer",
@@ -137,10 +139,16 @@ func Main(args []string) {
 	//
 	// Create k8s API
 	//
-	k8sAPI, err := k8s.NewAPI(*kubeConfigPath, "", "", []string{}, 0)
+	config, err := k8s.GetConfig(*kubeConfigPath, "")
+	if err != nil {
+		log.Fatalf("Error configuring Kubernetes API client: %s", err)
+	}
+	k8sAPI, err := k8s.NewAPIForConfig(config, "", []string{}, 0, float32(*qps), *burst)
 	if err != nil {
 		log.Fatalf("Failed to load kubeconfig: %s: %s", *kubeConfigPath, err)
 	}
+	log.Infof("Using k8s client with QPS=%.2f Burst=%d", config.QPS, config.Burst)
+
 	v, err := idctl.NewK8sTokenValidator(ctx, k8sAPI, dom)
 	if err != nil {
 		log.Fatalf("Failed to initialize identity service: %s", err)

diff --git a/controller/k8s/api.go b/controller/k8s/api.go
@@ -79,7 +79,7 @@ func InitializeAPI(ctx context.Context, kubeConfig string, ensureClusterWideAcce
 		return nil, err
 	}
 
-	k8sClient, err := k8s.NewAPIForConfig(config, "", []string{}, 0)
+	k8sClient, err := k8s.NewAPIForConfig(config, "", []string{}, 0, 0, 0)
 	if err != nil {
 		return nil, err
 	}
@@ -89,7 +89,7 @@ func InitializeAPI(ctx context.Context, kubeConfig string, ensureClusterWideAcce
 
 // InitializeAPIForConfig creates Kubernetes clients and returns an initialized API wrapper.
 func InitializeAPIForConfig(ctx context.Context, kubeConfig *rest.Config, ensureClusterWideAccess bool, cluster string, resources ...APIResource) (*API, error) {
-	k8sClient, err := k8s.NewAPIForConfig(kubeConfig, "", []string{}, 0)
+	k8sClient, err := k8s.NewAPIForConfig(kubeConfig, "", []string{}, 0, 0, 0)
 	if err != nil {
 		return nil, err
 	}

diff --git a/controller/webhook/launcher.go b/controller/webhook/launcher.go
@@ -44,7 +44,7 @@ func Launch(
 		log.Fatalf("error building Kubernetes API config: %s", err)
 	}
 
-	k8sAPI, err := pkgk8s.NewAPIForConfig(config, "", []string{}, 0)
+	k8sAPI, err := pkgk8s.NewAPIForConfig(config, "", []string{}, 0, 0, 0)
 	if err != nil {
 		//nolint:gocritic
 		log.Fatalf("error configuring Kubernetes API client: %s", err)