From 283da96eeaabdcdef78e9121c93742a1b56e6bda Mon Sep 17 00:00:00 2001 From: Benjamin Schimke Date: Mon, 27 Jan 2025 14:52:30 +0100 Subject: [PATCH] Retry Temporary API Failures in Microcluster (#992) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When bootstrapping or starting a cluster, we wait for the k8sd server to be fully ready before interacting with it. However, there are edge cases—such as during a snap refresh—where the snap attempts to interact with the CLI (e.g., to configure snap settings) while the database is still initializing. In these scenarios, immediate failure is unnecessary. The k8sd client now retries such requests, ensuring smoother operation. This behavior applies only to specific edge cases where it is known that the microcluster database will eventually become available. --- src/k8s/pkg/client/k8sd/query.go | 42 ++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/src/k8s/pkg/client/k8sd/query.go b/src/k8s/pkg/client/k8sd/query.go index ade4f5877..1b65d3db4 100644 --- a/src/k8s/pkg/client/k8sd/query.go +++ b/src/k8s/pkg/client/k8sd/query.go @@ -6,14 +6,46 @@ import ( "strings" apiv1 "github.com/canonical/k8s-snap-api/api/v1" + "github.com/canonical/k8s/pkg/log" + "github.com/canonical/k8s/pkg/utils/control" "github.com/canonical/lxd/shared/api" + "github.com/canonical/microcluster/v2/rest/types" ) -// query is a helper method to wrap common error checking and response handling. -func query[T any](ctx context.Context, c *k8sd, method string, path string, in any, out *T) (T, error) { - if err := c.client.Query(ctx, method, apiv1.K8sdAPIVersion, api.NewURL().Path(strings.Split(path, "/")...), in, out); err != nil { - var zero T - return zero, fmt.Errorf("failed to %s /%s: %w", method, path, err) +// query is a helper method for sending requests to the k8sd client with common error checking and automatic retries. 
+// It retries on temporary microcluster errors (see isTemporary), fails immediately on any other error, and returns the deserialized response stored in out, which must be a non-nil pointer.
+func query[T any](ctx context.Context, c *k8sd, method, path string, in any, out *T) (T, error) {
+	var result T // zero value of T, returned alongside every error
+	if out == nil {
+		return result, fmt.Errorf("out must be a non-nil pointer")
 	}
+
+	retryErr := control.WaitUntilReady(ctx, func() (bool, error) {
+		err := c.client.Query(ctx, method, apiv1.K8sdAPIVersion, api.NewURL().Path(strings.Split(path, "/")...), in, out)
+		if err != nil {
+			if isTemporary(err) {
+				log.FromContext(ctx).Info("Temporary error from k8sd", "error", err) // NOTE(review): assumes a logr-style logger (message + key/value pairs, no printf verbs) - confirm against pkg/log
+				return false, nil // not ready and no error: presumably lets WaitUntilReady attempt the request again
+			}
+			return false, fmt.Errorf("failed to %s /%s: %w", method, path, err)
+		}
+		return true, nil
+	})
+
+	if retryErr != nil {
+		return result, fmt.Errorf("failed after potential retry: %w", retryErr)
+	}
 	return *out, nil
 }
+
+// isTemporary reports whether err is a known transient microcluster error that should be retried; err must be non-nil.
+// It matches on the error text (types.DatabaseStarting or "Database is not yet ready"), so it is tightly coupled with
+// the error messages returned by microcluster and should not contain any generic error checks.
+func isTemporary(err error) bool {
+	msg := err.Error() // evaluate the message once and test it against each known transient marker
+	starting := strings.Contains(msg, string(types.DatabaseStarting))
+	notReady := strings.Contains(msg, "Database is not yet ready")
+
+	return starting || notReady
+}