From ec5bc4a20c8517d1b585629f5832dd742289a819 Mon Sep 17 00:00:00 2001 From: Homayoon Alimohammadi Date: Fri, 30 Aug 2024 13:49:05 +0400 Subject: [PATCH] Pass independent context to DeleteClusterMember (#638) Fix node removal in CAPI --- src/k8s/pkg/k8sd/api/cluster_remove.go | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/k8s/pkg/k8sd/api/cluster_remove.go b/src/k8s/pkg/k8sd/api/cluster_remove.go index 1b9f11287..6cbc22e62 100644 --- a/src/k8s/pkg/k8sd/api/cluster_remove.go +++ b/src/k8s/pkg/k8sd/api/cluster_remove.go @@ -5,6 +5,7 @@ import ( "database/sql" "fmt" "net/http" + "time" apiv1 "github.com/canonical/k8s-snap-api/api/v1" databaseutil "github.com/canonical/k8s/pkg/k8sd/database/util" @@ -66,7 +67,15 @@ func (e *Endpoints) postClusterRemove(s state.State, r *http.Request) response.R if err != nil { return response.InternalError(fmt.Errorf("failed to create client to cluster leader: %w", err)) } - if err := c.DeleteClusterMember(ctx, req.Name, req.Force); err != nil { + + // NOTE(hue): The node removal process in CAPI might fail: we found that the context passed to + // `DeleteClusterMember` is somehow getting canceled but couldn't figure out why or by which component. + // The cancellation happens after the `RunPreRemoveHook` call and before the `DeleteCoreClusterMember` call + // in the `clusterMemberDelete` endpoint of microcluster. This is a workaround to avoid the cancellation. + // Keep in mind that this failure is flaky and might not happen in every run. + deleteCtx, deleteCancel := context.WithTimeout(context.Background(), 2*time.Minute) + defer deleteCancel() + if err := c.DeleteClusterMember(deleteCtx, req.Name, req.Force); err != nil { return response.InternalError(fmt.Errorf("failed to delete cluster member %s: %w", req.Name, err)) }