Skip to content

Commit

Permalink
Don't remove Kubernetes certificates and containerd if skipped (#1002)
Browse files Browse the repository at this point in the history
Currently, if the k8sd/v1alpha/lifecycle/skip-stop-services-on-remove
annotation is set, we're not stopping the Kubernetes-related services,
but we're still removing its certificates and containerd-related paths.
This will end up paralyzing services like kubelet, which might have to
do Pod evictions, blocking it from finishing its job, and resulting CAPI
not being able to complete its downscaling or upgrade operations.

We should remove those certificates only if we're also stopping the
services.
  • Loading branch information
claudiubelu authored Jan 24, 2025
1 parent 9459b1d commit 934a2fb
Showing 1 changed file with 19 additions and 15 deletions.
34 changes: 19 additions & 15 deletions src/k8s/pkg/k8sd/app/hooks_remove.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,27 +115,31 @@ func (a *App) onPreRemove(ctx context.Context, s state.State, force bool) (rerr
}
}

// Perform all cleanup steps regardless of if this is a worker node or control plane.
// Trying to detect the node type is not reliable as the node might have been marked as worker
// or not, depending on which step it failed.
log.Info("Cleaning up worker certificates")
if _, err := setup.EnsureWorkerPKI(snap, &pki.WorkerNodePKI{}); err != nil {
log.Error(err, "failed to cleanup worker certificates")
}

log.Info("Removing worker node mark")
if err := snaputil.MarkAsWorkerNode(snap, false); err != nil {
if !errors.Is(err, os.ErrNotExist) {
log.Error(err, "failed to unmark node as worker")
}
}

log.Info("Cleaning up control plane certificates")
if _, err := setup.EnsureControlPlanePKI(snap, &pki.ControlPlanePKI{}); err != nil {
log.Error(err, "failed to cleanup control plane certificates")
}

// NOTE(claudiub): We should only remove the certificates only if we're stopping the Kubernetes
// services as well. Removing them without stopping the services will result in the services
// being paralyzed and unable to continue their function, including potential Pod evictions
// started by CAPI.
if _, ok := cfg.Annotations.Get(apiv1_annotations.AnnotationSkipStopServicesOnRemove); !ok {
// Perform all cleanup steps regardless of if this is a worker node or control plane.
// Trying to detect the node type is not reliable as the node might have been marked as worker
// or not, depending on which step it failed.
log.Info("Cleaning up worker certificates")
if _, err := setup.EnsureWorkerPKI(snap, &pki.WorkerNodePKI{}); err != nil {
log.Error(err, "failed to cleanup worker certificates")
}

log.Info("Cleaning up control plane certificates")
if _, err := setup.EnsureControlPlanePKI(snap, &pki.ControlPlanePKI{}); err != nil {
log.Error(err, "failed to cleanup control plane certificates")
}

log.Info("Stopping worker services")
if err := snaputil.StopWorkerServices(ctx, snap); err != nil {
log.Error(err, "Failed to stop worker services")
Expand All @@ -150,9 +154,9 @@ func (a *App) onPreRemove(ctx context.Context, s state.State, force bool) (rerr
if err := snaputil.StopK8sDqliteServices(ctx, snap); err != nil {
log.Error(err, "Failed to stop k8s-dqlite service")
}
}

tryCleanupContainerdPaths(log, snap)
tryCleanupContainerdPaths(log, snap)
}

return nil
}
Expand Down

0 comments on commit 934a2fb

Please sign in to comment.