Skip to content

Commit

Permalink
Wait for limited db retries to prevent hang and show correct error on…
Browse files Browse the repository at this point in the history
… PreRemove hook (#607)
  • Loading branch information
HomayoonAlimohammadi authored Aug 26, 2024
1 parent 96634b2 commit 365a4ed
Showing 1 changed file with 17 additions and 2 deletions.
19 changes: 17 additions & 2 deletions src/k8s/pkg/k8sd/app/hooks_remove.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,13 @@ func (a *App) onPreRemove(ctx context.Context, s state.State, force bool) (rerr
log.Info("Running preremove hook")

log.Info("Waiting for node to finish microcluster join before removing")
control.WaitUntilReady(ctx, func() (bool, error) {
// NOTE (hue): in microcluster v2, the PreRemove hook is also called if something goes wrong during
// `bootstrap` or `join-cluster`. It is possible that we get stuck in this loop forever, which causes
// the `bootstrap` and `join-cluster` commands to hang and finally return an uninformative `context deadline exceeded` error.
// To avoid that, we optimistically stop waiting after a fixed number of failed database transactions.
maxRetries := 10
var txnRetries int
if err := control.WaitUntilReady(ctx, func() (bool, error) {
var notPending bool
if err := s.Database().Transaction(ctx, func(ctx context.Context, tx *sql.Tx) error {
member, err := cluster.GetCoreClusterMember(ctx, tx, s.Name())
Expand All @@ -40,9 +46,18 @@ func (a *App) onPreRemove(ctx context.Context, s state.State, force bool) (rerr
return nil
}); err != nil {
log.Error(err, "Failed database transaction to check cluster member role")
txnRetries++
}

if txnRetries >= maxRetries {
log.Info("Reached maximum number of retries for database transactions on pre-remove hook, continuing cleanup", "max_retries", maxRetries)
return true, nil
}

return notPending, nil
})
}); err != nil {
log.Error(err, "Failed to wait for node to finish microcluster join before removing. Continuing with the cleanup...")
}

if cfg, err := databaseutil.GetClusterConfig(ctx, s); err == nil {
if _, ok := cfg.Annotations[apiv1.AnnotationSkipCleanupKubernetesNodeOnRemove]; !ok {
Expand Down

0 comments on commit 365a4ed

Please sign in to comment.