Skip to content

Commit

Permalink
Merge pull request #8626 from Lyndon-Li/repo-maintainance-for-windows-2
Browse files Browse the repository at this point in the history
Repo maintenance for windows
  • Loading branch information
Lyndon-Li authored Jan 21, 2025
2 parents 5b1738a + 0a4b05c commit a9031eb
Show file tree
Hide file tree
Showing 7 changed files with 340 additions and 150 deletions.
1 change: 1 addition & 0 deletions changelogs/unreleased/8626-Lyndon-Li
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix issue #8419: support running the repo maintenance job on Windows nodes
49 changes: 28 additions & 21 deletions pkg/cmd/cli/repomantenance/maintenance.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
"github.com/vmware-tanzu/velero/pkg/util/logging"

repokey "github.com/vmware-tanzu/velero/pkg/repository/keys"
"github.com/vmware-tanzu/velero/pkg/repository/maintenance"
repomanager "github.com/vmware-tanzu/velero/pkg/repository/manager"
)

Expand Down Expand Up @@ -78,17 +79,7 @@ func (o *Options) Run(f velerocli.Factory) {
}()

if pruneError != nil {
logger.WithError(pruneError).Error("An error occurred when running repo prune")
terminationLogFile, err := os.Create("/dev/termination-log")
if err != nil {
logger.WithError(err).Error("Failed to create termination log file")
return
}
defer terminationLogFile.Close()

if _, errWrite := terminationLogFile.WriteString(fmt.Sprintf("An error occurred: %v", err)); errWrite != nil {
logger.WithError(errWrite).Error("Failed to write error to termination log file")
}
os.Stdout.WriteString(fmt.Sprintf("%s%v", maintenance.TerminationLogIndicator, pruneError))
}
}

Expand Down Expand Up @@ -163,22 +154,38 @@ func (o *Options) runRepoPrune(f velerocli.Factory, namespace string, logger log
return err
}

manager, err := initRepoManager(namespace, cli, kubeClient, logger)
if err != nil {
return err
var repo *velerov1api.BackupRepository
retry := 10
for {
repo, err = repository.GetBackupRepository(context.Background(), cli, namespace,
repository.BackupRepositoryKey{
VolumeNamespace: o.RepoName,
BackupLocation: o.BackupStorageLocation,
RepositoryType: o.RepoType,
}, true)
if err == nil {
break
}

retry--
if retry == 0 {
break
}

logger.WithError(err).Warn("Failed to retrieve backup repo, need retry")

time.Sleep(time.Second)
}

// backupRepository
repo, err := repository.GetBackupRepository(context.Background(), cli, namespace,
repository.BackupRepositoryKey{
VolumeNamespace: o.RepoName,
BackupLocation: o.BackupStorageLocation,
RepositoryType: o.RepoType,
}, true)
if err != nil {
return errors.Wrap(err, "failed to get backup repository")
}

manager, err := initRepoManager(namespace, cli, kubeClient, logger)
if err != nil {
return err
}

err = manager.PruneRepo(repo)
if err != nil {
return errors.Wrap(err, "failed to prune repo")
Expand Down
36 changes: 20 additions & 16 deletions pkg/controller/backup_repository_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,10 +131,15 @@ func waitMaintenanceJobCompleteFail(client.Client, context.Context, string, stri
}

func waitMaintenanceJobCompleteFunc(now time.Time, result velerov1api.BackupRepositoryMaintenanceResult, message string) func(client.Client, context.Context, string, string, logrus.FieldLogger) (velerov1api.BackupRepositoryMaintenanceStatus, error) {
completionTimeStamp := &metav1.Time{Time: now.Add(time.Hour)}
if result == velerov1api.BackupRepositoryMaintenanceFailed {
completionTimeStamp = nil
}

return func(client.Client, context.Context, string, string, logrus.FieldLogger) (velerov1api.BackupRepositoryMaintenanceStatus, error) {
return velerov1api.BackupRepositoryMaintenanceStatus{
StartTimestamp: &metav1.Time{Time: now},
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
CompleteTimestamp: completionTimeStamp,
Result: result,
Message: message,
}, nil
Expand Down Expand Up @@ -316,10 +321,9 @@ func TestRunMaintenanceIfDue(t *testing.T) {
Result: velerov1api.BackupRepositoryMaintenanceSucceeded,
},
{
StartTimestamp: &metav1.Time{Time: now},
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
Result: velerov1api.BackupRepositoryMaintenanceFailed,
Message: "fake-maintenance-message",
StartTimestamp: &metav1.Time{Time: now},
Result: velerov1api.BackupRepositoryMaintenanceFailed,
Message: "fake-maintenance-message",
},
},
},
Expand Down Expand Up @@ -893,7 +897,7 @@ func TestUpdateRepoMaintenanceHistory(t *testing.T) {
{
name: "full history",
backupRepo: backupRepoWithFullHistory,
result: velerov1api.BackupRepositoryMaintenanceFailed,
result: velerov1api.BackupRepositoryMaintenanceSucceeded,
expectedHistory: []velerov1api.BackupRepositoryMaintenanceStatus{
{
StartTimestamp: &metav1.Time{Time: standardTime.Add(-time.Hour * 22)},
Expand All @@ -915,7 +919,7 @@ func TestUpdateRepoMaintenanceHistory(t *testing.T) {
{
name: "over full history",
backupRepo: backupRepoWithOverFullHistory,
result: velerov1api.BackupRepositoryMaintenanceFailed,
result: velerov1api.BackupRepositoryMaintenanceSucceeded,
expectedHistory: []velerov1api.BackupRepositoryMaintenanceStatus{
{
StartTimestamp: &metav1.Time{Time: standardTime.Add(-time.Hour * 20)},
Expand Down Expand Up @@ -1127,7 +1131,7 @@ func TestConsolidateHistory(t *testing.T) {
{
StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)},
Result: velerov1api.BackupRepositoryMaintenanceFailed,
Result: velerov1api.BackupRepositoryMaintenanceSucceeded,
Message: "fake-maintenance-message-2",
},
},
Expand All @@ -1149,7 +1153,7 @@ func TestConsolidateHistory(t *testing.T) {
{
StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)},
Result: velerov1api.BackupRepositoryMaintenanceFailed,
Result: velerov1api.BackupRepositoryMaintenanceSucceeded,
Message: "fake-maintenance-message-2",
},
{
Expand All @@ -1172,7 +1176,7 @@ func TestConsolidateHistory(t *testing.T) {
{
StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)},
Result: velerov1api.BackupRepositoryMaintenanceFailed,
Result: velerov1api.BackupRepositoryMaintenanceSucceeded,
Message: "fake-maintenance-message-2",
},
},
Expand All @@ -1194,7 +1198,7 @@ func TestConsolidateHistory(t *testing.T) {
{
StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)},
Result: velerov1api.BackupRepositoryMaintenanceFailed,
Result: velerov1api.BackupRepositoryMaintenanceSucceeded,
Message: "fake-maintenance-message-2",
},
{
Expand Down Expand Up @@ -1223,7 +1227,7 @@ func TestConsolidateHistory(t *testing.T) {
{
StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)},
Result: velerov1api.BackupRepositoryMaintenanceFailed,
Result: velerov1api.BackupRepositoryMaintenanceSucceeded,
Message: "fake-maintenance-message-2",
},
{
Expand All @@ -1237,7 +1241,7 @@ func TestConsolidateHistory(t *testing.T) {
{
StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)},
Result: velerov1api.BackupRepositoryMaintenanceFailed,
Result: velerov1api.BackupRepositoryMaintenanceSucceeded,
Message: "fake-maintenance-message-2",
},
{
Expand All @@ -1257,7 +1261,7 @@ func TestConsolidateHistory(t *testing.T) {
{
StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)},
Result: velerov1api.BackupRepositoryMaintenanceFailed,
Result: velerov1api.BackupRepositoryMaintenanceSucceeded,
Message: "fake-maintenance-message-2",
},
{
Expand Down Expand Up @@ -1339,13 +1343,13 @@ func TestGetLastMaintenanceTimeFromHistory(t *testing.T) {
history: []velerov1api.BackupRepositoryMaintenanceStatus{
{
StartTimestamp: &metav1.Time{Time: now},
Result: velerov1api.BackupRepositoryMaintenanceSucceeded,
Result: velerov1api.BackupRepositoryMaintenanceFailed,
Message: "fake-maintenance-message",
},
{
StartTimestamp: &metav1.Time{Time: now.Add(time.Hour)},
CompleteTimestamp: &metav1.Time{Time: now.Add(time.Hour * 2)},
Result: velerov1api.BackupRepositoryMaintenanceFailed,
Result: velerov1api.BackupRepositoryMaintenanceSucceeded,
Message: "fake-maintenance-message-2",
},
{
Expand Down
83 changes: 50 additions & 33 deletions pkg/repository/maintenance/maintenance.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"fmt"
"math"
"sort"
"strings"
"time"

"github.com/pkg/errors"
Expand All @@ -35,6 +36,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"

velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
"github.com/vmware-tanzu/velero/pkg/util"
"github.com/vmware-tanzu/velero/pkg/util/kube"

appsv1 "k8s.io/api/apps/v1"
Expand All @@ -47,6 +49,7 @@ import (
const (
// RepositoryNameLabel is the label key set on the maintenance job's pod
// template; its value is the name of the backup repository.
RepositoryNameLabel = "velero.io/repo-name"
// GlobalKeyForRepoMaintenanceJobCM is presumably the key of the global entry
// in the repo maintenance job ConfigMap; its use is not visible in this
// section. TODO confirm against the job config lookup.
GlobalKeyForRepoMaintenanceJobCM = "global"
// TerminationLogIndicator is the prefix the repo maintenance command writes
// to stdout ahead of the prune error. getResultFromJob searches the
// container's termination message for this prefix and returns the text that
// follows it.
TerminationLogIndicator = "Repo maintenance error: "
)

type JobConfigs struct {
Expand Down Expand Up @@ -147,24 +150,37 @@ func getResultFromJob(cli client.Client, job *batchv1.Job) (string, error) {
}

if len(podList.Items) == 0 {
return "", fmt.Errorf("no pod found for job %s", job.Name)
return "", errors.Errorf("no pod found for job %s", job.Name)
}

// we only have one maintenance pod for the job
pod := podList.Items[0]

statuses := pod.Status.ContainerStatuses
if len(statuses) == 0 {
return "", fmt.Errorf("no container statuses found for job %s", job.Name)
return "", errors.Errorf("no container statuses found for job %s", job.Name)
}

// we only have one maintenance container
terminated := statuses[0].State.Terminated
if terminated == nil {
return "", fmt.Errorf("container for job %s is not terminated", job.Name)
return "", errors.Errorf("container for job %s is not terminated", job.Name)
}

return terminated.Message, nil
if terminated.Message == "" {
return "", nil
}

idx := strings.Index(terminated.Message, TerminationLogIndicator)
if idx == -1 {
return "", errors.New("error to locate repo maintenance error indicator from termination message")
}

if idx+len(TerminationLogIndicator) >= len(terminated.Message) {
return "", errors.New("nothing after repo maintenance error indicator in termination message")
}

return terminated.Message[idx+len(TerminationLogIndicator):], nil
}

// getJobConfig is called to get the Maintenance Job Config for the
Expand Down Expand Up @@ -331,7 +347,7 @@ func WaitAllJobsComplete(ctx context.Context, cli client.Client, repo *velerov1a
if job.Status.Failed > 0 {
if msg, err := getResultFromJob(cli, job); err != nil {
log.WithError(err).Warnf("Failed to get result of maintenance job %s", job.Name)
message = "Repo maintenance failed but result is not retrieveable"
message = fmt.Sprintf("Repo maintenance failed but result is not retrieveable, err: %v", err)
} else {
message = msg
}
Expand Down Expand Up @@ -434,6 +450,16 @@ func buildJob(cli client.Client, ctx context.Context, repo *velerov1api.BackupRe
return nil, errors.Wrap(err, "failed to parse resource requirements for maintenance job")
}

podLabels := map[string]string{
RepositoryNameLabel: repo.Name,
}

for _, k := range util.ThirdPartyLabels {
if v := veleroutil.GetVeleroServerLabelValue(deployment, k); v != "" {
podLabels[k] = v
}
}

// Set arguments
args := []string{"repo-maintenance"}
args = append(args, fmt.Sprintf("--repo-name=%s", repo.Spec.VolumeNamespace))
Expand All @@ -455,10 +481,8 @@ func buildJob(cli client.Client, ctx context.Context, repo *velerov1api.BackupRe
BackoffLimit: new(int32), // Never retry
Template: v1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Name: "velero-repo-maintenance-pod",
Labels: map[string]string{
RepositoryNameLabel: repo.Name,
},
Name: "velero-repo-maintenance-pod",
Labels: podLabels,
},
Spec: v1.PodSpec{
Containers: []v1.Container{
Expand All @@ -468,17 +492,26 @@ func buildJob(cli client.Client, ctx context.Context, repo *velerov1api.BackupRe
Command: []string{
"/velero",
},
Args: args,
ImagePullPolicy: v1.PullIfNotPresent,
Env: envVars,
EnvFrom: envFromSources,
VolumeMounts: volumeMounts,
Resources: resources,
Args: args,
ImagePullPolicy: v1.PullIfNotPresent,
Env: envVars,
EnvFrom: envFromSources,
VolumeMounts: volumeMounts,
Resources: resources,
TerminationMessagePolicy: v1.TerminationMessageFallbackToLogsOnError,
},
},
RestartPolicy: v1.RestartPolicyNever,
Volumes: volumes,
ServiceAccountName: serviceAccount,
Tolerations: []v1.Toleration{
{
Key: "os",
Operator: "Equal",
Effect: "NoSchedule",
Value: "windows",
},
},
},
},
},
Expand All @@ -489,22 +522,6 @@ func buildJob(cli client.Client, ctx context.Context, repo *velerov1api.BackupRe
job.Spec.Template.Spec.Affinity = affinity
}

if tolerations := veleroutil.GetTolerationsFromVeleroServer(deployment); tolerations != nil {
job.Spec.Template.Spec.Tolerations = tolerations
}

if nodeSelector := veleroutil.GetNodeSelectorFromVeleroServer(deployment); nodeSelector != nil {
job.Spec.Template.Spec.NodeSelector = nodeSelector
}

if labels := veleroutil.GetVeleroServerLables(deployment); len(labels) > 0 {
job.Spec.Template.Labels = labels
}

if annotations := veleroutil.GetVeleroServerAnnotations(deployment); len(annotations) > 0 {
job.Spec.Template.Annotations = annotations
}

return job, nil
}

Expand All @@ -516,8 +533,8 @@ func composeStatusFromJob(job *batchv1.Job, message string) velerov1api.BackupRe

return velerov1api.BackupRepositoryMaintenanceStatus{
Result: result,
StartTimestamp: &metav1.Time{Time: job.CreationTimestamp.Time},
CompleteTimestamp: &metav1.Time{Time: job.Status.CompletionTime.Time},
StartTimestamp: &job.CreationTimestamp,
CompleteTimestamp: job.Status.CompletionTime,
Message: message,
}
}
Loading

0 comments on commit a9031eb

Please sign in to comment.