Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Print Hive installer logs if failure occurs during e2e #4008

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pipelines/e2e.yml
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ jobs:
export CI=true
. ./hack/e2e/run-rp-and-e2e.sh

delete_e2e_cluster
#delete_e2e_cluster
kill_rp
kill_mimo_actuator
kill_selenium
Expand Down
4 changes: 2 additions & 2 deletions pkg/util/steps/condition.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ import (
// Efforts are being made to not have generic Hive errors but specific, actionable failure cases.
// Instead of providing Hive-specific error messages to customers, the below will send a timeout error message.
// The below functions are run during Install, Update, AdminUpdate.
var timeoutConditionErrors = map[string]string{
var TimeoutConditionErrors = map[string]string{
"attachNSGs": "Failed to attach the ARO NSG to the cluster subnets.",
"apiServersReady": "Kube API has not initialised successfully and is unavailable.",
"minimumWorkerNodesReady": "Minimum number of worker nodes have not been successfully created.",
Expand Down Expand Up @@ -110,7 +110,7 @@ func enrichConditionTimeoutError(f conditionFunction, originalErr error) error {
funcNameParts := strings.Split(FriendlyName(f), ".")
funcName := funcNameParts[len(funcNameParts)-1]

message, exists := timeoutConditionErrors[funcName]
message, exists := TimeoutConditionErrors[funcName]
if !exists {
return originalErr
}
Expand Down
119 changes: 115 additions & 4 deletions test/e2e/setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
"os/exec"
"path/filepath"
"regexp"
"strings"
"time"

. "github.com/onsi/ginkgo/v2"
Expand All @@ -29,6 +30,7 @@
projectclient "github.com/openshift/client-go/project/clientset/versioned"
routeclient "github.com/openshift/client-go/route/clientset/versioned"
securityclient "github.com/openshift/client-go/security/clientset/versioned"
hivev1 "github.com/openshift/hive/apis/hive/v1"
mcoclient "github.com/openshift/machine-config-operator/pkg/generated/clientset/versioned"
monitoringclient "github.com/prometheus-operator/prometheus-operator/pkg/client/versioned"
"github.com/sirupsen/logrus"
Expand All @@ -40,6 +42,7 @@
"k8s.io/client-go/tools/clientcmd/api/latest"
"sigs.k8s.io/controller-runtime/pkg/client"

internalapi "github.com/Azure/ARO-RP/pkg/api"
"github.com/Azure/ARO-RP/pkg/api/admin"
"github.com/Azure/ARO-RP/pkg/env"
"github.com/Azure/ARO-RP/pkg/hive"
Expand All @@ -54,6 +57,7 @@
"github.com/Azure/ARO-RP/pkg/util/cluster"
msgraph_errors "github.com/Azure/ARO-RP/pkg/util/graph/graphsdk/models/odataerrors"
utillog "github.com/Azure/ARO-RP/pkg/util/log"
"github.com/Azure/ARO-RP/pkg/util/steps"
"github.com/Azure/ARO-RP/pkg/util/uuid"
"github.com/Azure/ARO-RP/pkg/util/version"
"github.com/Azure/ARO-RP/test/util/dynamic"
Expand All @@ -73,12 +77,14 @@

var (
disallowedInFilenameRegex = regexp.MustCompile(`[<>:"/\\|?*\x00-\x1F]`)
clusterProvisionPodRegex = regexp.MustCompile(`cluster.*provision`)

Check failure on line 80 in test/e2e/setup.go

View workflow job for this annotation

GitHub Actions / golangci-lint

var `clusterProvisionPodRegex` is unused (unused)
DefaultEventuallyTimeout = 5 * time.Minute
)

type clientSet struct {
Operations redhatopenshift20231122.OperationsClient
OpenshiftClusters redhatopenshift20231122.OpenShiftClustersClient
InternalClient cluster.InternalClient

VirtualMachines compute.VirtualMachinesClient
Resources features.ResourcesClient
Expand Down Expand Up @@ -116,6 +122,7 @@
clusterName string
osClusterVersion string
clusterResourceID string
clusterDoc *internalapi.OpenShiftCluster

Check failure on line 125 in test/e2e/setup.go

View workflow job for this annotation

GitHub Actions / golangci-lint

var `clusterDoc` is unused (unused)
clients *clientSet
)

Expand Down Expand Up @@ -436,6 +443,7 @@
return &clientSet{
Operations: redhatopenshift20231122.NewOperationsClient(_env.Environment(), _env.SubscriptionID(), authorizer),
OpenshiftClusters: redhatopenshift20231122.NewOpenShiftClustersClient(_env.Environment(), _env.SubscriptionID(), authorizer),
InternalClient: cluster.NewInternalClient(log, _env, authorizer),

VirtualMachines: compute.NewVirtualMachinesClient(_env.Environment(), _env.SubscriptionID(), authorizer),
Resources: features.NewResourcesClient(_env.Environment(), _env.SubscriptionID(), authorizer),
Expand Down Expand Up @@ -493,8 +501,8 @@

osClusterVersion = os.Getenv("OS_CLUSTER_VERSION")

if os.Getenv("CI") != "" { // always create cluster in CI
cluster, err := cluster.New(log, _env, os.Getenv("CI") != "")
if os.Getenv("CI") != "" { // always create cluster in CI
utilCluster, err := cluster.New(log, _env, os.Getenv("CI") != "")
if err != nil {
return err
}
Expand All @@ -503,7 +511,58 @@
osClusterVersion = version.DefaultInstallStream.Version.String()
}

err = cluster.Create(ctx, vnetResourceGroup, clusterName, osClusterVersion)
// Hack: initialize the Hive clients before cluster creation so that
// I can get Hive installer logs when cluster creation fails. Will
// neaten this up later once E2E is working.
options := _env.Environment().EnvironmentCredentialOptions()
tokenCredential, err := azidentity.NewEnvironmentCredential(options)
if err != nil {
return err
}

scopes := []string{_env.Environment().ResourceManagerScope}
authorizer := azidext.NewTokenCredentialAdapter(tokenCredential, scopes)

var hiveRestConfig *rest.Config
var hiveClientSet client.Client
var hiveAKS *kubernetes.Clientset
var hiveCM hive.ClusterManager

liveCfg, err := _env.NewLiveConfigManager(ctx)
if err != nil {
return err
}

hiveShard := 1
hiveRestConfig, err = liveCfg.HiveRestConfig(ctx, hiveShard)
if err != nil {
return err
}

hiveClientSet, err = client.New(hiveRestConfig, client.Options{})
if err != nil {
return err
}

hiveAKS, err = kubernetes.NewForConfig(hiveRestConfig)
if err != nil {
return err
}

hiveCM, err = hive.NewFromConfig(log, _env, hiveRestConfig)
if err != nil {
return err
}

clients = &clientSet{
InternalClient: cluster.NewInternalClient(log, _env, authorizer),
HiveRestConfig: hiveRestConfig,
Hive: hiveClientSet,
HiveAKS: hiveAKS,
HiveClusterManager: hiveCM,
}

err = utilCluster.Create(ctx, vnetResourceGroup, clusterName, osClusterVersion)
if err != nil {
return err
}
Expand All @@ -516,12 +575,16 @@
return err
}

return nil
clusterDoc, err = clients.InternalClient.Get(ctx, vnetResourceGroup, clusterName)
return err
}

func done(ctx context.Context) error {
// Temp to avoid deleting cluster
return nil

// terminate early if delete flag is set to false
if os.Getenv("CI") != "" && os.Getenv("E2E_DELETE_CLUSTER") != "false" {

Check failure on line 587 in test/e2e/setup.go

View workflow job for this annotation

GitHub Actions / golangci-lint

unreachable: unreachable code (govet)
cluster, err := cluster.New(log, _env, os.Getenv("CI") != "")
if err != nil {
return err
Expand All @@ -546,6 +609,16 @@
if oDataError, ok := err.(msgraph_errors.ODataErrorable); ok {
spew.Dump(oDataError.GetErrorEscaped())
}

// If Hive installation timed out, print Hive logs
if strings.Contains(err.Error(), steps.TimeoutConditionErrors["hiveClusterDeploymentReady"]) || strings.Contains(err.Error(), steps.TimeoutConditionErrors["hiveClusterInstallationComplete"]) {
log.Warning("Hive installation timed out; attempting to fetch openshift installer logs...")
_err := printHiveInstallerLogs(context.Background())
if _err != nil {
log.Error(_err)
}
}

panic(err)
}
})
Expand All @@ -560,3 +633,41 @@
panic(err)
}
})

// printHiveInstallerLogs dumps the install log recorded on the cluster's Hive
// ClusterProvision object. It looks up the cluster document to find the Hive
// namespace, follows the ClusterDeployment's status.provisionRef to the
// ClusterProvision, and dumps that object's spec.installLog.
//
// It returns an error describing the step that failed when any lookup fails
// or when a required field is empty, and nil once the log has been dumped.
//
// NOTE (carried over from the original author): this currently doesn't work
// because the "InternalClient" gets the cluster doc using an external client
// and then converts it to internal; the HiveProfile won't be populated.
func printHiveInstallerLogs(ctx context.Context) error {
	// Guard against being called before the Hive clients were initialised;
	// without this the first dereference below would panic.
	if clients == nil || clients.Hive == nil {
		return fmt.Errorf("unable to get Hive installer logs because the Hive clients are not initialised")
	}

	// Use a local name that does not shadow the package-level clusterDoc.
	doc, err := clients.InternalClient.Get(ctx, vnetResourceGroup, clusterName)
	if err != nil {
		return fmt.Errorf("getting cluster doc for %q: %w", clusterName, err)
	}
	if doc == nil {
		return fmt.Errorf("unable to get Hive installer logs because the cluster doc is nil")
	}

	namespace := doc.Properties.HiveProfile.Namespace
	if namespace == "" {
		return fmt.Errorf("unable to get Hive installer logs because Hive namespace is empty in cluster doc")
	}

	cd := &hivev1.ClusterDeployment{}
	cdKey := client.ObjectKey{
		Namespace: namespace,
		Name:      hive.ClusterDeploymentName,
	}
	if err := clients.Hive.Get(ctx, cdKey, cd); err != nil {
		return fmt.Errorf("getting ClusterDeployment %s/%s: %w", cdKey.Namespace, cdKey.Name, err)
	}
	if cd.Status.ProvisionRef == nil {
		return fmt.Errorf("unable to get Hive installer logs because the ClusterDeployment object's status.provisionRef is nil")
	}

	cp := &hivev1.ClusterProvision{}
	cpKey := client.ObjectKey{
		Namespace: namespace,
		Name:      cd.Status.ProvisionRef.Name,
	}
	if err := clients.Hive.Get(ctx, cpKey, cp); err != nil {
		return fmt.Errorf("getting ClusterProvision %s/%s: %w", cpKey.Namespace, cpKey.Name, err)
	}
	if cp.Spec.InstallLog == nil {
		return fmt.Errorf("unable to get Hive installer logs because the ClusterProvision object's spec.installLog is nil")
	}

	spew.Dump(*cp.Spec.InstallLog)
	return nil
}
Loading