Skip to content

Commit

Permalink
If Hive installation times out, print openshift installer logs
Browse files Browse the repository at this point in the history
  • Loading branch information
kimorris27 committed Dec 13, 2024
1 parent f2bd147 commit fbe3a13
Showing 1 changed file with 135 additions and 1 deletion.
136 changes: 135 additions & 1 deletion test/e2e/setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,15 @@ import (
"context"
"embed"
"fmt"
"io"
"math"
"net/http"
"net/url"
"os"
"os/exec"
"path/filepath"
"regexp"
"strings"
"time"

. "github.com/onsi/ginkgo/v2"
Expand All @@ -33,13 +35,17 @@ import (
monitoringclient "github.com/prometheus-operator/prometheus-operator/pkg/client/versioned"
"github.com/sirupsen/logrus"
"github.com/tebeka/selenium"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/clientcmd"
"k8s.io/client-go/tools/clientcmd/api"
"k8s.io/client-go/tools/clientcmd/api/latest"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"

internalapi "github.com/Azure/ARO-RP/pkg/api"
"github.com/Azure/ARO-RP/pkg/api/admin"
"github.com/Azure/ARO-RP/pkg/env"
"github.com/Azure/ARO-RP/pkg/hive"
Expand All @@ -54,6 +60,7 @@ import (
"github.com/Azure/ARO-RP/pkg/util/cluster"
msgraph_errors "github.com/Azure/ARO-RP/pkg/util/graph/graphsdk/models/odataerrors"
utillog "github.com/Azure/ARO-RP/pkg/util/log"
"github.com/Azure/ARO-RP/pkg/util/steps"
"github.com/Azure/ARO-RP/pkg/util/uuid"
"github.com/Azure/ARO-RP/pkg/util/version"
"github.com/Azure/ARO-RP/test/util/dynamic"
Expand All @@ -73,12 +80,14 @@ var staticResources embed.FS

var (
	// DefaultEventuallyTimeout is a five-minute default timeout.
	// NOTE(review): presumably the default for Gomega Eventually polling —
	// confirm against its call sites.
	DefaultEventuallyTimeout = 5 * time.Minute

	// clusterProvisionPodRegex matches any name containing "cluster" followed
	// (anywhere later in the name) by "provision". It is unanchored, so the
	// match may start and end in the middle of the name.
	clusterProvisionPodRegex = regexp.MustCompile(`cluster.*provision`)

	// disallowedInFilenameRegex matches a single character from the set
	// < > : " / \ | ? * or any control character in the range 0x00-0x1F.
	// NOTE(review): presumably used to sanitise generated file names —
	// confirm against its call sites.
	disallowedInFilenameRegex = regexp.MustCompile(`[<>:"/\\|?*\x00-\x1F]`)
)

type clientSet struct {
Operations redhatopenshift20231122.OperationsClient
OpenshiftClusters redhatopenshift20231122.OpenShiftClustersClient
InternalClient cluster.InternalClient

VirtualMachines compute.VirtualMachinesClient
Resources features.ResourcesClient
Expand Down Expand Up @@ -116,6 +125,7 @@ var (
clusterName string
osClusterVersion string
clusterResourceID string
clusterDoc *internalapi.OpenShiftCluster
clients *clientSet
)

Expand Down Expand Up @@ -436,6 +446,7 @@ func newClientSet(ctx context.Context) (*clientSet, error) {
return &clientSet{
Operations: redhatopenshift20231122.NewOperationsClient(_env.Environment(), _env.SubscriptionID(), authorizer),
OpenshiftClusters: redhatopenshift20231122.NewOpenShiftClustersClient(_env.Environment(), _env.SubscriptionID(), authorizer),
InternalClient: cluster.NewInternalClient(log, _env, authorizer),

VirtualMachines: compute.NewVirtualMachinesClient(_env.Environment(), _env.SubscriptionID(), authorizer),
Resources: features.NewResourcesClient(_env.Environment(), _env.SubscriptionID(), authorizer),
Expand Down Expand Up @@ -503,6 +514,57 @@ func setup(ctx context.Context) error {
osClusterVersion = version.DefaultInstallStream.Version.String()
}

// Hack: initialize the Hive clients before cluster creation so that
// I can get Hive installer logs when cluster creation fails. Will
// neaten this up later once E2E is working.
options := _env.Environment().EnvironmentCredentialOptions()
tokenCredential, err := azidentity.NewEnvironmentCredential(options)
if err != nil {
return err
}

scopes := []string{_env.Environment().ResourceManagerScope}
authorizer := azidext.NewTokenCredentialAdapter(tokenCredential, scopes)

var hiveRestConfig *rest.Config
var hiveClientSet client.Client
var hiveAKS *kubernetes.Clientset
var hiveCM hive.ClusterManager

liveCfg, err := _env.NewLiveConfigManager(ctx)
if err != nil {
return err
}

hiveShard := 1
hiveRestConfig, err = liveCfg.HiveRestConfig(ctx, hiveShard)
if err != nil {
return err
}

hiveClientSet, err = client.New(hiveRestConfig, client.Options{})
if err != nil {
return err
}

hiveAKS, err = kubernetes.NewForConfig(hiveRestConfig)
if err != nil {
return err
}

hiveCM, err = hive.NewFromConfig(log, _env, hiveRestConfig)
if err != nil {
return err
}

clients = &clientSet{
InternalClient: cluster.NewInternalClient(log, _env, authorizer),

Check failure on line 561 in test/e2e/setup.go

View workflow job for this annotation

GitHub Actions / golangci-lint

cluster.NewInternalClient undefined (type *"github.com/Azure/ARO-RP/pkg/util/cluster".Cluster has no field or method NewInternalClient)
HiveRestConfig: hiveRestConfig,
Hive: hiveClientSet,
HiveAKS: hiveAKS,
HiveClusterManager: hiveCM,
}

err = cluster.Create(ctx, vnetResourceGroup, clusterName, osClusterVersion)
if err != nil {
return err
Expand All @@ -516,7 +578,8 @@ func setup(ctx context.Context) error {
return err
}

return nil
clusterDoc, err = clients.InternalClient.Get(ctx, vnetResourceGroup, clusterName)
return err
}

func done(ctx context.Context) error {
Expand Down Expand Up @@ -546,6 +609,16 @@ var _ = BeforeSuite(func() {
if oDataError, ok := err.(msgraph_errors.ODataErrorable); ok {
spew.Dump(oDataError.GetErrorEscaped())
}

// If Hive installation timed out, print Hive logs
if strings.Contains(err.Error(), steps.TimeoutConditionErrors["hiveClusterDeploymentReady"]) || strings.Contains(err.Error(), steps.TimeoutConditionErrors["hiveClusterInstallationComplete"]) {
log.Warning("Hive installation timed out; attempting to fetch openshift installer logs...")
_err := printHiveInstallerLogs(context.Background())
if _err != nil {
log.Error(_err)
}
}

panic(err)
}
})
Expand All @@ -560,3 +633,64 @@ var _ = AfterSuite(func() {
panic(err)
}
})

// printHiveInstallerLogs prints the cluster's installer Pod logs if it can
// and returns an error if it can't.
func printHiveInstallerLogs(ctx context.Context) error {
clusterDoc, err = clients.InternalClient.Get(ctx, vnetResourceGroup, clusterName)

Check failure on line 640 in test/e2e/setup.go

View workflow job for this annotation

GitHub Actions / golangci-lint

undefined: err
if err != nil {

Check failure on line 641 in test/e2e/setup.go

View workflow job for this annotation

GitHub Actions / golangci-lint

undefined: err
return err

Check failure on line 642 in test/e2e/setup.go

View workflow job for this annotation

GitHub Actions / golangci-lint

undefined: err (typecheck)
}

pods, err := clients.HiveAKS.CoreV1().Pods(clusterDoc.Properties.HiveProfile.Namespace).List(ctx, metav1.ListOptions{})
if err != nil {
return fmt.Errorf("Failed to list Pods in cluster's Hive namespace with following error: %w", err)
}

if pods.Items == nil || len(pods.Items) == 0 {
return fmt.Errorf("Unexpectedly found zero Pods in cluster's Hive namespace")
}

var clusterProvisionPod *corev1.Pod
for _, pod := range pods.Items {
if clusterProvisionPodRegex.MatchString(pod.ObjectMeta.Name) {
clusterProvisionPod = ptr.To(pod)
break
}
}

if clusterProvisionPod == nil {
return fmt.Errorf("Could not find cluster provision Pod in cluster's Hive namespace")
}

req := clients.HiveAKS.CoreV1().Pods(clusterDoc.Properties.HiveProfile.Namespace).GetLogs(clusterProvisionPod.ObjectMeta.Name, nil)
stream, err := req.Stream(ctx)
if err != nil {
return fmt.Errorf("Failed to get logs for cluster provision Pod with following error: %s", err)
}

defer stream.Close()
logs := []string{}
for {
buf := make([]byte, 2000)
numBytes, err := stream.Read(buf)
if numBytes == 0 {
break
}
if err == io.EOF {
break
}
if err != nil {
return err
}
logs = append(logs, string(buf[:numBytes]))
}

log := strings.Join(logs, "\n")
if log == "" {
return fmt.Errorf("Cluster provision Pod didn't have any logs")
}

spew.Dump(log)
return nil
}

0 comments on commit fbe3a13

Please sign in to comment.