diff --git a/Makefile b/Makefile index 5e5547b2f6..5f99fe204a 100644 --- a/Makefile +++ b/Makefile @@ -181,7 +181,7 @@ MANIFESTS ?= $(STANDARD_MANIFEST) $(STANDARD_E2E_MANIFEST) $(EXPERIMENTAL_MANIFE $(STANDARD_MANIFEST) ?= helm/cert-manager.yaml $(STANDARD_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/e2e.yaml $(EXPERIMENTAL_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml -$(EXPERIMENTAL_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml helm/e2e.yaml +$(EXPERIMENTAL_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml helm/e2e.yaml helm/high-availability.yaml HELM_SETTINGS ?= .PHONY: $(MANIFESTS) $(MANIFESTS): $(HELM) $(CONFTEST) @@ -524,8 +524,8 @@ run-experimental: run-internal #HELP Build the operator-controller then deploy i CATD_NAMESPACE := olmv1-system .PHONY: wait wait: - kubectl wait --for=condition=Available --namespace=$(CATD_NAMESPACE) deployment/catalogd-controller-manager --timeout=60s - kubectl wait --for=condition=Ready --namespace=$(CATD_NAMESPACE) certificate/catalogd-service-cert # Avoid upgrade test flakes when reissuing cert + kubectl wait --for=condition=Available --namespace=$(CATD_NAMESPACE) deployment/catalogd-controller-manager --timeout=3m + kubectl wait --for=condition=Ready --namespace=$(CATD_NAMESPACE) certificate/catalogd-service-cert --timeout=3m # Avoid upgrade test flakes when reissuing cert .PHONY: docker-build docker-build: build-linux #EXHELP Build docker image for operator-controller and catalog with GOOS=linux and local GOARCH. diff --git a/hack/test/pre-upgrade-setup.sh b/hack/test/pre-upgrade-setup.sh new file mode 100755 index 0000000000..283afeaedc --- /dev/null +++ b/hack/test/pre-upgrade-setup.sh @@ -0,0 +1,159 @@ +#!/bin/bash + +set -euo pipefail + +help="pre-upgrade-setup.sh is used to create some basic resources +which will later be used in upgrade testing. 
+ +Usage: + pre-upgrade-setup.sh [TEST_CATALOG_IMG] [TEST_CLUSTER_CATALOG_NAME] [TEST_CLUSTER_EXTENSION_NAME] +" + +if [[ "$#" -ne 3 ]]; then + echo "Illegal number of arguments passed" + echo "${help}" + exit 1 +fi + +TEST_CATALOG_IMG=$1 +TEST_CLUSTER_CATALOG_NAME=$2 +TEST_CLUSTER_EXTENSION_NAME=$3 + +kubectl apply -f - << EOF +apiVersion: olm.operatorframework.io/v1 +kind: ClusterCatalog +metadata: + name: ${TEST_CLUSTER_CATALOG_NAME} +spec: + source: + type: Image + image: + ref: ${TEST_CATALOG_IMG} + pollIntervalMinutes: 1440 +EOF + +kubectl apply -f - < than the catalogd controller pod creation time + cond = apimeta.FindStatusCondition(clusterCatalog.Status.Conditions, ocv1.TypeProgressing) + if cond == nil { + return + } + require.Equal(ct, metav1.ConditionTrue, cond.Status) + require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason) + + require.True(ct, clusterCatalog.Status.LastUnpacked.After(catalogdManagerPod.CreationTimestamp.Time)) + }, pollDuration, pollInterval) + + // TODO: if we change the underlying revision storage mechanism, the new version + // will not detect any installed versions, we need to make sure that the upgrade + // test fails across revision storage mechanism changes that are not also accompanied + // by code that automatically migrates the revision storage. 
+ + t.Log("Checking that the ClusterExtension is installed") + var clusterExtension ocv1.ClusterExtension + require.EventuallyWithT(t, func(ct *assert.CollectT) { + require.NoError(ct, c.Get(ctx, types.NamespacedName{Name: testClusterExtensionName}, &clusterExtension)) + cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeInstalled) + require.NotNil(ct, cond) + require.Equal(ct, metav1.ConditionTrue, cond.Status) + require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason) + require.Contains(ct, cond.Message, "Installed bundle") + require.NotNil(ct, clusterExtension.Status.Install) + require.NotEmpty(ct, clusterExtension.Status.Install.Bundle.Version) + }, pollDuration, pollInterval) + + previousVersion := clusterExtension.Status.Install.Bundle.Version + + t.Log("Updating the ClusterExtension to change version") + // Make sure that after we upgrade OLM itself we can still reconcile old objects if we change them + clusterExtension.Spec.Source.Catalog.Version = "1.0.1" + require.NoError(t, c.Update(ctx, &clusterExtension)) + + t.Log("Checking that the ClusterExtension installs successfully") + // Use 10 minutes for post-OLM-upgrade extension upgrade operations. 
+ // After upgrading OLM itself, the system needs time to: + // - Stabilize after operator-controller pods restart (leader election: up to 163s) + // - Process the ClusterExtension spec change + // - Resolve and unpack the new bundle (1.0.1) + // - Apply manifests and wait for rollout + // In multi-replica deployments with recent OLM upgrade, this can take significant time + require.EventuallyWithT(t, func(ct *assert.CollectT) { + require.NoError(ct, c.Get(ctx, types.NamespacedName{Name: testClusterExtensionName}, &clusterExtension)) + cond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeInstalled) + require.NotNil(ct, cond) + require.Equal(ct, ocv1.ReasonSucceeded, cond.Reason) + require.Contains(ct, cond.Message, "Installed bundle") + require.Equal(ct, ocv1.BundleMetadata{Name: "test-operator.1.0.1", Version: "1.0.1"}, clusterExtension.Status.Install.Bundle) + require.NotEqual(ct, previousVersion, clusterExtension.Status.Install.Bundle.Version) + }, 10*time.Minute, pollInterval) +} + +// waitForDeployment checks that the updated deployment with the given app.kubernetes.io/name label +// has reached the desired number of replicas and that the number of pods matches that number +// i.e. no old pods remain. 
+func waitForDeployment(t *testing.T, ctx context.Context, controlPlaneLabel string) { + deploymentLabelSelector := labels.Set{"app.kubernetes.io/name": controlPlaneLabel}.AsSelector() + + t.Log("Checking that the deployment is updated and available") + var desiredNumReplicas int32 + require.EventuallyWithT(t, func(ct *assert.CollectT) { + var managerDeployments appsv1.DeploymentList + require.NoError(ct, c.List(ctx, &managerDeployments, client.MatchingLabelsSelector{Selector: deploymentLabelSelector}, client.InNamespace("olmv1-system"))) + require.Len(ct, managerDeployments.Items, 1) + managerDeployment := managerDeployments.Items[0] + + require.True(ct, + managerDeployment.Status.UpdatedReplicas == *managerDeployment.Spec.Replicas && + managerDeployment.Status.Replicas == *managerDeployment.Spec.Replicas && + managerDeployment.Status.AvailableReplicas == *managerDeployment.Spec.Replicas && + managerDeployment.Status.ReadyReplicas == *managerDeployment.Spec.Replicas, + ) + + // Check that the deployment has the Available condition set to True + var availableCond *appsv1.DeploymentCondition + for i := range managerDeployment.Status.Conditions { + if managerDeployment.Status.Conditions[i].Type == appsv1.DeploymentAvailable { + availableCond = &managerDeployment.Status.Conditions[i] + break + } + } + require.NotNil(ct, availableCond, "Available condition not found") + require.Equal(ct, corev1.ConditionTrue, availableCond.Status, "Deployment Available condition is not True") + + desiredNumReplicas = *managerDeployment.Spec.Replicas + }, time.Minute, time.Second) + + t.Logf("Ensure the number of remaining pods equal the desired number of replicas (%d)", desiredNumReplicas) + require.EventuallyWithT(t, func(ct *assert.CollectT) { + var managerPods corev1.PodList + require.NoError(ct, c.List(ctx, &managerPods, client.MatchingLabelsSelector{Selector: deploymentLabelSelector}, client.InNamespace("olmv1-system"))) + require.Len(ct, managerPods.Items, 
int(desiredNumReplicas)) + }, time.Minute, time.Second) +} + +// findLeaderPod finds the pod that has acquired the leader lease by inspecting the Lease resource. +// This is more reliable than checking logs as it directly queries the Kubernetes API for the lease holder. +func findLeaderPod(ctx context.Context, controlPlaneLabel string) (*corev1.Pod, error) { + // Map component name to its LeaderElectionID + leaseNameMap := map[string]string{ + "operator-controller": "9c4404e7.operatorframework.io", + "catalogd": "catalogd-operator-lock", + } + + leaseName, ok := leaseNameMap[controlPlaneLabel] + if !ok { + return nil, fmt.Errorf("unknown control plane label: %s", controlPlaneLabel) + } + + var leaderPod *corev1.Pod + + // Use wait.PollUntilContextTimeout for polling with proper context handling + err := wait.PollUntilContextTimeout(ctx, 2*time.Second, pollDuration, true, func(ctx context.Context) (bool, error) { + // Fetch lease to get the current leader's identity + var lease coordinationv1.Lease + if err := c.Get(ctx, types.NamespacedName{ + Name: leaseName, + Namespace: "olmv1-system", + }, &lease); err != nil { + // Lease might not exist yet, retry + return false, nil + } + + if lease.Spec.HolderIdentity == nil { + // No leader elected yet, retry + return false, nil + } + + // The HolderIdentity is in the format "pod-name_hash" + // Extract the pod name by splitting on "_" + holderIdentity := *lease.Spec.HolderIdentity + podName := strings.Split(holderIdentity, "_")[0] + + // Directly fetch the pod by name instead of listing all pods + pod := &corev1.Pod{} + if err := c.Get(ctx, types.NamespacedName{ + Name: podName, + Namespace: "olmv1-system", + }, pod); err != nil { + // Pod might not exist yet or is being terminated, retry + return false, nil + } + + leaderPod = pod + return true, nil + }) + + if err != nil { + return nil, fmt.Errorf("timeout waiting for leader election: %w", err) + } + + return leaderPod, nil +} + +func watchPodLogsForSubstring(ctx 
context.Context, pod *corev1.Pod, substrings ...string) (bool, error) { + podLogOpts := corev1.PodLogOptions{ + Follow: true, + Container: container, + } + + req := kclientset.CoreV1().Pods(pod.Namespace).GetLogs(pod.Name, &podLogOpts) + podLogs, err := req.Stream(ctx) + if err != nil { + return false, err + } + defer podLogs.Close() + + scanner := bufio.NewScanner(podLogs) + for scanner.Scan() { + line := scanner.Text() + + foundCount := 0 + for _, substring := range substrings { + if strings.Contains(line, substring) { + foundCount++ + } + } + if foundCount == len(substrings) { + return true, nil + } + } + + return false, scanner.Err() +} diff --git a/test/upgrade-e2e/suite_test.go b/test/upgrade-e2e/suite_test.go new file mode 100644 index 0000000000..e343190e0d --- /dev/null +++ b/test/upgrade-e2e/suite_test.go @@ -0,0 +1,48 @@ +package upgradee2e + +import ( + "os" + + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/operator-framework/operator-controller/internal/operator-controller/scheme" +) + +var ( + cfg *rest.Config + c client.Client + kclientset *kubernetes.Clientset +) + +// testClusterCatalogName and testClusterExtensionName are set via environment variables +// by the Makefile targets (e.g., test-st2ex-e2e) or default to these values. 
+var ( + testClusterCatalogName = getEnvOrDefault("TEST_CLUSTER_CATALOG_NAME", "test-catalog") + testClusterExtensionName = getEnvOrDefault("TEST_CLUSTER_EXTENSION_NAME", "test-package") +) + +func init() { + var err error + + cfg = ctrl.GetConfigOrDie() + + c, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) + if err != nil { + panic(err) + } + + kclientset, err = kubernetes.NewForConfig(cfg) + if err != nil { + panic(err) + } +} + +func getEnvOrDefault(key, defaultValue string) string { + if value := os.Getenv(key); value != "" { + return value + } + return defaultValue +} diff --git a/test/upgrade-e2e/upgrade_test.go b/test/upgrade-e2e/upgrade_test.go index 17a24a0734..fe3ec3fa59 100644 --- a/test/upgrade-e2e/upgrade_test.go +++ b/test/upgrade-e2e/upgrade_test.go @@ -28,6 +28,13 @@ func TestMain(m *testing.M) { pflag.Parse() opts.Paths = pflag.Args() + // Run standard Go tests first (e.g., post_upgrade_test.go) + exitCode := m.Run() + if exitCode != 0 { + os.Exit(exitCode) + } + + // Run Godog BDD tests sc := godog.TestSuite{ ScenarioInitializer: func(sc *godog.ScenarioContext) { sc.Before(steps.CreateScenarioContext) diff --git a/testdata/build-test-registry.sh b/testdata/build-test-registry.sh index e2dcc09148..7dee9e3c11 100755 --- a/testdata/build-test-registry.sh +++ b/testdata/build-test-registry.sh @@ -103,7 +103,7 @@ spec: type: NodePort EOF -kubectl wait --for=condition=Available -n "${namespace}" "deploy/${name}" --timeout=60s +kubectl wait --for=condition=Available -n "${namespace}" "deploy/${name}" --timeout=3m kubectl apply -f - << EOF apiVersion: batch/v1 @@ -135,4 +135,4 @@ spec: secretName: ${namespace}-registry EOF -kubectl wait --for=condition=Complete -n "${namespace}" "job/${name}-push" --timeout=60s +kubectl wait --for=condition=Complete -n "${namespace}" "job/${name}-push" --timeout=3m