Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ MANIFESTS ?= $(STANDARD_MANIFEST) $(STANDARD_E2E_MANIFEST) $(EXPERIMENTAL_MANIFE
$(STANDARD_MANIFEST) ?= helm/cert-manager.yaml
$(STANDARD_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/e2e.yaml
$(EXPERIMENTAL_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml
$(EXPERIMENTAL_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml helm/e2e.yaml
$(EXPERIMENTAL_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml helm/e2e.yaml helm/high-availability.yaml
HELM_SETTINGS ?=
.PHONY: $(MANIFESTS)
$(MANIFESTS): $(HELM) $(CONFTEST)
Expand Down Expand Up @@ -524,8 +524,8 @@ run-experimental: run-internal #HELP Build the operator-controller then deploy i
CATD_NAMESPACE := olmv1-system
.PHONY: wait
wait:
kubectl wait --for=condition=Available --namespace=$(CATD_NAMESPACE) deployment/catalogd-controller-manager --timeout=60s
kubectl wait --for=condition=Ready --namespace=$(CATD_NAMESPACE) certificate/catalogd-service-cert # Avoid upgrade test flakes when reissuing cert
kubectl wait --for=condition=Available --namespace=$(CATD_NAMESPACE) deployment/catalogd-controller-manager --timeout=3m
kubectl wait --for=condition=Ready --namespace=$(CATD_NAMESPACE) certificate/catalogd-service-cert --timeout=3m # Avoid upgrade test flakes when reissuing cert

.PHONY: docker-build
docker-build: build-linux #EXHELP Build docker image for operator-controller and catalog with GOOS=linux and local GOARCH.
Expand Down
159 changes: 159 additions & 0 deletions hack/test/pre-upgrade-setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
#!/bin/bash

# pre-upgrade-setup.sh creates baseline resources (a ClusterCatalog, a
# ServiceAccount with the RBAC an extension install needs, and a
# ClusterExtension) so that a subsequent OLMv1 upgrade can be exercised
# against pre-existing workloads in upgrade e2e tests.

set -euo pipefail

help="pre-upgrade-setup.sh is used to create some basic resources
which will later be used in upgrade testing.

Usage:
  pre-upgrade-setup.sh [TEST_CATALOG_IMG] [TEST_CLUSTER_CATALOG_NAME] [TEST_CLUSTER_EXTENSION_NAME]
"

# Require exactly three positional arguments.
if [[ "$#" -ne 3 ]]; then
  echo "Illegal number of arguments passed"
  echo "${help}"
  exit 1
fi

# Quote assignments so image refs / names containing shell metacharacters
# cannot be word-split.
TEST_CATALOG_IMG="$1"
TEST_CLUSTER_CATALOG_NAME="$2"
TEST_CLUSTER_EXTENSION_NAME="$3"

# ClusterCatalog pointing at the test catalog image. The long poll interval
# (1440 minutes = 24h) effectively disables re-polling during the test run.
kubectl apply -f - <<EOF
apiVersion: olm.operatorframework.io/v1
kind: ClusterCatalog
metadata:
  name: ${TEST_CLUSTER_CATALOG_NAME}
spec:
  source:
    type: Image
    image:
      ref: ${TEST_CATALOG_IMG}
      pollIntervalMinutes: 1440
EOF

# ServiceAccount the ClusterExtension will install with.
kubectl apply -f - <<EOF
apiVersion: v1
kind: ServiceAccount
metadata:
  name: upgrade-e2e
  namespace: default
EOF

# RBAC needed by the installer ServiceAccount to manage the bundle's
# resources. The finalizer update rule is scoped to the one extension name.
kubectl apply -f - <<EOF
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: upgrade-e2e
rules:
- apiGroups:
  - ""
  resources:
  - "configmaps"
  - "secrets"
  - "services"
  - "serviceaccounts"
  verbs:
  - "create"
  - "update"
  - "patch"
  - "delete"
  - "get"
  - "list"
  - "watch"
- apiGroups:
  - "apps"
  resources:
  - "deployments"
  verbs:
  - "create"
  - "update"
  - "patch"
  - "delete"
  - "get"
  - "list"
  - "watch"
- apiGroups:
  - "apiextensions.k8s.io"
  resources:
  - "customresourcedefinitions"
  verbs:
  - "create"
  - "update"
  - "patch"
  - "delete"
  - "get"
  - "list"
  - "watch"
- apiGroups:
  - "rbac.authorization.k8s.io"
  resources:
  - "clusterroles"
  - "clusterrolebindings"
  - "roles"
  - "rolebindings"
  verbs:
  - "create"
  - "update"
  - "patch"
  - "delete"
  - "get"
  - "list"
  - "watch"
  # bind/escalate let the SA grant permissions the bundle's RBAC requires.
  - "bind"
  - "escalate"
- apiGroups:
  - networking.k8s.io
  resources:
  - networkpolicies
  verbs:
  - get
  - list
  - watch
  - create
  - update
  - patch
  - delete
- apiGroups:
  - "olm.operatorframework.io"
  resources:
  - "clusterextensions/finalizers"
  verbs:
  - "update"
  resourceNames:
  - "${TEST_CLUSTER_EXTENSION_NAME}"
EOF

# Bind the ClusterRole to the installer ServiceAccount.
kubectl apply -f - <<EOF
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: upgrade-e2e
subjects:
- kind: ServiceAccount
  name: upgrade-e2e
  namespace: default
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: upgrade-e2e
EOF

# ClusterExtension installing the "test" package at a pinned version from
# the catalog created above.
kubectl apply -f - <<EOF
apiVersion: olm.operatorframework.io/v1
kind: ClusterExtension
metadata:
  name: ${TEST_CLUSTER_EXTENSION_NAME}
spec:
  namespace: default
  serviceAccount:
    name: upgrade-e2e
  source:
    sourceType: Catalog
    catalog:
      packageName: test
      version: 1.0.0
EOF

# Use the canonical resource/name form (consistent with other scripts in
# this repo) instead of separate KIND + NAME args, and quote the expansions.
kubectl wait --for=condition=Serving --timeout=5m "clustercatalog/${TEST_CLUSTER_CATALOG_NAME}"
kubectl wait --for=condition=Installed --timeout=5m "clusterextension/${TEST_CLUSTER_EXTENSION_NAME}"
Comment on lines +158 to +159
Copy link

Copilot AI Mar 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For consistency with other scripts in this repo (e.g. hack/demo/catalogd-demo-script.sh uses kubectl wait ... clustercatalog/<name>), these waits should use the canonical resource form (clustercatalog/${TEST_CLUSTER_CATALOG_NAME} and clusterextension/${TEST_CLUSTER_EXTENSION_NAME}) rather than separate KIND + NAME args. This avoids reliance on kubectl’s kind-to-resource mapping and is more robust across API group/shortname differences.

Suggested change
kubectl wait --for=condition=Serving --timeout=5m ClusterCatalog $TEST_CLUSTER_CATALOG_NAME
kubectl wait --for=condition=Installed --timeout=5m ClusterExtension $TEST_CLUSTER_EXTENSION_NAME
kubectl wait --for=condition=Serving --timeout=5m clustercatalog/${TEST_CLUSTER_CATALOG_NAME}
kubectl wait --for=condition=Installed --timeout=5m clusterextension/${TEST_CLUSTER_EXTENSION_NAME}

Copilot uses AI. Check for mistakes.
33 changes: 33 additions & 0 deletions helm/high-availability.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# High Availability (HA) configuration for OLMv1
# Sets replicas to 2 for both operator-controller and catalogd to enable HA setup
# This is used in experimental-e2e.yaml to test multi-replica deployments
#
# Pod anti-affinity is configured as "preferred" (not "required") to ensure:
# - In multi-node clusters: replicas are scheduled on different nodes for better availability
# - In single-node clusters (like kind): both replicas can still be scheduled on the same node
options:
  operatorController:
    deployment:
      replicas: 2
  catalogd:
    deployment:
      replicas: 2

# Pod anti-affinity configuration to prefer spreading replicas across different nodes
# Uses preferredDuringSchedulingIgnoredDuringExecution (soft constraint) to allow
# scheduling on the same node when necessary (e.g., single-node kind clusters for e2e tests)
deployments:
  templateSpec:
    affinity:
      podAntiAffinity:
        preferredDuringSchedulingIgnoredDuringExecution:
          - weight: 100
            podAffinityTerm:
              labelSelector:
                # NOTE(review): these values appear to match the control-plane
                # labels on both Deployments — confirm they stay in sync if the
                # deployment labels are ever renamed.
                matchExpressions:
                  - key: control-plane
                    operator: In
                    values:
                      - operator-controller-controller-manager
                      - catalogd-controller-manager
              topologyKey: kubernetes.io/hostname
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@ metadata:
namespace: {{ .Values.namespaces.olmv1.name }}
spec:
minReadySeconds: 5
replicas: 1
replicas: {{ .Values.options.catalogd.deployment.replicas }}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we already have node anti affinity configured to make sure these replicas do not end up on the same node? If not, we need that as well (but only when replicas > 1).

Copy link
Contributor

@tmshort tmshort Dec 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

However, I will point out that this may cause an issue on our single-node kind experimental-e2e tests where we have two replicas (such that we are validating that two replicas does not cause issues with the e2e tests).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point! I added podAntiAffinity and used the preferred rule. Besides, I created openshift/release#72395 to add SNO upgrade test for the downstream OLMv1 and OLMv0, please take a look, thanks!

  podAntiAffinity:
    preferredDuringSchedulingIgnoredDuringExecution:
      - weight: 100
        podAffinityTerm:
          labelSelector:
            matchExpressions:
              - key: control-plane
                operator: In
                values:
                  - operator-controller-controller-manager
                  - catalogd-controller-manager
          topologyKey: kubernetes.io/hostname

strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@ metadata:
name: operator-controller-controller-manager
namespace: {{ .Values.namespaces.olmv1.name }}
spec:
replicas: 1
replicas: {{ .Values.options.operatorController.deployment.replicas }}
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
Expand Down
2 changes: 2 additions & 0 deletions helm/olmv1/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ options:
enabled: true
deployment:
image: quay.io/operator-framework/operator-controller:devel
replicas: 1
extraArguments: []
Comment on lines 10 to 12
Copy link

Copilot AI Mar 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Defaulting options.operatorController.deployment.replicas to 1 here means the out-of-the-box installation is still single-replica. Given the PDB is configured with minAvailable: 1, this can still block voluntary evictions during node drains. If the goal is HA-ready defaults, consider defaulting this to 2 (or adjusting the PDB behavior when replicas=1).

Copilot uses AI. Check for mistakes.
features:
enabled: []
Expand All @@ -19,6 +20,7 @@ options:
enabled: true
deployment:
image: quay.io/operator-framework/catalogd:devel
replicas: 1
Copy link

Copilot AI Mar 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Defaulting options.catalogd.deployment.replicas to 1 keeps catalogd single-replica by default. With podDisruptionBudget.minAvailable: 1, this can still prevent node drains/evictions for that pod. Consider defaulting to 2 (or making the PDB conditional on the replica count) to actually resolve the HA/PDB deadlock for default installs.

Suggested change
replicas: 1
replicas: 2

Copilot uses AI. Check for mistakes.
extraArguments: []
Comment on lines 9 to 24
Copy link

Copilot AI Mar 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PR description says the chart will default both control-plane Deployments to 2 replicas to resolve the PDB deadlock, but the new replicas values added here default to 1. The rendered manifests/standard*.yaml and manifests/experimental.yaml also remain at 1 replica, so the deadlock scenario still exists by default. Either change these defaults to 2 (and regenerate manifests), or update the PR description/docs to clarify that HA requires an explicit values override (e.g., helm/high-availability.yaml).

Copilot uses AI. Check for mistakes.
features:
enabled: []
Expand Down
32 changes: 28 additions & 4 deletions manifests/experimental-e2e.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2535,11 +2535,11 @@ metadata:
namespace: olmv1-system
spec:
minReadySeconds: 5
replicas: 1
replicas: 2
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
Expand Down Expand Up @@ -2652,6 +2652,18 @@ spec:
operator: In
values:
- linux
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- podAffinityTerm:
labelSelector:
matchExpressions:
- key: control-plane
operator: In
values:
- operator-controller-controller-manager
- catalogd-controller-manager
topologyKey: kubernetes.io/hostname
weight: 100
nodeSelector:
kubernetes.io/os: linux
node-role.kubernetes.io/control-plane: ""
Expand Down Expand Up @@ -2686,11 +2698,11 @@ metadata:
name: operator-controller-controller-manager
namespace: olmv1-system
spec:
replicas: 1
replicas: 2
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
Expand Down Expand Up @@ -2812,6 +2824,18 @@ spec:
operator: In
values:
- linux
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- podAffinityTerm:
labelSelector:
matchExpressions:
- key: control-plane
operator: In
values:
- operator-controller-controller-manager
- catalogd-controller-manager
topologyKey: kubernetes.io/hostname
weight: 100
nodeSelector:
kubernetes.io/os: linux
node-role.kubernetes.io/control-plane: ""
Expand Down
4 changes: 2 additions & 2 deletions manifests/experimental.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2459,7 +2459,7 @@ spec:
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
Expand Down Expand Up @@ -2597,7 +2597,7 @@ spec:
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
Expand Down
4 changes: 2 additions & 2 deletions manifests/standard-e2e.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1782,7 +1782,7 @@ spec:
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
Expand Down Expand Up @@ -1932,7 +1932,7 @@ spec:
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
Expand Down
4 changes: 2 additions & 2 deletions manifests/standard.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1702,7 +1702,7 @@ spec:
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
Expand Down Expand Up @@ -1839,7 +1839,7 @@ spec:
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
maxUnavailable: 0 # Never allow pods to be unavailable during updates
selector:
matchLabels:
Expand Down
Loading
Loading