Skip to content

Commit e8288a2

Browse files
dlomclaude
andcommitted
Overhaul status logic to correctly match spec
Progressing: - Detect progress via generation and resource version changes, not provisioning status - Report Progressing=True on operator version change - Track root secret and infrastructure resource version changes - Set Progressing=True with reason VersionChanged on operator version updates Degraded: - Suppress Degraded when Progressing is active - Enforce 20-minute ProgressingTooLong timeout in status controller Available: - Only report Available=False on pod-identity-webhook when not progressing - Clear stale Degraded/Available conditions on pod-identity-webhook during active rollout (based on the spec from https://github.com/openshift/api/blob/2757d677e9aaac40a9985e5a6a17b31c7c577ba8/config/v1/types_cluster_operator.go#L151) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> Assisted-By: Claude Opus 4.6 <noreply@anthropic.com>"
1 parent 798098a commit e8288a2

10 files changed

Lines changed: 2999 additions & 351 deletions

File tree

pkg/operator/credentialsrequest/actuator/actuator.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,8 @@ func (a *DummyActuator) IsTimedTokenCluster(c client.Client, ctx context.Context
8282
}
8383

8484
func (a *DummyActuator) Upgradeable(mode operatorv1.CloudCredentialsMode) *configv1.ClusterOperatorStatusCondition {
85+
// Spec: Upgradeable=True means it is safe to upgrade. The DummyActuator
86+
// always reports upgradeable since it performs no real cloud operations.
8587
upgradeableCondition := &configv1.ClusterOperatorStatusCondition{
8688
Status: configv1.ConditionTrue,
8789
Type: configv1.OperatorUpgradeable,

pkg/operator/credentialsrequest/credentialsrequest_controller.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -508,6 +508,12 @@ func (r *ReconcileSecretMissingLabel) GetConditions(logger log.FieldLogger) ([]c
508508
}
509509

510510
if missing > 0 {
511+
// Progressing is the correct condition here per the spec: after an upgrade
512+
// introduces the labeling requirement, secrets created by prior versions won't
513+
// have the label. The operator is moving from one steady state (unlabeled) to
514+
// another (labeled) as a direct consequence of the version change, which fits
515+
// the spec's "propagating config changes" language. The labeling should complete
516+
// quickly; if it exceeds 20 minutes, ProgressingTooLong will fire.
511517
return []configv1.ClusterOperatorStatusCondition{{
512518
Type: configv1.OperatorProgressing,
513519
Status: configv1.ConditionTrue,

pkg/operator/credentialsrequest/credentialsrequest_controller_test.go

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -342,10 +342,8 @@ func TestCredentialsRequestReconcile(t *testing.T) {
342342
conditionType: configv1.OperatorProgressing,
343343
status: corev1.ConditionTrue,
344344
},
345-
{
346-
conditionType: configv1.OperatorDegraded,
347-
status: corev1.ConditionTrue,
348-
},
345+
// Per the spec, Degraded is not reported during active progress.
346+
// The 20-minute ProgressingTooLong timeout handles persistent failures.
349347
},
350348
},
351349
{

pkg/operator/credentialsrequest/status.go

Lines changed: 60 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,36 @@ func (r *ReconcileCredentialsRequest) GetConditions(logger log.FieldLogger) ([]c
3434
if err != nil {
3535
return []configv1.ClusterOperatorStatusCondition{}, fmt.Errorf("failed to get operator state: %v", err)
3636
}
37+
38+
// Fetch resource versions to detect cluster state changes that signal Progressing.
39+
// AdminClient is used for the root secret because the main manager's Secret cache
40+
// is label-filtered and does not include the root credential secret.
41+
var rootSecretResourceVersion string
42+
rootSecretLocation := r.Actuator.GetCredentialsRootSecretLocation()
43+
rootSecret := &corev1.Secret{}
44+
if err := r.AdminClient.Get(context.TODO(), rootSecretLocation, rootSecret); err != nil {
45+
if !apierrors.IsNotFound(err) {
46+
logger.WithError(err).Debug("error retrieving root credentials secret for status")
47+
}
48+
} else {
49+
rootSecretResourceVersion = rootSecret.ResourceVersion
50+
}
51+
52+
var infraResourceVersion string
53+
infra, err := utils.GetInfrastructure(r.Client)
54+
if err != nil {
55+
logger.WithError(err).Debug("error retrieving infrastructure for status")
56+
} else {
57+
infraResourceVersion = infra.ResourceVersion
58+
}
59+
3760
return computeStatusConditions(
3861
r.Actuator,
3962
mode,
4063
credRequests,
4164
r.platformType,
65+
rootSecretResourceVersion,
66+
infraResourceVersion,
4267
logger), nil
4368
}
4469

@@ -92,6 +117,8 @@ func computeStatusConditions(
92117
mode operatorv1.CloudCredentialsMode,
93118
credRequests []minterv1.CredentialsRequest,
94119
clusterCloudPlatform configv1.PlatformType,
120+
rootSecretResourceVersion string,
121+
infraResourceVersion string,
95122
logger log.FieldLogger) []configv1.ClusterOperatorStatusCondition {
96123
operatorIsDisabled := mode == operatorv1.CloudCredentialsModeManual
97124

@@ -125,6 +152,31 @@ func computeStatusConditions(
125152
validCredRequests = append(validCredRequests, credRequests[i])
126153
}
127154

155+
// Spec: Progressing means actively moving from one steady state to another,
156+
// not reconciling a previously known state. Detect actual changes:
157+
// - CR spec changed (Generation != LastSyncGeneration)
158+
// - Root cloud credentials secret changed since last sync
159+
// - Infrastructure resource changed since last sync
160+
credRequestsProgressing := 0
161+
logger.Debugf("%d cred requests", len(validCredRequests))
162+
163+
for _, cr := range validCredRequests {
164+
specChanged := cr.Generation != cr.Status.LastSyncGeneration
165+
// Match the reconciler's logic (credentialsrequest_controller.go):
166+
// the controller treats a non-nil root secret with a resource version
167+
// different from the stored one as a trigger to resync, even when the
168+
// stored version is empty (legacy CR that hasn't backfilled yet).
169+
rootSecretChanged := rootSecretResourceVersion != "" &&
170+
rootSecretResourceVersion != cr.Status.LastSyncCloudCredsSecretResourceVersion
171+
infraChanged := infraResourceVersion != "" &&
172+
infraResourceVersion != cr.Status.LastSyncInfrastructureResourceVersion
173+
if specChanged || rootSecretChanged || infraChanged {
174+
credRequestsProgressing++
175+
}
176+
}
177+
178+
isProgressing := credRequestsProgressing > 0
179+
128180
for _, cr := range validCredRequests {
129181
// Check for provision failure conditions:
130182
foundFailure := false
@@ -141,6 +193,10 @@ func computeStatusConditions(
141193
}
142194
}
143195

196+
// Spec: Degraded means the component does not match its desired state over a
197+
// period of time. Failing credentials requests indicate a persistent lower
198+
// quality of service. Suppression during cluster upgrades is handled centrally
199+
// in syncStatus via the ClusterVersion Progressing condition.
144200
if failingCredRequests > 0 {
145201
var degradedCondition configv1.ClusterOperatorStatusCondition
146202
degradedCondition.Type = configv1.OperatorDegraded
@@ -152,26 +208,15 @@ func computeStatusConditions(
152208
conditions = append(conditions, degradedCondition)
153209
}
154210

155-
// Progressing should be true if the operator is making changes to the operand. In this case
156-
// we will set true if any CredentialsRequests are not provisioned, or have failure conditions,
157-
// as this indicates the controllers have work they are trying to do.
158-
credRequestsNotProvisioned := 0
159-
logger.Debugf("%d cred requests", len(validCredRequests))
160-
161-
for _, cr := range validCredRequests {
162-
if !cr.Status.Provisioned {
163-
credRequestsNotProvisioned = credRequestsNotProvisioned + 1
164-
}
165-
}
166-
167-
if credRequestsNotProvisioned > 0 || failingCredRequests > 0 {
211+
// Spec: report Progressing=True only when actively rolling out.
212+
if isProgressing {
168213
var progressingCondition configv1.ClusterOperatorStatusCondition
169214
progressingCondition.Type = configv1.OperatorProgressing
170215
progressingCondition.Status = configv1.ConditionTrue
171216
progressingCondition.Reason = reasonReconciling
172217
progressingCondition.Message = fmt.Sprintf(
173-
"%d of %d credentials requests provisioned, %d reporting errors.",
174-
len(validCredRequests)-credRequestsNotProvisioned, len(validCredRequests), failingCredRequests)
218+
"%d of %d credentials requests are progressing.",
219+
credRequestsProgressing, len(validCredRequests))
175220
conditions = append(conditions, progressingCondition)
176221
}
177222

0 commit comments

Comments
 (0)