From 4721aea3c57c525bf78627f40a9afceba1bed143 Mon Sep 17 00:00:00 2001 From: Ron Kahn Date: Mon, 15 Jun 2026 17:37:54 +0300 Subject: [PATCH 01/14] feat(operator): add SnapshotContent work-order source fields Signed-off-by: Ron Kahn --- .../crds/nvidia.com_snapshotcontents.yaml | 39 ++++++++++++------- .../operator/crds/nvidia.com_snapshots.yaml | 5 +++ .../operator/api/v1alpha1/snapshot_types.go | 6 +++ .../api/v1alpha1/snapshot_types_test.go | 11 ++++-- .../api/v1alpha1/snapshotcontent_types.go | 27 ++++++------- .../api/v1alpha1/zz_generated.deepcopy.go | 6 +-- .../bases/nvidia.com_snapshotcontents.yaml | 39 ++++++++++++------- .../crd/bases/nvidia.com_snapshots.yaml | 5 +++ 8 files changed, 90 insertions(+), 48 deletions(-) diff --git a/deploy/helm/charts/platform/components/operator/crds/nvidia.com_snapshotcontents.yaml b/deploy/helm/charts/platform/components/operator/crds/nvidia.com_snapshotcontents.yaml index 99dfbc11dee1..000f8955d691 100644 --- a/deploy/helm/charts/platform/components/operator/crds/nvidia.com_snapshotcontents.yaml +++ b/deploy/helm/charts/platform/components/operator/crds/nvidia.com_snapshotcontents.yaml @@ -63,7 +63,8 @@ spec: spec: description: |- SnapshotContentSpec defines the desired state of SnapshotContent. It is - populated by the node agent at creation time and is immutable thereafter. + populated by the SnapshotReconciler (operator) at creation time and is + immutable thereafter. properties: snapshotRef: description: |- @@ -86,18 +87,35 @@ spec: - namespace type: object source: - description: Source locates the physical artifact via a self-contained, opaque handle. + description: 'Source describes what to capture: the source pod and the node it runs on.' properties: - snapshotHandle: + nodeName: description: |- - SnapshotHandle is a self-contained, opaque artifact locator. The v1alpha1 - PVC format is: - pvc://////versions/ - It fully locates the artifact without correlating any other field. 
+ NodeName is the node the source pod runs on, denormalized from the live + pod so it travels with PodRef as one immutable unit and selects the node + agent that performs the dump. minLength: 1 type: string + podRef: + description: |- + PodRef identifies the pod to dump. Its UID guards against dumping a + same-named recreation of the pod. + properties: + name: + description: Name of the source pod. + minLength: 1 + type: string + uid: + description: |- + UID of the source pod, recorded so the node agent dumps that specific + pod and not a same-named recreation. + type: string + required: + - name + type: object required: - - snapshotHandle + - nodeName + - podRef type: object required: - snapshotRef @@ -164,11 +182,6 @@ spec: - type type: object type: array - snapshotHandle: - description: |- - SnapshotHandle mirrors spec.source.snapshotHandle once the node agent has - verified the artifact. - type: string type: object type: object x-kubernetes-validations: diff --git a/deploy/helm/charts/platform/components/operator/crds/nvidia.com_snapshots.yaml b/deploy/helm/charts/platform/components/operator/crds/nvidia.com_snapshots.yaml index a5101b641309..f6a8567176cd 100644 --- a/deploy/helm/charts/platform/components/operator/crds/nvidia.com_snapshots.yaml +++ b/deploy/helm/charts/platform/components/operator/crds/nvidia.com_snapshots.yaml @@ -94,6 +94,11 @@ spec: description: Name of the source pod. minLength: 1 type: string + uid: + description: |- + UID of the source pod, recorded so the node agent dumps that specific + pod and not a same-named recreation. 
+ type: string required: - name type: object diff --git a/deploy/operator/api/v1alpha1/snapshot_types.go b/deploy/operator/api/v1alpha1/snapshot_types.go index 8efe51ca4526..0918fddba06a 100644 --- a/deploy/operator/api/v1alpha1/snapshot_types.go +++ b/deploy/operator/api/v1alpha1/snapshot_types.go @@ -19,6 +19,7 @@ package v1alpha1 import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" ) // Snapshot and SnapshotContent status condition types. Both objects share this @@ -69,6 +70,11 @@ type PodReference struct { // +kubebuilder:validation:Required // +kubebuilder:validation:MinLength=1 Name string `json:"name"` + + // UID of the source pod, recorded so the node agent dumps that specific + // pod and not a same-named recreation. + // +optional + UID types.UID `json:"uid,omitempty"` } // SnapshotStatus defines the observed state of Snapshot. diff --git a/deploy/operator/api/v1alpha1/snapshot_types_test.go b/deploy/operator/api/v1alpha1/snapshot_types_test.go index d562910e5f2c..5ff451ed0a20 100644 --- a/deploy/operator/api/v1alpha1/snapshot_types_test.go +++ b/deploy/operator/api/v1alpha1/snapshot_types_test.go @@ -76,7 +76,10 @@ func TestSnapshotContentDeepCopyIsIndependent(t *testing.T) { ObjectMeta: metav1.ObjectMeta{Name: "content-a"}, Spec: SnapshotContentSpec{ SnapshotRef: SnapshotReference{Namespace: "inference", Name: "snap-a", UID: types.UID("uid-1")}, - Source: SnapshotContentSource{SnapshotHandle: "pvc://inference/ckpt-pvc/checkpoints/abc123/versions/1"}, + Source: SnapshotContentSource{ + PodRef: PodReference{Name: "worker-0", UID: types.UID("pod-uid-1")}, + NodeName: "node-a", + }, }, Status: SnapshotContentStatus{ Conditions: []metav1.Condition{{Type: "Ready", Status: metav1.ConditionTrue, Reason: "Bound"}}, @@ -88,10 +91,10 @@ func TestSnapshotContentDeepCopyIsIndependent(t *testing.T) { t.Fatalf("DeepCopy is not equal to original") } - clone.Spec.Source.SnapshotHandle = "mutated" + clone.Spec.Source.PodRef.Name = 
"mutated" clone.Status.Conditions[0].Reason = "Changed" - if original.Spec.Source.SnapshotHandle != "pvc://inference/ckpt-pvc/checkpoints/abc123/versions/1" { - t.Errorf("mutating clone changed original handle: got %q", original.Spec.Source.SnapshotHandle) + if original.Spec.Source.PodRef.Name != "worker-0" { + t.Errorf("mutating clone changed original podRef name: got %q", original.Spec.Source.PodRef.Name) } if original.Status.Conditions[0].Reason != "Bound" { t.Errorf("mutating clone condition changed original: got %q", original.Status.Conditions[0].Reason) diff --git a/deploy/operator/api/v1alpha1/snapshotcontent_types.go b/deploy/operator/api/v1alpha1/snapshotcontent_types.go index 7a970f5b959d..863a61cf3cb6 100644 --- a/deploy/operator/api/v1alpha1/snapshotcontent_types.go +++ b/deploy/operator/api/v1alpha1/snapshotcontent_types.go @@ -23,14 +23,15 @@ import ( ) // SnapshotContentSpec defines the desired state of SnapshotContent. It is -// populated by the node agent at creation time and is immutable thereafter. +// populated by the SnapshotReconciler (operator) at creation time and is +// immutable thereafter. type SnapshotContentSpec struct { // SnapshotRef is the back-pointer to the bound Snapshot. It may span // namespaces because SnapshotContent is cluster-scoped. // +kubebuilder:validation:Required SnapshotRef SnapshotReference `json:"snapshotRef"` - // Source locates the physical artifact via a self-contained, opaque handle. + // Source describes what to capture: the source pod and the node it runs on. // +kubebuilder:validation:Required Source SnapshotContentSource `json:"source"` } @@ -51,24 +52,24 @@ type SnapshotReference struct { UID types.UID `json:"uid,omitempty"` } -// SnapshotContentSource locates the physical checkpoint artifact. +// SnapshotContentSource is the immutable source descriptor: what to dump +// (PodRef) and where it runs (NodeName). 
type SnapshotContentSource struct { - // SnapshotHandle is a self-contained, opaque artifact locator. The v1alpha1 - // PVC format is: - // pvc://////versions/ - // It fully locates the artifact without correlating any other field. + // PodRef identifies the pod to dump. Its UID guards against dumping a + // same-named recreation of the pod. + // +kubebuilder:validation:Required + PodRef PodReference `json:"podRef"` + + // NodeName is the node the source pod runs on, denormalized from the live + // pod so it travels with PodRef as one immutable unit and selects the node + // agent that performs the dump. // +kubebuilder:validation:Required // +kubebuilder:validation:MinLength=1 - SnapshotHandle string `json:"snapshotHandle"` + NodeName string `json:"nodeName"` } // SnapshotContentStatus defines the observed state of SnapshotContent. type SnapshotContentStatus struct { - // SnapshotHandle mirrors spec.source.snapshotHandle once the node agent has - // verified the artifact. - // +optional - SnapshotHandle *string `json:"snapshotHandle,omitempty"` - // Conditions reflect the latest observations of the SnapshotContent's state. // Standard types are Ready and Failed. // +optional diff --git a/deploy/operator/api/v1alpha1/zz_generated.deepcopy.go b/deploy/operator/api/v1alpha1/zz_generated.deepcopy.go index 119d1b89ce90..50b7e7b75452 100644 --- a/deploy/operator/api/v1alpha1/zz_generated.deepcopy.go +++ b/deploy/operator/api/v1alpha1/zz_generated.deepcopy.go @@ -1943,6 +1943,7 @@ func (in *SnapshotContentList) DeepCopyObject() runtime.Object { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *SnapshotContentSource) DeepCopyInto(out *SnapshotContentSource) { *out = *in + out.PodRef = in.PodRef } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SnapshotContentSource. 
@@ -1975,11 +1976,6 @@ func (in *SnapshotContentSpec) DeepCopy() *SnapshotContentSpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *SnapshotContentStatus) DeepCopyInto(out *SnapshotContentStatus) { *out = *in - if in.SnapshotHandle != nil { - in, out := &in.SnapshotHandle, &out.SnapshotHandle - *out = new(string) - **out = **in - } if in.Conditions != nil { in, out := &in.Conditions, &out.Conditions *out = make([]metav1.Condition, len(*in)) diff --git a/deploy/operator/config/crd/bases/nvidia.com_snapshotcontents.yaml b/deploy/operator/config/crd/bases/nvidia.com_snapshotcontents.yaml index 99dfbc11dee1..000f8955d691 100644 --- a/deploy/operator/config/crd/bases/nvidia.com_snapshotcontents.yaml +++ b/deploy/operator/config/crd/bases/nvidia.com_snapshotcontents.yaml @@ -63,7 +63,8 @@ spec: spec: description: |- SnapshotContentSpec defines the desired state of SnapshotContent. It is - populated by the node agent at creation time and is immutable thereafter. + populated by the SnapshotReconciler (operator) at creation time and is + immutable thereafter. properties: snapshotRef: description: |- @@ -86,18 +87,35 @@ spec: - namespace type: object source: - description: Source locates the physical artifact via a self-contained, opaque handle. + description: 'Source describes what to capture: the source pod and the node it runs on.' properties: - snapshotHandle: + nodeName: description: |- - SnapshotHandle is a self-contained, opaque artifact locator. The v1alpha1 - PVC format is: - pvc://////versions/ - It fully locates the artifact without correlating any other field. + NodeName is the node the source pod runs on, denormalized from the live + pod so it travels with PodRef as one immutable unit and selects the node + agent that performs the dump. minLength: 1 type: string + podRef: + description: |- + PodRef identifies the pod to dump. 
Its UID guards against dumping a + same-named recreation of the pod. + properties: + name: + description: Name of the source pod. + minLength: 1 + type: string + uid: + description: |- + UID of the source pod, recorded so the node agent dumps that specific + pod and not a same-named recreation. + type: string + required: + - name + type: object required: - - snapshotHandle + - nodeName + - podRef type: object required: - snapshotRef @@ -164,11 +182,6 @@ spec: - type type: object type: array - snapshotHandle: - description: |- - SnapshotHandle mirrors spec.source.snapshotHandle once the node agent has - verified the artifact. - type: string type: object type: object x-kubernetes-validations: diff --git a/deploy/operator/config/crd/bases/nvidia.com_snapshots.yaml b/deploy/operator/config/crd/bases/nvidia.com_snapshots.yaml index a5101b641309..f6a8567176cd 100644 --- a/deploy/operator/config/crd/bases/nvidia.com_snapshots.yaml +++ b/deploy/operator/config/crd/bases/nvidia.com_snapshots.yaml @@ -94,6 +94,11 @@ spec: description: Name of the source pod. minLength: 1 type: string + uid: + description: |- + UID of the source pod, recorded so the node agent dumps that specific + pod and not a same-named recreation. 
+ type: string required: - name type: object From 101a87a47ead8d305997cdbfa0c0521ab8c25652 Mon Sep 17 00:00:00 2001 From: Ron Kahn Date: Mon, 15 Jun 2026 17:37:55 +0300 Subject: [PATCH 02/14] feat(operator): capture via SnapshotReconciler and Snapshot status Signed-off-by: Ron Kahn --- .../operator/templates/manager-rbac.yaml | 5 + deploy/operator/cmd/main.go | 12 + deploy/operator/config/rbac/role.yaml | 5 + .../controller/dynamocheckpoint_controller.go | 143 ++++--- .../dynamocheckpoint_controller_test.go | 217 +++-------- .../controller/snapshot_reconciler.go | 357 ++++++++++++++++++ .../controller/snapshot_reconciler_test.go | 258 +++++++++++++ 7 files changed, 780 insertions(+), 217 deletions(-) create mode 100644 deploy/operator/internal/controller/snapshot_reconciler.go create mode 100644 deploy/operator/internal/controller/snapshot_reconciler_test.go diff --git a/deploy/helm/charts/platform/components/operator/templates/manager-rbac.yaml b/deploy/helm/charts/platform/components/operator/templates/manager-rbac.yaml index bae96a422a04..988a2a42364a 100644 --- a/deploy/helm/charts/platform/components/operator/templates/manager-rbac.yaml +++ b/deploy/helm/charts/platform/components/operator/templates/manager-rbac.yaml @@ -330,6 +330,7 @@ rules: - dynamographdeploymentscalingadapters - dynamomodels - dynamoworkermetadatas + - snapshotcontents - snapshots verbs: - create @@ -347,6 +348,8 @@ rules: - dynamographdeploymentrequests/finalizers - dynamographdeployments/finalizers - dynamomodels/finalizers + - snapshotcontents/finalizers + - snapshots/finalizers verbs: - update - apiGroups: @@ -358,6 +361,8 @@ rules: - dynamographdeployments/status - dynamographdeploymentscalingadapters/status - dynamomodels/status + - snapshotcontents/status + - snapshots/status verbs: - get - patch diff --git a/deploy/operator/cmd/main.go b/deploy/operator/cmd/main.go index de54570bcf2f..320ee1c88e48 100644 --- a/deploy/operator/cmd/main.go +++ b/deploy/operator/cmd/main.go @@ -264,6 
+264,11 @@ func main() { mgrOpts.Cache.DefaultNamespaces = map[string]cache.Config{ restrictedNamespace: {}, } + // SnapshotContent is cluster-scoped, so DefaultNamespaces does not cover it. + // Register it cluster-wide explicitly so the SnapshotReconciler can watch it. + mgrOpts.Cache.ByObject = map[client.Object]cache.ByObject{ + &nvidiacomv1alpha1.SnapshotContent{}: {}, + } setupLog.Info("Restricted namespace configured, launching in restricted mode", "namespace", restrictedNamespace) banner := strings.Repeat("=", 80) @@ -710,6 +715,13 @@ func registerControllers( return fmt.Errorf("unable to create DynamoCheckpoint controller: %w", err) } + if err = (&controller.SnapshotReconciler{ + Client: mgr.GetClient(), + Recorder: mgr.GetEventRecorderFor("snapshot"), + }).SetupWithManager(mgr); err != nil { + return fmt.Errorf("unable to create Snapshot controller: %w", err) + } + if runtimeConfig.GroveEnabled { if err = controller.NewFailoverCascadeReconciler( mgr.GetClient(), diff --git a/deploy/operator/config/rbac/role.yaml b/deploy/operator/config/rbac/role.yaml index 5ec2ac826ca9..45a0892273a3 100644 --- a/deploy/operator/config/rbac/role.yaml +++ b/deploy/operator/config/rbac/role.yaml @@ -227,6 +227,7 @@ rules: - dynamographdeployments - dynamographdeploymentscalingadapters - dynamomodels + - snapshotcontents - snapshots verbs: - create @@ -244,6 +245,8 @@ rules: - dynamographdeploymentrequests/finalizers - dynamographdeployments/finalizers - dynamomodels/finalizers + - snapshotcontents/finalizers + - snapshots/finalizers verbs: - update - apiGroups: @@ -255,6 +258,8 @@ rules: - dynamographdeployments/status - dynamographdeploymentscalingadapters/status - dynamomodels/status + - snapshotcontents/status + - snapshots/status verbs: - get - patch diff --git a/deploy/operator/internal/controller/dynamocheckpoint_controller.go b/deploy/operator/internal/controller/dynamocheckpoint_controller.go index 4c221792ae5d..3199b5a213a8 100644 --- 
a/deploy/operator/internal/controller/dynamocheckpoint_controller.go +++ b/deploy/operator/internal/controller/dynamocheckpoint_controller.go @@ -25,7 +25,6 @@ import ( appsv1 "k8s.io/api/apps/v1" batchv1 "k8s.io/api/batch/v1" - coordinationv1 "k8s.io/api/coordination/v1" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" @@ -301,8 +300,6 @@ func (r *CheckpointReconciler) failPendingCheckpoint( } func (r *CheckpointReconciler) handleCreating(ctx context.Context, ckpt *nvidiacomv1alpha1.DynamoCheckpoint) (ctrl.Result, error) { - logger := log.FromContext(ctx) - if ckpt.Status.JobName == "" { ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhasePending ckpt.Status.Message = "checkpoint job is missing from status" @@ -352,73 +349,91 @@ func (r *CheckpointReconciler) handleCreating(ctx context.Context, ckpt *nvidiac return ctrl.Result{}, err } - var lease *coordinationv1.Lease - leaseKey := client.ObjectKey{Namespace: job.Namespace, Name: job.Name} - lease = &coordinationv1.Lease{} - if err := r.Get(ctx, leaseKey, lease); err != nil { - if !apierrors.IsNotFound(err) { - return ctrl.Result{}, err + return r.observeSnapshot(ctx, ckpt, job, checkpointID) +} + +// observeSnapshot maps the bound Snapshot's status (and the owned Job's failure / deadline +// hang guards) onto the DynamoCheckpoint phase. Completion cascades up from SnapshotContent +// → Snapshot → DynamoCheckpoint, so this never reads the Job's terminal annotation. 
+func (r *CheckpointReconciler) observeSnapshot(ctx context.Context, ckpt *nvidiacomv1alpha1.DynamoCheckpoint, job *batchv1.Job, checkpointID string) (ctrl.Result, error) { + logger := log.FromContext(ctx) + + snap := &nvidiacomv1alpha1.Snapshot{} + if err := r.Get(ctx, client.ObjectKey{Namespace: ckpt.Namespace, Name: snapshotName(checkpointID)}, snap); err != nil { + if apierrors.IsNotFound(err) { + return ctrl.Result{RequeueAfter: time.Second}, nil } - lease = nil - } - - now := time.Now() - checkpointWorkerActive := false - if lease != nil && lease.Spec.LeaseDurationSeconds != nil { - // The snapshot-agent owns and renews this lease while it is still finalizing - // checkpoint state. A Job can complete before the agent writes the terminal - // checkpoint annotation, so we keep requeuing until the lease is no longer active. - lastRenewal := lease.Spec.RenewTime - if lastRenewal == nil { - lastRenewal = lease.Spec.AcquireTime + return ctrl.Result{}, err + } + + // Read Snapshot.status only once it is bound; an unbound Snapshot is still being + // set up by the SnapshotReconciler. 
+ if snap.Status.BoundSnapshotContentName != nil { + if cond := meta.FindStatusCondition(snap.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionReady); cond != nil && cond.Status == metav1.ConditionTrue { + logger.Info("Snapshot ready", "snapshot", snap.Name) + r.Recorder.Event(ckpt, corev1.EventTypeNormal, "CheckpointReady", cond.Message) + now := metav1.Now() + ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhaseReady + ckpt.Status.CheckpointID = checkpointID + ckpt.Status.CreatedAt = &now + ckpt.Status.Message = "" + meta.SetStatusCondition(&ckpt.Status.Conditions, metav1.Condition{ + Type: string(nvidiacomv1alpha1.DynamoCheckpointConditionJobCompleted), + Status: metav1.ConditionTrue, + Reason: "SnapshotReady", + Message: cond.Message, + }) + return ctrl.Result{}, r.Status().Update(ctx, ckpt) } - if lastRenewal != nil { - checkpointWorkerActive = !now.After(lastRenewal.Time.Add(time.Duration(*lease.Spec.LeaseDurationSeconds) * time.Second)) + if cond := meta.FindStatusCondition(snap.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed); cond != nil && cond.Status == metav1.ConditionTrue { + return r.failCreating(ctx, ckpt, "SnapshotFailed", cond.Message) } } - observation := snapshotprotocol.ObserveCheckpointJob(job, checkpointWorkerActive) - switch observation.Phase { - case snapshotprotocol.CheckpointObservationPhaseWaitingForConfirmation: - logger.V(1).Info("Checkpoint job is complete but checkpoint worker is still active; waiting for terminal watcher status", "job", job.Name) - return ctrl.Result{RequeueAfter: time.Second}, nil - case snapshotprotocol.CheckpointObservationPhaseReady: - logger.Info("Checkpoint Job succeeded", "job", job.Name) - r.Recorder.Event(ckpt, corev1.EventTypeNormal, "CheckpointReady", observation.Message) + // Hang guard 1: the owned Job failed while the Snapshot is still non-terminal. 
+ if jobFailed, message := checkpointJobFailed(job); jobFailed { + return r.failCreating(ctx, ckpt, "JobFailed", message) + } - now := metav1.Now() - ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhaseReady - ckpt.Status.CreatedAt = &now - ckpt.Status.Message = "" - meta.SetStatusCondition(&ckpt.Status.Conditions, metav1.Condition{ - Type: string(nvidiacomv1alpha1.DynamoCheckpointConditionJobCompleted), - Status: metav1.ConditionTrue, - Reason: observation.Reason, - Message: observation.Message, - }) - if err := r.Status().Update(ctx, ckpt); err != nil { - return ctrl.Result{}, err + // Hang guard 2: the Job ran past its deadline without a terminal Snapshot. + if job.Spec.ActiveDeadlineSeconds != nil { + deadline := job.CreationTimestamp.Add(time.Duration(*job.Spec.ActiveDeadlineSeconds) * time.Second) + if time.Now().After(deadline) { + return r.failCreating(ctx, ckpt, "CheckpointDeadlineExceeded", + fmt.Sprintf("checkpoint did not complete before the Job deadline (%s)", deadline.Format(time.RFC3339))) } - return ctrl.Result{}, nil - case snapshotprotocol.CheckpointObservationPhaseFailed: - logger.Info("Checkpoint Job failed", "job", job.Name, "message", observation.Message) - r.Recorder.Event(ckpt, corev1.EventTypeWarning, "CheckpointFailed", observation.Message) + } - ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhaseFailed - ckpt.Status.Message = observation.Message - meta.SetStatusCondition(&ckpt.Status.Conditions, metav1.Condition{ - Type: string(nvidiacomv1alpha1.DynamoCheckpointConditionJobCompleted), - Status: metav1.ConditionFalse, - Reason: observation.Reason, - Message: observation.Message, - }) - if err := r.Status().Update(ctx, ckpt); err != nil { - return ctrl.Result{}, err + return ctrl.Result{}, nil +} + +// failCreating marks the DynamoCheckpoint Failed with a completion-condition reason. 
+func (r *CheckpointReconciler) failCreating(ctx context.Context, ckpt *nvidiacomv1alpha1.DynamoCheckpoint, reason, message string) (ctrl.Result, error) { + log.FromContext(ctx).Info("Checkpoint failed", "reason", reason, "message", message) + r.Recorder.Event(ckpt, corev1.EventTypeWarning, "CheckpointFailed", message) + ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhaseFailed + ckpt.Status.Message = message + meta.SetStatusCondition(&ckpt.Status.Conditions, metav1.Condition{ + Type: string(nvidiacomv1alpha1.DynamoCheckpointConditionJobCompleted), + Status: metav1.ConditionFalse, + Reason: reason, + Message: message, + }) + return ctrl.Result{}, r.Status().Update(ctx, ckpt) +} + +// checkpointJobFailed reports whether the Job has a True JobFailed condition. +func checkpointJobFailed(job *batchv1.Job) (bool, string) { + for _, condition := range job.Status.Conditions { + if condition.Type == batchv1.JobFailed && condition.Status == corev1.ConditionTrue { + message := "checkpoint job failed" + if condition.Message != "" { + message = fmt.Sprintf("%s: %s", message, condition.Message) + } + return true, message } - return ctrl.Result{}, nil - default: - return ctrl.Result{}, nil } + return false, "" } //nolint:gocyclo @@ -504,6 +519,14 @@ func (r *CheckpointReconciler) SetupWithManager(mgr ctrl.Manager) error { UpdateFunc: func(ue event.UpdateEvent) bool { return true }, GenericFunc: func(ge event.GenericEvent) bool { return true }, })). + Owns(&nvidiacomv1alpha1.Snapshot{}, builder.WithPredicates(predicate.Funcs{ + // Status mirror cascades up via Snapshot status updates only; ignore + // create/delete so the mirror cannot storm. + CreateFunc: func(ce event.CreateEvent) bool { return false }, + DeleteFunc: func(de event.DeleteEvent) bool { return false }, + UpdateFunc: func(ue event.UpdateEvent) bool { return true }, + GenericFunc: func(ge event.GenericEvent) bool { return false }, + })). 
WithEventFilter(commonController.EphemeralDeploymentEventFilter(r.Config, r.RuntimeConfig)). Complete(r) } diff --git a/deploy/operator/internal/controller/dynamocheckpoint_controller_test.go b/deploy/operator/internal/controller/dynamocheckpoint_controller_test.go index fdb63a8f93f1..fcde4baded99 100644 --- a/deploy/operator/internal/controller/dynamocheckpoint_controller_test.go +++ b/deploy/operator/internal/controller/dynamocheckpoint_controller_test.go @@ -119,19 +119,6 @@ func makeTestCheckpoint(phase nvidiacomv1alpha1.DynamoCheckpointPhase) *nvidiaco } } -func makeCheckpointLease(name string, renewTime time.Time, durationSeconds int32) *coordinationv1.Lease { - renewMicroTime := metav1.NewMicroTime(renewTime) - return &coordinationv1.Lease{ - ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: testNamespace}, - Spec: coordinationv1.LeaseSpec{ - HolderIdentity: ptr.To("snapshot-agent/test"), - LeaseDurationSeconds: &durationSeconds, - AcquireTime: &renewMicroTime, - RenewTime: &renewMicroTime, - }, - } -} - func requireCheckpointContainer(t *testing.T, containers []corev1.Container, name string) *corev1.Container { t.Helper() if container := findCheckpointContainer(containers, name); container != nil { @@ -846,195 +833,111 @@ func TestCheckpointReconciler_HandleCreating(t *testing.T) { assert.True(t, metav1.IsControlledBy(snap, ckpt)) }) - t.Run("succeeded job transitions to Ready", func(t *testing.T) { - ckpt := makeCreatingCkpt(testHash, defaultCheckpointJobName) - job := &batchv1.Job{ + // ownedSnapshot returns a Snapshot owned by ckpt and bound to a SnapshotContent, + // carrying the given terminal condition (empty type leaves it Pending). 
+ ownedSnapshot := func(ckpt *nvidiacomv1alpha1.DynamoCheckpoint, condType string) *nvidiacomv1alpha1.Snapshot { + bound := "snapshotcontent-" + testHash + snap := &nvidiacomv1alpha1.Snapshot{ ObjectMeta: metav1.ObjectMeta{ - Name: defaultCheckpointJobName, - Namespace: testNamespace, - Annotations: map[string]string{snapshotprotocol.CheckpointStatusAnnotation: snapshotprotocol.CheckpointStatusCompleted}, + Name: snapshotName(testHash), + Namespace: testNamespace, + OwnerReferences: []metav1.OwnerReference{{ + APIVersion: nvidiacomv1alpha1.GroupVersion.String(), + Kind: "DynamoCheckpoint", + Name: ckpt.Name, + UID: ckpt.UID, + Controller: ptr.To(true), + }}, }, - Status: batchv1.JobStatus{ - Succeeded: 1, - Conditions: []batchv1.JobCondition{ - {Type: batchv1.JobComplete, Status: corev1.ConditionTrue, LastTransitionTime: metav1.Now()}, - }, + Spec: nvidiacomv1alpha1.SnapshotSpec{ + CheckpointID: testHash, + Source: nvidiacomv1alpha1.SnapshotSource{PodRef: nvidiacomv1alpha1.PodReference{Name: "worker-0"}}, }, + Status: nvidiacomv1alpha1.SnapshotStatus{BoundSnapshotContentName: &bound}, } + if condType != "" { + snap.Status.Conditions = []metav1.Condition{{ + Type: condType, + Status: metav1.ConditionTrue, + Reason: "Test", + Message: condType + " from agent", + }} + } + return snap + } - r := makeCheckpointReconciler(s, ckpt, job, newOwnedPod(podNameFromJob(job.Name), job)) + t.Run("Snapshot Ready transitions checkpoint to Ready", func(t *testing.T) { + ckpt := makeCreatingCkpt(testHash, defaultCheckpointJobName) + job := newCheckpointJob(defaultCheckpointJobName) + snap := ownedSnapshot(ckpt, nvidiacomv1alpha1.SnapshotConditionReady) + + r := makeCheckpointReconciler(s, ckpt, job, snap, newOwnedPod(podNameFromJob(job.Name), job)) _, err := r.handleCreating(ctx, ckpt) require.NoError(t, err) updated := &nvidiacomv1alpha1.DynamoCheckpoint{} require.NoError(t, r.Get(ctx, types.NamespacedName{Name: testHash, Namespace: testNamespace}, updated)) assert.Equal(t, 
nvidiacomv1alpha1.DynamoCheckpointPhaseReady, updated.Status.Phase) + assert.Equal(t, testHash, updated.Status.CheckpointID) assert.NotNil(t, updated.Status.CreatedAt) }) - t.Run("failed job transitions to Failed", func(t *testing.T) { - ckpt := makeCreatingCkpt(testHash, "job-fail") - job := &batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{Name: "job-fail", Namespace: testNamespace}, - Status: batchv1.JobStatus{ - Conditions: []batchv1.JobCondition{{Type: batchv1.JobFailed, Status: corev1.ConditionTrue}}, - }, - } - - r := makeCheckpointReconciler(s, ckpt, job, newOwnedPod(podNameFromJob(job.Name), job)) - _, err := r.handleCreating(ctx, ckpt) - require.NoError(t, err) - - updated := &nvidiacomv1alpha1.DynamoCheckpoint{} - require.NoError(t, r.Get(ctx, types.NamespacedName{Name: testHash, Namespace: testNamespace}, updated)) - assert.Equal(t, nvidiacomv1alpha1.DynamoCheckpointPhaseFailed, updated.Status.Phase) - }) - - t.Run("completed job without completion annotation waits while lease is active", func(t *testing.T) { - ckpt := makeCreatingCkpt(testHash, "job-missing-status-active-lease") - completionTime := metav1.NewTime(time.Now()) - job := &batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{Name: "job-missing-status-active-lease", Namespace: testNamespace}, - Status: batchv1.JobStatus{ - Succeeded: 1, - CompletionTime: &completionTime, - Conditions: []batchv1.JobCondition{ - {Type: batchv1.JobComplete, Status: corev1.ConditionTrue, LastTransitionTime: completionTime}, - }, - }, - } - lease := makeCheckpointLease("job-missing-status-active-lease", time.Now(), 30) - - r := makeCheckpointReconciler(s, ckpt, job, lease, newOwnedPod(podNameFromJob(job.Name), job)) - result, err := r.handleCreating(ctx, ckpt) - require.NoError(t, err) - assert.Equal(t, time.Second, result.RequeueAfter) - - updated := &nvidiacomv1alpha1.DynamoCheckpoint{} - require.NoError(t, r.Get(ctx, types.NamespacedName{Name: testHash, Namespace: testNamespace}, updated)) - assert.Equal(t, 
nvidiacomv1alpha1.DynamoCheckpointPhaseCreating, updated.Status.Phase) - }) + t.Run("Snapshot Failed transitions checkpoint to Failed", func(t *testing.T) { + ckpt := makeCreatingCkpt(testHash, defaultCheckpointJobName) + job := newCheckpointJob(defaultCheckpointJobName) + snap := ownedSnapshot(ckpt, nvidiacomv1alpha1.SnapshotConditionFailed) - t.Run("completed job without completion annotation transitions to Failed once lease expires", func(t *testing.T) { - ckpt := makeCreatingCkpt(testHash, "job-missing-status") - completionTime := metav1.NewTime(time.Now().Add(-time.Minute)) - job := &batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{Name: "job-missing-status", Namespace: testNamespace}, - Status: batchv1.JobStatus{ - Succeeded: 1, - CompletionTime: &completionTime, - Conditions: []batchv1.JobCondition{ - {Type: batchv1.JobComplete, Status: corev1.ConditionTrue, LastTransitionTime: completionTime}, - }, - }, - } - r := makeCheckpointReconciler(s, ckpt, job, newOwnedPod(podNameFromJob(job.Name), job)) + r := makeCheckpointReconciler(s, ckpt, job, snap, newOwnedPod(podNameFromJob(job.Name), job)) _, err := r.handleCreating(ctx, ckpt) require.NoError(t, err) updated := &nvidiacomv1alpha1.DynamoCheckpoint{} require.NoError(t, r.Get(ctx, types.NamespacedName{Name: testHash, Namespace: testNamespace}, updated)) assert.Equal(t, nvidiacomv1alpha1.DynamoCheckpointPhaseFailed, updated.Status.Phase) - assert.Contains(t, updated.Status.Message, "without snapshot-agent completion confirmation") + assert.Contains(t, updated.Status.Message, "from agent") }) - t.Run("completed job with failed completion annotation transitions to Failed", func(t *testing.T) { - ckpt := makeCreatingCkpt(testHash, "job-agent-failed") - job := &batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{ - Name: "job-agent-failed", - Namespace: testNamespace, - Annotations: map[string]string{snapshotprotocol.CheckpointStatusAnnotation: snapshotprotocol.CheckpointStatusFailed}, - }, - Status: batchv1.JobStatus{ - 
Succeeded: 1, - Conditions: []batchv1.JobCondition{ - {Type: batchv1.JobComplete, Status: corev1.ConditionTrue, LastTransitionTime: metav1.Now()}, - }, - }, + t.Run("failed Job while Snapshot non-terminal transitions to Failed", func(t *testing.T) { + ckpt := makeCreatingCkpt(testHash, defaultCheckpointJobName) + job := newCheckpointJob(defaultCheckpointJobName) + job.Status = batchv1.JobStatus{ + Conditions: []batchv1.JobCondition{{Type: batchv1.JobFailed, Status: corev1.ConditionTrue, Message: "deadline"}}, } + snap := ownedSnapshot(ckpt, "") - r := makeCheckpointReconciler(s, ckpt, job, newOwnedPod(podNameFromJob(job.Name), job)) + r := makeCheckpointReconciler(s, ckpt, job, snap, newOwnedPod(podNameFromJob(job.Name), job)) _, err := r.handleCreating(ctx, ckpt) require.NoError(t, err) updated := &nvidiacomv1alpha1.DynamoCheckpoint{} require.NoError(t, r.Get(ctx, types.NamespacedName{Name: testHash, Namespace: testNamespace}, updated)) assert.Equal(t, nvidiacomv1alpha1.DynamoCheckpointPhaseFailed, updated.Status.Phase) - assert.Contains(t, updated.Status.Message, "snapshot-agent reported checkpoint failure") }) - t.Run("running job with failed checkpoint annotation transitions to Failed", func(t *testing.T) { - ckpt := makeCreatingCkpt(testHash, "job-running-agent-failed") - job := &batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{ - Name: "job-running-agent-failed", - Namespace: testNamespace, - Annotations: map[string]string{snapshotprotocol.CheckpointStatusAnnotation: snapshotprotocol.CheckpointStatusFailed}, - }, - Status: batchv1.JobStatus{Active: 1}, - } + t.Run("past deadline without terminal Snapshot transitions to Failed", func(t *testing.T) { + ckpt := makeCreatingCkpt(testHash, defaultCheckpointJobName) + job := newCheckpointJob(defaultCheckpointJobName) + job.CreationTimestamp = metav1.NewTime(time.Now().Add(-time.Hour)) + job.Spec.ActiveDeadlineSeconds = ptr.To(int64(60)) + snap := ownedSnapshot(ckpt, "") - r := makeCheckpointReconciler(s, ckpt, job, 
newOwnedPod(podNameFromJob(job.Name), job)) + r := makeCheckpointReconciler(s, ckpt, job, snap, newOwnedPod(podNameFromJob(job.Name), job)) _, err := r.handleCreating(ctx, ckpt) require.NoError(t, err) updated := &nvidiacomv1alpha1.DynamoCheckpoint{} require.NoError(t, r.Get(ctx, types.NamespacedName{Name: testHash, Namespace: testNamespace}, updated)) assert.Equal(t, nvidiacomv1alpha1.DynamoCheckpointPhaseFailed, updated.Status.Phase) - assert.Equal(t, "Checkpoint job failed", updated.Status.Message) - }) - - t.Run("running job keeps Creating phase", func(t *testing.T) { - ckpt := makeCreatingCkpt(testHash, "job-run") - job := &batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{Name: "job-run", Namespace: testNamespace}, - Status: batchv1.JobStatus{Active: 1}, - } - - r := makeCheckpointReconciler(s, ckpt, job, newOwnedPod(podNameFromJob(job.Name), job)) - _, err := r.handleCreating(ctx, ckpt) - require.NoError(t, err) - - updated := &nvidiacomv1alpha1.DynamoCheckpoint{} - require.NoError(t, r.Get(ctx, types.NamespacedName{Name: testHash, Namespace: testNamespace}, updated)) - assert.Equal(t, nvidiacomv1alpha1.DynamoCheckpointPhaseCreating, updated.Status.Phase) + assert.Contains(t, updated.Status.Message, "deadline") }) - t.Run("in-flight version changes do not relabel the running job's artifact", func(t *testing.T) { + t.Run("Snapshot not yet found requeues without changing phase", func(t *testing.T) { ckpt := makeCreatingCkpt(testHash, defaultCheckpointJobName) - ckpt.Annotations = map[string]string{snapshotprotocol.CheckpointArtifactVersionAnnotation: "2"} - job := &batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{ - Name: defaultCheckpointJobName, - Namespace: testNamespace, - Annotations: map[string]string{snapshotprotocol.CheckpointStatusAnnotation: snapshotprotocol.CheckpointStatusCompleted}, - }, - Status: batchv1.JobStatus{ - Succeeded: 1, - Conditions: []batchv1.JobCondition{ - {Type: batchv1.JobComplete, Status: corev1.ConditionTrue, LastTransitionTime: 
metav1.Now()}, - }, - }, - } - - r := makeCheckpointReconciler(s, ckpt, job, newOwnedPod(podNameFromJob(job.Name), job)) - _, err := r.handleCreating(ctx, ckpt) - require.NoError(t, err) - - updated := &nvidiacomv1alpha1.DynamoCheckpoint{} - require.NoError(t, r.Get(ctx, types.NamespacedName{Name: testHash, Namespace: testNamespace}, updated)) - assert.Equal(t, nvidiacomv1alpha1.DynamoCheckpointPhaseReady, updated.Status.Phase) - }) - - t.Run("succeeded count without complete condition keeps Creating phase", func(t *testing.T) { - ckpt := makeCreatingCkpt(testHash, "job-succeeded-not-complete") - job := &batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{Name: "job-succeeded-not-complete", Namespace: testNamespace}, - Status: batchv1.JobStatus{Succeeded: 1}, - } + job := newCheckpointJob(defaultCheckpointJobName) r := makeCheckpointReconciler(s, ckpt, job, newOwnedPod(podNameFromJob(job.Name), job)) + // ensureSnapshot will create the Snapshot; without a status it stays Pending, + // so the checkpoint remains Creating. _, err := r.handleCreating(ctx, ckpt) require.NoError(t, err) diff --git a/deploy/operator/internal/controller/snapshot_reconciler.go b/deploy/operator/internal/controller/snapshot_reconciler.go new file mode 100644 index 000000000000..6377960b306a --- /dev/null +++ b/deploy/operator/internal/controller/snapshot_reconciler.go @@ -0,0 +1,357 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package controller + +import ( + "context" + "errors" + "fmt" + "math/rand" + "time" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/validation" + "k8s.io/client-go/tools/cache" + "k8s.io/client-go/tools/record" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" + snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol" +) + +const ( + // snapshotFinalizer guards SnapshotContent cleanup before a Snapshot is removed. + snapshotFinalizer = "nvidia.com/snapshot-content-cleanup" + + // snapshotContentFieldManager is the Server-Side Apply field owner for SnapshotContents. + snapshotContentFieldManager = "dynamo-snapshot-controller" + + // snapshotPodResolveBackoffBase is the minimum requeue delay while waiting for the + // source pod to be scheduled; jitter is added on top to avoid a synchronized hot loop. + snapshotPodResolveBackoffBase = 2 * time.Second + + // snapshotContentDeleteRequeue is the delay between cascade-delete progress checks. + snapshotContentDeleteRequeue = time.Second + + // maxResourceNameLength is the Kubernetes object name limit (RFC 1123 subdomain). 
+ maxResourceNameLength = 253 +) + +// errSnapshotPodUnscheduled signals that the source pod is not yet scheduled and the +// reconcile should retry with backoff rather than fail. +var errSnapshotPodUnscheduled = errors.New("source pod is not yet scheduled to a node") + +// SnapshotReconciler reconciles a Snapshot: it creates the bound, cluster-scoped +// SnapshotContent work order for the node agent, mirrors the agent's terminal status +// back to the Snapshot, and cascades deletion to the SnapshotContent. +type SnapshotReconciler struct { + client.Client + Recorder record.EventRecorder +} + +// +kubebuilder:rbac:groups=nvidia.com,resources=snapshots,verbs=get;list;watch;update;patch +// +kubebuilder:rbac:groups=nvidia.com,resources=snapshots/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=nvidia.com,resources=snapshots/finalizers,verbs=update +// +kubebuilder:rbac:groups=nvidia.com,resources=snapshotcontents,verbs=create;get;list;watch;update;patch;delete +// +kubebuilder:rbac:groups=nvidia.com,resources=snapshotcontents/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=nvidia.com,resources=snapshotcontents/finalizers,verbs=update +// +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch + +// Reconcile drives a Snapshot through binding, status mirroring, and cascade deletion. 
+func (sr *SnapshotReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + logger := log.FromContext(ctx) + + snap := &nvidiacomv1alpha1.Snapshot{} + if err := sr.Get(ctx, req.NamespacedName, snap); err != nil { + return ctrl.Result{}, client.IgnoreNotFound(err) + } + + if !snap.GetDeletionTimestamp().IsZero() { + return sr.handleDelete(ctx, snap) + } + + if !controllerutil.ContainsFinalizer(snap, snapshotFinalizer) { + controllerutil.AddFinalizer(snap, snapshotFinalizer) + if err := sr.Update(ctx, snap); err != nil { + return ctrl.Result{}, fmt.Errorf("add snapshot finalizer: %w", err) + } + return ctrl.Result{}, nil + } + + pod, err := sr.resolveSourcePod(ctx, snap) + if err != nil { + if errors.Is(err, errSnapshotPodUnscheduled) || apierrors.IsNotFound(err) { + logger.V(1).Info("Source pod not ready, backing off", "snapshot", snap.Name, "reason", err.Error()) + return ctrl.Result{RequeueAfter: jitteredBackoff(snapshotPodResolveBackoffBase)}, nil + } + return ctrl.Result{}, err + } + + contentName := snapshotContentName(snap.Spec.CheckpointID) + if errs := validation.IsDNS1123Subdomain(contentName); len(errs) > 0 || len(contentName) > maxResourceNameLength { + return sr.failSnapshot(ctx, snap, "InvalidContentName", + fmt.Errorf("composed SnapshotContent name %q is invalid: too long or not a DNS subdomain", contentName)) + } + + bound, err := sr.findBoundContent(ctx, contentName) + if err != nil { + return ctrl.Result{}, err + } + if bound != nil && bound.Spec.Source.NodeName != pod.Spec.NodeName { + return sr.failSnapshot(ctx, snap, "PodRescheduled", + fmt.Errorf("source pod moved from node %q to %q; CRIU checkpoint cannot survive migration", + bound.Spec.Source.NodeName, pod.Spec.NodeName)) + } + + if err := sr.ensureSnapshotContent(ctx, snap, contentName, pod); err != nil { + return ctrl.Result{}, err + } + + if err := sr.bindAndMirror(ctx, snap, contentName); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{}, nil 
+} + +// resolveSourcePod loads the source pod and requires it be scheduled to a node. +func (sr *SnapshotReconciler) resolveSourcePod(ctx context.Context, snap *nvidiacomv1alpha1.Snapshot) (*corev1.Pod, error) { + pod := &corev1.Pod{} + key := client.ObjectKey{Namespace: snap.Namespace, Name: snap.Spec.Source.PodRef.Name} + if err := sr.Get(ctx, key, pod); err != nil { + return nil, err + } + if pod.Spec.NodeName == "" { + return nil, errSnapshotPodUnscheduled + } + return pod, nil +} + +// findBoundContent returns the bound SnapshotContent if it already exists, or nil. +func (sr *SnapshotReconciler) findBoundContent(ctx context.Context, contentName string) (*nvidiacomv1alpha1.SnapshotContent, error) { + content := &nvidiacomv1alpha1.SnapshotContent{} + if err := sr.Get(ctx, client.ObjectKey{Name: contentName}, content); err != nil { + if apierrors.IsNotFound(err) { + return nil, nil + } + return nil, err + } + return content, nil +} + +// ensureSnapshotContent applies the SnapshotContent work order via a single Server-Side +// Apply carrying source, the node mirror label, storage-coord metadata, and the finalizer. +func (sr *SnapshotReconciler) ensureSnapshotContent(ctx context.Context, snap *nvidiacomv1alpha1.Snapshot, contentName string, pod *corev1.Pod) error { + content := sr.buildSnapshotContent(snap, contentName, pod) + if err := sr.Patch(ctx, content, client.Apply, + client.FieldOwner(snapshotContentFieldManager), client.ForceOwnership); err != nil { + sr.Recorder.Event(snap, corev1.EventTypeWarning, "SnapshotContentCreateFailed", err.Error()) + return fmt.Errorf("apply SnapshotContent %q: %w", contentName, err) + } + return nil +} + +// buildSnapshotContent constructs the desired cluster-scoped SnapshotContent for a Snapshot. 
+func (sr *SnapshotReconciler) buildSnapshotContent(snap *nvidiacomv1alpha1.Snapshot, contentName string, pod *corev1.Pod) *nvidiacomv1alpha1.SnapshotContent { + return &nvidiacomv1alpha1.SnapshotContent{ + TypeMeta: metav1.TypeMeta{ + APIVersion: nvidiacomv1alpha1.GroupVersion.String(), + Kind: "SnapshotContent", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: contentName, + Labels: map[string]string{ + snapshotprotocol.SnapshotNodeLabel: pod.Spec.NodeName, + snapshotprotocol.CheckpointIDLabel: snap.Spec.CheckpointID, + }, + Annotations: map[string]string{ + snapshotprotocol.CheckpointArtifactVersionAnnotation: snapshotprotocol.ArtifactVersion(snap.Annotations[snapshotprotocol.CheckpointArtifactVersionAnnotation]), + }, + Finalizers: []string{snapshotFinalizer}, + }, + Spec: nvidiacomv1alpha1.SnapshotContentSpec{ + SnapshotRef: nvidiacomv1alpha1.SnapshotReference{ + Namespace: snap.Namespace, + Name: snap.Name, + UID: snap.UID, + }, + Source: nvidiacomv1alpha1.SnapshotContentSource{ + PodRef: nvidiacomv1alpha1.PodReference{Name: pod.Name, UID: pod.UID}, + NodeName: pod.Spec.NodeName, + }, + }, + } +} + +// bindAndMirror records the binding and mirrors the SnapshotContent's terminal status to +// the Snapshot, defaulting to a Pending condition until the agent writes a result. 
+func (sr *SnapshotReconciler) bindAndMirror(ctx context.Context, snap *nvidiacomv1alpha1.Snapshot, contentName string) error { + content := &nvidiacomv1alpha1.SnapshotContent{} + if err := sr.Get(ctx, client.ObjectKey{Name: contentName}, content); err != nil { + return client.IgnoreNotFound(err) + } + + changed := false + if snap.Status.BoundSnapshotContentName == nil || *snap.Status.BoundSnapshotContentName != contentName { + snap.Status.BoundSnapshotContentName = &contentName + changed = true + } + + ready := meta.FindStatusCondition(content.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionReady) + failed := meta.FindStatusCondition(content.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) + switch { + case ready != nil && ready.Status == metav1.ConditionTrue: + changed = sr.setCondition(snap, nvidiacomv1alpha1.SnapshotConditionReady, metav1.ConditionTrue, ready.Reason, ready.Message) || changed + case failed != nil && failed.Status == metav1.ConditionTrue: + changed = sr.setCondition(snap, nvidiacomv1alpha1.SnapshotConditionFailed, metav1.ConditionTrue, failed.Reason, failed.Message) || changed + default: + changed = sr.setCondition(snap, nvidiacomv1alpha1.SnapshotConditionReady, metav1.ConditionFalse, "Pending", "Waiting for node agent to capture the checkpoint") || changed + } + + if !changed { + return nil + } + if err := sr.Status().Update(ctx, snap); err != nil { + return fmt.Errorf("update snapshot status: %w", err) + } + return nil +} + +// setCondition sets a status condition and reports whether it changed. +func (sr *SnapshotReconciler) setCondition(snap *nvidiacomv1alpha1.Snapshot, condType string, status metav1.ConditionStatus, reason, message string) bool { + return meta.SetStatusCondition(&snap.Status.Conditions, metav1.Condition{ + Type: condType, + Status: status, + Reason: reason, + Message: message, + }) +} + +// failSnapshot marks the Snapshot Failed terminally and records an event. 
+func (sr *SnapshotReconciler) failSnapshot(ctx context.Context, snap *nvidiacomv1alpha1.Snapshot, reason string, cause error) (ctrl.Result, error) { + sr.Recorder.Event(snap, corev1.EventTypeWarning, reason, cause.Error()) + sr.setCondition(snap, nvidiacomv1alpha1.SnapshotConditionFailed, metav1.ConditionTrue, reason, cause.Error()) + if err := sr.Status().Update(ctx, snap); err != nil { + return ctrl.Result{}, fmt.Errorf("mark snapshot failed: %w", err) + } + return ctrl.Result{}, nil +} + +// handleDelete cascades deletion to the bound SnapshotContent, waits for it to be gone, +// then drops the Snapshot finalizer. +func (sr *SnapshotReconciler) handleDelete(ctx context.Context, snap *nvidiacomv1alpha1.Snapshot) (ctrl.Result, error) { + if !controllerutil.ContainsFinalizer(snap, snapshotFinalizer) { + return ctrl.Result{}, nil + } + + contentName := snapshotContentName(snap.Spec.CheckpointID) + content := &nvidiacomv1alpha1.SnapshotContent{} + err := sr.Get(ctx, client.ObjectKey{Name: contentName}, content) + switch { + case err == nil: + // Clear the controller finalizer first so the subsequent Delete is not + // blocked, then issue the Delete. The reconcile requeues until the + // SnapshotContent is fully gone. + if controllerutil.ContainsFinalizer(content, snapshotFinalizer) { + controllerutil.RemoveFinalizer(content, snapshotFinalizer) + if updErr := sr.Update(ctx, content); updErr != nil && !apierrors.IsNotFound(updErr) { + return ctrl.Result{}, fmt.Errorf("clear SnapshotContent %q finalizer: %w", contentName, updErr) + } + } + if content.GetDeletionTimestamp().IsZero() { + if delErr := sr.Delete(ctx, content); delErr != nil && !apierrors.IsNotFound(delErr) { + return ctrl.Result{}, fmt.Errorf("delete SnapshotContent %q: %w", contentName, delErr) + } + } + return ctrl.Result{RequeueAfter: snapshotContentDeleteRequeue}, nil + case apierrors.IsNotFound(err): + // SnapshotContent gone; drop the Snapshot finalizer. 
+ default: + return ctrl.Result{}, fmt.Errorf("get SnapshotContent %q: %w", contentName, err) + } + + controllerutil.RemoveFinalizer(snap, snapshotFinalizer) + if err := sr.Update(ctx, snap); err != nil { + return ctrl.Result{}, fmt.Errorf("remove snapshot finalizer: %w", err) + } + return ctrl.Result{}, nil +} + +// SetupWithManager wires the controller: it reconciles Snapshots and watches SnapshotContents, +// mapping a SnapshotContent back to its bound Snapshot via spec.snapshotRef. +func (sr *SnapshotReconciler) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&nvidiacomv1alpha1.Snapshot{}). + Watches( + &nvidiacomv1alpha1.SnapshotContent{}, + handler.EnqueueRequestsFromMapFunc(snapshotContentToSnapshot), + builder.WithPredicates(predicate.Funcs{ + CreateFunc: func(event.CreateEvent) bool { return false }, + UpdateFunc: func(ue event.UpdateEvent) bool { return true }, + DeleteFunc: func(event.DeleteEvent) bool { return true }, + GenericFunc: func(event.GenericEvent) bool { return false }, + }), + ). + Complete(sr) +} + +// snapshotContentToSnapshot maps a SnapshotContent (including a delete-event tombstone) back +// to its bound Snapshot. It MUST unwrap cache.DeletedFinalStateUnknown so that the final +// SnapshotContent delete still re-enqueues the Snapshot and the cascade can complete. +func snapshotContentToSnapshot(_ context.Context, obj client.Object) []reconcile.Request { + ref, ok := snapshotRefFromContentObj(obj) + if !ok || ref.Name == "" { + return nil + } + return []reconcile.Request{{NamespacedName: types.NamespacedName{Namespace: ref.Namespace, Name: ref.Name}}} +} + +// snapshotRefFromContentObj extracts the bound Snapshot reference from a SnapshotContent, +// unwrapping a cache.DeletedFinalStateUnknown tombstone first so the final delete event +// still re-enqueues the Snapshot and the cascade can complete (F-2.2). 
+func snapshotRefFromContentObj(obj any) (nvidiacomv1alpha1.SnapshotReference, bool) { + if tombstone, isTombstone := obj.(cache.DeletedFinalStateUnknown); isTombstone { + obj = tombstone.Obj + } + content, ok := obj.(*nvidiacomv1alpha1.SnapshotContent) + if !ok { + return nvidiacomv1alpha1.SnapshotReference{}, false + } + return content.Spec.SnapshotRef, true +} + +// snapshotContentName composes the deterministic cluster-scoped SnapshotContent name. +func snapshotContentName(checkpointID string) string { + return "snapshotcontent-" + checkpointID +} + +// jitteredBackoff adds up to 50% jitter to a base delay to avoid synchronized requeues. +func jitteredBackoff(base time.Duration) time.Duration { + return base + time.Duration(rand.Int63n(int64(base/2)+1)) +} diff --git a/deploy/operator/internal/controller/snapshot_reconciler_test.go b/deploy/operator/internal/controller/snapshot_reconciler_test.go new file mode 100644 index 000000000000..95bd75876677 --- /dev/null +++ b/deploy/operator/internal/controller/snapshot_reconciler_test.go @@ -0,0 +1,258 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package controller + +import ( + "context" + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/cache" + "k8s.io/client-go/tools/record" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + + nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" + snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol" +) + +func snapshotReconcilerScheme() *runtime.Scheme { + s := runtime.NewScheme() + _ = nvidiacomv1alpha1.AddToScheme(s) + _ = corev1.AddToScheme(s) + return s +} + +func makeSnapshotReconciler(s *runtime.Scheme, objs ...client.Object) *SnapshotReconciler { + return &SnapshotReconciler{ + Client: fake.NewClientBuilder().WithScheme(s).WithObjects(objs...). 
+ WithStatusSubresource(&nvidiacomv1alpha1.Snapshot{}, &nvidiacomv1alpha1.SnapshotContent{}).Build(), + Recorder: record.NewFakeRecorder(10), + } +} + +func makeSnapshotForReconcile(checkpointID, podName string) *nvidiacomv1alpha1.Snapshot { + return &nvidiacomv1alpha1.Snapshot{ + ObjectMeta: metav1.ObjectMeta{ + Name: "snapshot-" + checkpointID, + Namespace: "inference", + UID: types.UID("snap-uid"), + Finalizers: []string{snapshotFinalizer}, + Annotations: map[string]string{snapshotprotocol.CheckpointArtifactVersionAnnotation: "3"}, + }, + Spec: nvidiacomv1alpha1.SnapshotSpec{ + CheckpointID: checkpointID, + Source: nvidiacomv1alpha1.SnapshotSource{PodRef: nvidiacomv1alpha1.PodReference{Name: podName}}, + }, + } +} + +func scheduledPod(name, node string) *corev1.Pod { + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: "inference", UID: types.UID("pod-uid-9")}, + Spec: corev1.PodSpec{NodeName: node}, + } +} + +func reconcileSnapshot(t *testing.T, r *SnapshotReconciler, name string) ctrl.Result { + t.Helper() + res, err := r.Reconcile(context.Background(), + ctrl.Request{NamespacedName: types.NamespacedName{Namespace: "inference", Name: name}}) + require.NoError(t, err) + return res +} + +func TestSnapshotReconciler_PodUnscheduledBacksOff(t *testing.T) { + s := snapshotReconcilerScheme() + snap := makeSnapshotForReconcile("abc123", "worker-0") + pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "worker-0", Namespace: "inference"}} + r := makeSnapshotReconciler(s, snap, pod) + + res := reconcileSnapshot(t, r, snap.Name) + assert.Positive(t, res.RequeueAfter) + + var contents nvidiacomv1alpha1.SnapshotContentList + require.NoError(t, r.List(context.Background(), &contents)) + assert.Empty(t, contents.Items) +} + +func TestSnapshotReconciler_BuildsWorkOrderAndBinds(t *testing.T) { + s := snapshotReconcilerScheme() + snap := makeSnapshotForReconcile("abc123", "worker-0") + r := makeSnapshotReconciler(s, snap, scheduledPod("worker-0", 
"node-a")) + + reconcileSnapshot(t, r, snap.Name) + + content := &nvidiacomv1alpha1.SnapshotContent{} + require.NoError(t, r.Get(context.Background(), types.NamespacedName{Name: "snapshotcontent-abc123"}, content)) + assert.Equal(t, "worker-0", content.Spec.Source.PodRef.Name) + assert.Equal(t, types.UID("pod-uid-9"), content.Spec.Source.PodRef.UID) + assert.Equal(t, "node-a", content.Spec.Source.NodeName) + assert.Equal(t, "node-a", content.Labels[snapshotprotocol.SnapshotNodeLabel]) + assert.Equal(t, "abc123", content.Labels[snapshotprotocol.CheckpointIDLabel]) + assert.Equal(t, "3", content.Annotations[snapshotprotocol.CheckpointArtifactVersionAnnotation]) + assert.Contains(t, content.Finalizers, snapshotFinalizer) + assert.Equal(t, "inference", content.Spec.SnapshotRef.Namespace) + assert.Equal(t, snap.Name, content.Spec.SnapshotRef.Name) + + updated := &nvidiacomv1alpha1.Snapshot{} + require.NoError(t, r.Get(context.Background(), types.NamespacedName{Namespace: "inference", Name: snap.Name}, updated)) + require.NotNil(t, updated.Status.BoundSnapshotContentName) + assert.Equal(t, "snapshotcontent-abc123", *updated.Status.BoundSnapshotContentName) +} + +func TestSnapshotReconciler_MirrorsReadyAndFailed(t *testing.T) { + for _, tc := range []struct { + name string + condType string + wantReady metav1.ConditionStatus + }{ + {name: "ready", condType: nvidiacomv1alpha1.SnapshotConditionReady}, + {name: "failed", condType: nvidiacomv1alpha1.SnapshotConditionFailed}, + } { + t.Run(tc.name, func(t *testing.T) { + s := snapshotReconcilerScheme() + snap := makeSnapshotForReconcile("abc123", "worker-0") + content := &nvidiacomv1alpha1.SnapshotContent{ + ObjectMeta: metav1.ObjectMeta{Name: "snapshotcontent-abc123", Finalizers: []string{snapshotFinalizer}}, + Spec: nvidiacomv1alpha1.SnapshotContentSpec{ + SnapshotRef: nvidiacomv1alpha1.SnapshotReference{Namespace: "inference", Name: snap.Name}, + Source: nvidiacomv1alpha1.SnapshotContentSource{ + PodRef: 
nvidiacomv1alpha1.PodReference{Name: "worker-0"}, NodeName: "node-a", + }, + }, + Status: nvidiacomv1alpha1.SnapshotContentStatus{ + Conditions: []metav1.Condition{{Type: tc.condType, Status: metav1.ConditionTrue, Reason: "Agent", Message: "done"}}, + }, + } + r := makeSnapshotReconciler(s, snap, content, scheduledPod("worker-0", "node-a")) + + reconcileSnapshot(t, r, snap.Name) + + updated := &nvidiacomv1alpha1.Snapshot{} + require.NoError(t, r.Get(context.Background(), types.NamespacedName{Namespace: "inference", Name: snap.Name}, updated)) + cond := meta.FindStatusCondition(updated.Status.Conditions, tc.condType) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionTrue, cond.Status) + }) + } +} + +func TestSnapshotReconciler_RescheduleFailsSnapshot(t *testing.T) { + s := snapshotReconcilerScheme() + snap := makeSnapshotForReconcile("abc123", "worker-0") + content := &nvidiacomv1alpha1.SnapshotContent{ + ObjectMeta: metav1.ObjectMeta{Name: "snapshotcontent-abc123", Finalizers: []string{snapshotFinalizer}}, + Spec: nvidiacomv1alpha1.SnapshotContentSpec{ + SnapshotRef: nvidiacomv1alpha1.SnapshotReference{Namespace: "inference", Name: snap.Name}, + Source: nvidiacomv1alpha1.SnapshotContentSource{PodRef: nvidiacomv1alpha1.PodReference{Name: "worker-0"}, NodeName: "node-a"}, + }, + } + // Pod now runs on a different node than the bound content. 
+ r := makeSnapshotReconciler(s, snap, content, scheduledPod("worker-0", "node-b")) + + reconcileSnapshot(t, r, snap.Name) + + updated := &nvidiacomv1alpha1.Snapshot{} + require.NoError(t, r.Get(context.Background(), types.NamespacedName{Namespace: "inference", Name: snap.Name}, updated)) + cond := meta.FindStatusCondition(updated.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionTrue, cond.Status) + assert.Equal(t, "PodRescheduled", cond.Reason) +} + +func TestSnapshotReconciler_ComposedNameTooLongFails(t *testing.T) { + s := snapshotReconcilerScheme() + longID := strings.Repeat("a", 250) // "snapshotcontent-" + 250 = 266 > 253 + snap := &nvidiacomv1alpha1.Snapshot{ + ObjectMeta: metav1.ObjectMeta{Name: "snapshot-x", Namespace: "inference", Finalizers: []string{snapshotFinalizer}}, + Spec: nvidiacomv1alpha1.SnapshotSpec{ + CheckpointID: longID, + Source: nvidiacomv1alpha1.SnapshotSource{PodRef: nvidiacomv1alpha1.PodReference{Name: "worker-0"}}, + }, + } + r := makeSnapshotReconciler(s, snap, scheduledPod("worker-0", "node-a")) + + reconcileSnapshot(t, r, snap.Name) + + updated := &nvidiacomv1alpha1.Snapshot{} + require.NoError(t, r.Get(context.Background(), types.NamespacedName{Namespace: "inference", Name: snap.Name}, updated)) + cond := meta.FindStatusCondition(updated.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) + require.NotNil(t, cond) + assert.Equal(t, "InvalidContentName", cond.Reason) +} + +func TestSnapshotReconciler_CascadeDelete(t *testing.T) { + s := snapshotReconcilerScheme() + now := metav1.Now() + snap := makeSnapshotForReconcile("abc123", "worker-0") + snap.DeletionTimestamp = &now + content := &nvidiacomv1alpha1.SnapshotContent{ + ObjectMeta: metav1.ObjectMeta{Name: "snapshotcontent-abc123", Finalizers: []string{snapshotFinalizer}}, + Spec: nvidiacomv1alpha1.SnapshotContentSpec{ + SnapshotRef: nvidiacomv1alpha1.SnapshotReference{Namespace: "inference", Name: 
snap.Name}, + Source: nvidiacomv1alpha1.SnapshotContentSource{PodRef: nvidiacomv1alpha1.PodReference{Name: "worker-0"}, NodeName: "node-a"}, + }, + } + r := makeSnapshotReconciler(s, snap, content) + + // First pass deletes the content and clears its finalizer; it requeues. + res := reconcileSnapshot(t, r, snap.Name) + assert.Positive(t, res.RequeueAfter) + err := r.Get(context.Background(), types.NamespacedName{Name: "snapshotcontent-abc123"}, &nvidiacomv1alpha1.SnapshotContent{}) + assert.True(t, apierrors.IsNotFound(err)) + + // Second pass drops the Snapshot finalizer now that the content is gone. + reconcileSnapshot(t, r, snap.Name) + gone := &nvidiacomv1alpha1.Snapshot{} + err = r.Get(context.Background(), types.NamespacedName{Namespace: "inference", Name: snap.Name}, gone) + if err == nil { + assert.False(t, controllerutil.ContainsFinalizer(gone, snapshotFinalizer)) + } else { + assert.True(t, apierrors.IsNotFound(err)) + } +} + +func TestSnapshotContentToSnapshot_UnwrapsTombstone(t *testing.T) { + content := &nvidiacomv1alpha1.SnapshotContent{ + ObjectMeta: metav1.ObjectMeta{Name: "snapshotcontent-abc123"}, + Spec: nvidiacomv1alpha1.SnapshotContentSpec{ + SnapshotRef: nvidiacomv1alpha1.SnapshotReference{Namespace: "inference", Name: "snapshot-abc123"}, + }, + } + + direct := snapshotContentToSnapshot(context.Background(), content) + require.Len(t, direct, 1) + assert.Equal(t, "snapshot-abc123", direct[0].Name) + + tombstone := cache.DeletedFinalStateUnknown{Key: "snapshotcontent-abc123", Obj: content} + ref, ok := snapshotRefFromContentObj(tombstone) + require.True(t, ok) + assert.Equal(t, "snapshot-abc123", ref.Name) + assert.Equal(t, "inference", ref.Namespace) +} From c00707126ae7fdbe471e508854788af542bef63d Mon Sep 17 00:00:00 2001 From: Ron Kahn Date: Mon, 15 Jun 2026 17:37:56 +0300 Subject: [PATCH 03/14] refactor(snapshot): remove Job-observation checkpoint completion Signed-off-by: Ron Kahn --- deploy/snapshot/protocol/checkpoint.go | 74 
-------------- .../protocol/checkpoint_observation_test.go | 99 ------------------- deploy/snapshot/protocol/common.go | 4 + 3 files changed, 4 insertions(+), 173 deletions(-) delete mode 100644 deploy/snapshot/protocol/checkpoint_observation_test.go diff --git a/deploy/snapshot/protocol/checkpoint.go b/deploy/snapshot/protocol/checkpoint.go index 14221be301bd..80f37da74286 100644 --- a/deploy/snapshot/protocol/checkpoint.go +++ b/deploy/snapshot/protocol/checkpoint.go @@ -24,21 +24,6 @@ type CheckpointJobOptions struct { WrapLaunchJob bool } -type CheckpointObservationPhase string - -const ( - CheckpointObservationPhaseRunning CheckpointObservationPhase = "running" - CheckpointObservationPhaseWaitingForConfirmation CheckpointObservationPhase = "waiting_for_confirmation" - CheckpointObservationPhaseReady CheckpointObservationPhase = "ready" - CheckpointObservationPhaseFailed CheckpointObservationPhase = "failed" -) - -type CheckpointObservation struct { - Phase CheckpointObservationPhase - Reason string - Message string -} - func GetCheckpointJobName(checkpointID string, artifactVersion string) string { return "checkpoint-job-" + checkpointID + "-" + ArtifactVersion(artifactVersion) } @@ -126,65 +111,6 @@ func NewCheckpointJob(podTemplate *corev1.PodTemplateSpec, opts CheckpointJobOpt }, nil } -func ObserveCheckpointJob(job *batchv1.Job, checkpointWorkerActive bool) CheckpointObservation { - jobComplete := false - jobFailed := false - for _, condition := range job.Status.Conditions { - if condition.Status != corev1.ConditionTrue { - continue - } - if condition.Type == batchv1.JobComplete { - jobComplete = true - continue - } - if condition.Type == batchv1.JobFailed { - jobFailed = true - } - } - - status := job.Annotations[CheckpointStatusAnnotation] - if status == CheckpointStatusFailed { - observation := CheckpointObservation{ - Phase: CheckpointObservationPhaseFailed, - Reason: "JobFailed", - Message: "Checkpoint job failed", - } - if jobComplete { - 
observation.Reason = "CheckpointVerificationFailed" - observation.Message = "Checkpoint job completed but snapshot-agent reported checkpoint failure" - } - return observation - } - - if jobComplete { - if status == CheckpointStatusCompleted { - return CheckpointObservation{ - Phase: CheckpointObservationPhaseReady, - Reason: "JobSucceeded", - Message: "Checkpoint job completed successfully", - } - } - if checkpointWorkerActive { - return CheckpointObservation{Phase: CheckpointObservationPhaseWaitingForConfirmation} - } - return CheckpointObservation{ - Phase: CheckpointObservationPhaseFailed, - Reason: "CheckpointVerificationFailed", - Message: "Checkpoint job completed without snapshot-agent completion confirmation", - } - } - - if jobFailed { - return CheckpointObservation{ - Phase: CheckpointObservationPhaseFailed, - Reason: "JobFailed", - Message: "Checkpoint job failed", - } - } - - return CheckpointObservation{Phase: CheckpointObservationPhaseRunning} -} - // EnsureLocalhostSeccompProfile sets the pod-level localhost seccomp profile // to the given path, allocating PodSecurityContext if needed. An empty profile // is a no-op so callers can disable injection entirely without conditional diff --git a/deploy/snapshot/protocol/checkpoint_observation_test.go b/deploy/snapshot/protocol/checkpoint_observation_test.go deleted file mode 100644 index 2b0f40f3c0fc..000000000000 --- a/deploy/snapshot/protocol/checkpoint_observation_test.go +++ /dev/null @@ -1,99 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-// SPDX-License-Identifier: Apache-2.0 - -package protocol - -import ( - "testing" - - batchv1 "k8s.io/api/batch/v1" - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -func TestObserveCheckpointJob(t *testing.T) { - makeJob := func(annotation string, conditions ...batchv1.JobCondition) *batchv1.Job { - job := &batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{}, - }, - Status: batchv1.JobStatus{ - Conditions: conditions, - }, - } - if annotation != "" { - job.Annotations[CheckpointStatusAnnotation] = annotation - } - return job - } - - tests := []struct { - name string - job *batchv1.Job - checkpointWorkerActive bool - wantPhase CheckpointObservationPhase - wantReason string - wantMessage string - }{ - { - name: "running job stays running", - job: makeJob(""), - wantPhase: CheckpointObservationPhaseRunning, - }, - { - name: "completed job with completion annotation is ready", - job: makeJob( - CheckpointStatusCompleted, - batchv1.JobCondition{Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, - ), - wantPhase: CheckpointObservationPhaseReady, - wantReason: "JobSucceeded", - wantMessage: "Checkpoint job completed successfully", - }, - { - name: "completed job waits for terminal confirmation while worker is active", - job: makeJob( - "", - batchv1.JobCondition{Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, - ), - checkpointWorkerActive: true, - wantPhase: CheckpointObservationPhaseWaitingForConfirmation, - }, - { - name: "completed job fails without confirmation once worker is inactive", - job: makeJob( - "", - batchv1.JobCondition{Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, - ), - wantPhase: CheckpointObservationPhaseFailed, - wantReason: "CheckpointVerificationFailed", - wantMessage: "Checkpoint job completed without snapshot-agent completion confirmation", - }, - { - name: "failed checkpoint annotation wins over completed job", - job: makeJob( - CheckpointStatusFailed, - 
batchv1.JobCondition{Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, - ), - checkpointWorkerActive: true, - wantPhase: CheckpointObservationPhaseFailed, - wantReason: "CheckpointVerificationFailed", - wantMessage: "Checkpoint job completed but snapshot-agent reported checkpoint failure", - }, - } - - for _, tc := range tests { - t.Run(tc.name, func(t *testing.T) { - observation := ObserveCheckpointJob(tc.job, tc.checkpointWorkerActive) - if observation.Phase != tc.wantPhase { - t.Fatalf("phase = %q, want %q", observation.Phase, tc.wantPhase) - } - if observation.Reason != tc.wantReason { - t.Fatalf("reason = %q, want %q", observation.Reason, tc.wantReason) - } - if observation.Message != tc.wantMessage { - t.Fatalf("message = %q, want %q", observation.Message, tc.wantMessage) - } - }) - } -} diff --git a/deploy/snapshot/protocol/common.go b/deploy/snapshot/protocol/common.go index 43416eb9c601..9248887ecdca 100644 --- a/deploy/snapshot/protocol/common.go +++ b/deploy/snapshot/protocol/common.go @@ -19,6 +19,10 @@ const ( CheckpointArtifactVersionAnnotation = "nvidia.com/snapshot-artifact-version" + // SnapshotNodeLabel mirrors SnapshotContent.spec.source.nodeName onto the + // object so the per-node agent's cache can label-select work for its node. + SnapshotNodeLabel = "nvidia.com/snapshot-node" + // Required comma-separated checkpoint/restore target container list. 
TargetContainersAnnotation = "nvidia.com/snapshot-target-containers" From e5a7d78e8297a12bc384d0325871ee47ef682f9c Mon Sep 17 00:00:00 2001 From: Ron Kahn Date: Mon, 15 Jun 2026 17:37:56 +0300 Subject: [PATCH 04/14] feat(snapshot): capture checkpoints via SnapshotContent work order Signed-off-by: Ron Kahn --- .../helm/charts/snapshot/templates/role.yaml | 26 +- .../snapshot/templates/rolebinding.yaml | 19 + deploy/snapshot/cmd/agent/main.go | 53 +-- deploy/snapshot/go.mod | 54 +-- deploy/snapshot/go.sum | 131 ++++--- .../internal/controller/controller.go | 340 +----------------- .../internal/controller/controller_test.go | 309 +--------------- .../snapshot/internal/controller/manager.go | 74 ++++ .../internal/controller/nodecheckpointer.go | 124 +++++++ .../controller/snapshotcontent_reconciler.go | 300 ++++++++++++++++ .../snapshotcontent_reconciler_test.go | 238 ++++++++++++ deploy/snapshot/internal/controller/util.go | 160 +++------ 12 files changed, 974 insertions(+), 854 deletions(-) create mode 100644 deploy/snapshot/internal/controller/manager.go create mode 100644 deploy/snapshot/internal/controller/nodecheckpointer.go create mode 100644 deploy/snapshot/internal/controller/snapshotcontent_reconciler.go create mode 100644 deploy/snapshot/internal/controller/snapshotcontent_reconciler_test.go diff --git a/deploy/helm/charts/snapshot/templates/role.yaml b/deploy/helm/charts/snapshot/templates/role.yaml index 282a402b51e1..1830f487b346 100644 --- a/deploy/helm/charts/snapshot/templates/role.yaml +++ b/deploy/helm/charts/snapshot/templates/role.yaml @@ -31,7 +31,31 @@ rules: - apiGroups: ["resource.k8s.io"] resources: ["resourceclaims"] verbs: ["get", "list"] -{{- else }} +{{- end }} +{{- end }} + +{{- if .Values.rbac.create }} +--- +# SnapshotContent is cluster-scoped, so the agent always needs a ClusterRole for it. +# The agent reads work orders and writes only their status; it never creates, deletes, +# or touches Snapshots (the work order is self-contained). 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "snapshot.fullname" . }}-agent-snapshotcontents + labels: + {{- include "snapshot.labels" . | nindent 4 }} +rules: + - apiGroups: ["nvidia.com"] + resources: ["snapshotcontents"] + verbs: ["get", "list", "watch", "update", "patch"] + - apiGroups: ["nvidia.com"] + resources: ["snapshotcontents/status"] + verbs: ["update", "patch"] +{{- end }} + +{{- if .Values.rbac.create }} +{{- if not .Values.rbac.namespaceRestricted }} apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: diff --git a/deploy/helm/charts/snapshot/templates/rolebinding.yaml b/deploy/helm/charts/snapshot/templates/rolebinding.yaml index b65dba17952b..f78af5e7579e 100644 --- a/deploy/helm/charts/snapshot/templates/rolebinding.yaml +++ b/deploy/helm/charts/snapshot/templates/rolebinding.yaml @@ -1,6 +1,25 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +{{- if .Values.rbac.create }} +--- +# Bind agent to the cluster-scoped SnapshotContent ClusterRole (capture work orders). +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "snapshot.fullname" . }}-agent-snapshotcontents + labels: + {{- include "snapshot.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "snapshot.fullname" . }}-agent-snapshotcontents +subjects: + - kind: ServiceAccount + name: {{ include "snapshot.serviceAccountName" . 
}} + namespace: {{ .Release.Namespace }} +{{- end }} + {{- if .Values.rbac.create }} {{- if .Values.rbac.namespaceRestricted }} apiVersion: rbac.authorization.k8s.io/v1 diff --git a/deploy/snapshot/cmd/agent/main.go b/deploy/snapshot/cmd/agent/main.go index e72bbf14f0ce..0a3c77f73f7f 100644 --- a/deploy/snapshot/cmd/agent/main.go +++ b/deploy/snapshot/cmd/agent/main.go @@ -47,11 +47,21 @@ func main() { } }() - ctx, cancel := context.WithCancel(context.Background()) + // rootCtx is cancelled on signal. The restore informer's lifetime is bound to + // informerCtx, which is only cancelled after the manager's Start returns, so the + // restore path keeps running until the capture manager has fully shut down. + rootCtx, cancel := context.WithCancel(context.Background()) defer cancel() + informerCtx, stopInformer := context.WithCancel(context.Background()) + defer stopInformer() sigChan := make(chan os.Signal, 1) signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) + go func() { + <-sigChan + agentLog.Info("Shutting down") + cancel() + }() agentLog.Info("Starting snapshot agent", "node", cfg.NodeName, @@ -59,34 +69,33 @@ func main() { "runtime", *runtimeType, ) + // Restore path: the existing node-local client-go controller. nodeController, err := controller.NewNodeController(cfg, rt, rootLog.WithName("controller")) if err != nil { fatal(agentLog, err, "Failed to create snapshot node controller") } - - // Run the node-local controller in the background. - controllerDone := make(chan error, 1) + restoreDone := make(chan error, 1) go func() { - agentLog.Info("Snapshot node controller started") - controllerDone <- nodeController.Run(ctx) + agentLog.Info("Snapshot restore controller started") + restoreDone <- nodeController.Run(informerCtx) }() - // Wait for signal or controller exit. 
- select { - case <-sigChan: - agentLog.Info("Shutting down") - cancel() - select { - case err := <-controllerDone: - if err != nil { - agentLog.Error(err, "Snapshot node controller exited with error during shutdown") - } - default: - } - case err := <-controllerDone: - if err != nil { - fatal(agentLog, err, "Snapshot node controller exited with error") - } + // Capture path: the per-node SnapshotContent controller-runtime manager. + mgr, err := controller.NewSnapshotContentManager(cfg, rt) + if err != nil { + fatal(agentLog, err, "Failed to create snapshot-content manager") + } + + agentLog.Info("Starting snapshot-content manager") + startErr := mgr.Start(rootCtx) + + // Manager has returned; now tear down the restore informer. + stopInformer() + if restoreErr := <-restoreDone; restoreErr != nil { + agentLog.Error(restoreErr, "Snapshot restore controller exited with error") + } + if startErr != nil { + fatal(agentLog, startErr, "Snapshot-content manager exited with error") } agentLog.Info("Agent stopped") diff --git a/deploy/snapshot/go.mod b/deploy/snapshot/go.mod index 3965404580e8..a4fa664d78c8 100644 --- a/deploy/snapshot/go.mod +++ b/deploy/snapshot/go.mod @@ -3,6 +3,7 @@ module github.com/ai-dynamo/dynamo/deploy/snapshot go 1.26.3 require ( + github.com/ai-dynamo/dynamo/deploy/operator v0.0.0 github.com/checkpoint-restore/go-criu/v8 v8.2.0 github.com/containerd/containerd v1.7.30 github.com/cyphar/filepath-securejoin v0.5.1 @@ -11,9 +12,10 @@ require ( github.com/google/uuid v1.6.0 github.com/moby/sys/mountinfo v0.7.1 github.com/opencontainers/runtime-spec v1.2.0 - github.com/prometheus/procfs v0.16.1 + github.com/prometheus/procfs v0.17.0 + github.com/stretchr/testify v1.11.1 go.uber.org/zap v1.27.1 - golang.org/x/sys v0.40.0 + golang.org/x/sys v0.41.0 google.golang.org/grpc v1.79.3 google.golang.org/protobuf v1.36.11 gopkg.in/yaml.v3 v3.0.1 @@ -23,7 +25,7 @@ require ( k8s.io/cri-api v0.34.3 k8s.io/cri-client v0.34.3 k8s.io/kubelet v0.34.3 - k8s.io/utils 
v0.0.0-20251002143259-bc988d571ff4 + k8s.io/utils v0.0.0-20260108192941-914a6e750570 sigs.k8s.io/controller-runtime v0.22.4 sigs.k8s.io/yaml v1.6.0 ) @@ -35,7 +37,7 @@ require ( github.com/Microsoft/hcsshim v0.11.7 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect - github.com/cenkalti/backoff/v4 v4.3.0 // indirect + github.com/cenkalti/backoff/v5 v5.0.3 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/containerd/cgroups v1.1.0 // indirect github.com/containerd/containerd/api v1.8.0 // indirect @@ -46,26 +48,29 @@ require ( github.com/containerd/platforms v0.2.1 // indirect github.com/containerd/ttrpc v1.2.7 // indirect github.com/containerd/typeurl/v2 v2.1.1 // indirect - github.com/davecgh/go-spew v1.1.1 // indirect + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/distribution/reference v0.6.0 // indirect github.com/docker/go-events v0.0.0-20190806004212-e31b211e4f1c // indirect - github.com/emicklei/go-restful/v3 v3.12.2 // indirect + github.com/emicklei/go-restful/v3 v3.13.0 // indirect github.com/evanphx/json-patch/v5 v5.9.11 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect + github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/fxamacker/cbor/v2 v2.9.0 // indirect github.com/go-logr/stdr v1.2.2 // indirect - github.com/go-openapi/jsonpointer v0.21.0 // indirect - github.com/go-openapi/jsonreference v0.20.2 // indirect - github.com/go-openapi/swag v0.23.0 // indirect + github.com/go-openapi/jsonpointer v0.21.2 // indirect + github.com/go-openapi/jsonreference v0.21.0 // indirect + github.com/go-openapi/swag v0.23.1 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect + github.com/google/btree v1.1.3 // indirect github.com/google/gnostic-models v0.7.0 // indirect github.com/google/go-cmp v0.7.0 // indirect - github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 
// indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 // indirect + github.com/imdario/mergo v0.3.16 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/klauspost/compress v1.18.0 // indirect - github.com/mailru/easyjson v0.7.7 // indirect + github.com/mailru/easyjson v0.9.0 // indirect github.com/moby/locker v1.0.1 // indirect github.com/moby/sys/sequential v0.5.0 // indirect github.com/moby/sys/signal v0.7.0 // indirect @@ -78,42 +83,47 @@ require ( github.com/opencontainers/image-spec v1.1.0 // indirect github.com/opencontainers/selinux v1.13.1 // indirect github.com/pkg/errors v0.9.1 // indirect - github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_golang v1.23.2 // indirect github.com/prometheus/client_model v0.6.2 // indirect - github.com/prometheus/common v0.66.1 // indirect + github.com/prometheus/common v0.67.5 // indirect github.com/sirupsen/logrus v1.9.3 // indirect - github.com/spf13/pflag v1.0.9 // indirect + github.com/spf13/pflag v1.0.10 // indirect github.com/x448/float16 v0.8.4 // indirect go.opencensus.io v0.24.0 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.60.0 // indirect - go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 // indirect go.opentelemetry.io/otel v1.41.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.34.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.34.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 // indirect go.opentelemetry.io/otel/metric v1.41.0 // indirect go.opentelemetry.io/otel/sdk 
v1.40.0 // indirect go.opentelemetry.io/otel/trace v1.41.0 // indirect - go.opentelemetry.io/proto/otlp v1.5.0 // indirect + go.opentelemetry.io/proto/otlp v1.7.1 // indirect go.uber.org/multierr v1.11.0 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect - golang.org/x/net v0.48.0 // indirect + golang.org/x/net v0.49.0 // indirect golang.org/x/oauth2 v0.34.0 // indirect golang.org/x/sync v0.19.0 // indirect - golang.org/x/term v0.38.0 // indirect - golang.org/x/text v0.32.0 // indirect - golang.org/x/time v0.12.0 // indirect + golang.org/x/term v0.40.0 // indirect + golang.org/x/text v0.34.0 // indirect + golang.org/x/time v0.13.0 // indirect + gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect google.golang.org/genproto v0.0.0-20231211222908-989df2bf70f3 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 // indirect gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect + k8s.io/apiextensions-apiserver v0.34.3 // indirect k8s.io/component-base v0.34.3 // indirect k8s.io/klog/v2 v2.130.1 // indirect k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect + sigs.k8s.io/gateway-api-inference-extension v1.2.0 // indirect sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect sigs.k8s.io/randfill v1.0.0 // indirect sigs.k8s.io/structured-merge-diff/v6 v6.3.1 // indirect ) + +replace github.com/ai-dynamo/dynamo/deploy/operator => ../operator diff --git a/deploy/snapshot/go.sum b/deploy/snapshot/go.sum index f87802f8d434..847f410cedfe 100644 --- a/deploy/snapshot/go.sum +++ b/deploy/snapshot/go.sum @@ -4,6 +4,9 @@ github.com/AdaLogics/go-fuzz-headers v0.0.0-20230811130428-ced1acdcaa24/go.mod h github.com/AdamKorcz/go-118-fuzz-build v0.0.0-20230306123547-8075edf89bb0 h1:59MxjQVfjXsBpLy+dbd2/ELV5ofnUkUZBvWSC85sheA= github.com/AdamKorcz/go-118-fuzz-build 
v0.0.0-20230306123547-8075edf89bb0/go.mod h1:OahwfttHWG6eJ0clwcfBAHoDI6X/LV/15hx/wlMZSrU= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3QEww= +github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= +github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= github.com/Microsoft/hcsshim v0.11.7 h1:vl/nj3Bar/CvJSYo7gIQPyRWc9f3c6IeSNavBTSZNZQ= @@ -12,8 +15,8 @@ github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= -github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= -github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= +github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM= +github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= @@ -41,26 +44,30 @@ github.com/containerd/ttrpc v1.2.7 h1:qIrroQvuOL9HQ1X6KHe2ohc7p+HP/0VE6XPU7elJRq github.com/containerd/ttrpc v1.2.7/go.mod h1:YCXHsb32f+Sq5/72xHubdiJRQY9inL4a4ZQrAbN1q9o= github.com/containerd/typeurl/v2 v2.1.1 h1:3Q4Pt7i8nYwy2KmQWIw2+1hTvwTE/6w9FqcttATPO/4= 
github.com/containerd/typeurl/v2 v2.1.1/go.mod h1:IDp2JFvbwZ31H8dQbEIY7sDl2L3o3HZj1hsSQlywkQ0= -github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/cyphar/filepath-securejoin v0.5.1 h1:eYgfMq5yryL4fbWfkLpFFy2ukSELzaJOTaUTuh+oF48= github.com/cyphar/filepath-securejoin v0.5.1/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk= github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= github.com/docker/go-events v0.0.0-20190806004212-e31b211e4f1c h1:+pKlWGMw7gf6bQ+oDZB4KHQFypsfjYlq/C4rfL7D3g8= github.com/docker/go-events v0.0.0-20190806004212-e31b211e4f1c/go.mod h1:Uw6UezgYA44ePAFQYUehOuCzmy5zmg/+nl2ZfMWGkpA= -github.com/emicklei/go-restful/v3 v3.12.2 h1:DhwDP0vY3k8ZzE0RunuJy8GhNpPL6zqLkDf9B/a0/xU= -github.com/emicklei/go-restful/v3 v3.12.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/emicklei/go-restful/v3 v3.13.0 h1:C4Bl2xDndpU6nJ4bc1jXd+uTmYPVUwkD6bFY/oTyCes= +github.com/emicklei/go-restful/v3 v3.13.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= 
github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= +github.com/evanphx/json-patch v5.7.0+incompatible h1:vgGkfT/9f8zE6tvSCe74nfpAVDQ2tG6yudJd8LBksgI= +github.com/evanphx/json-patch v5.7.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= +github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= +github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= @@ -70,14 +77,12 @@ github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg= -github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= -github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= -github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= -github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= -github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k= -github.com/go-openapi/swag v0.22.3/go.mod 
h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= -github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= -github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= +github.com/go-openapi/jsonpointer v0.21.2 h1:AqQaNADVwq/VnkCmQg6ogE+M3FOsKTytwges0JdwVuA= +github.com/go-openapi/jsonpointer v0.21.2/go.mod h1:50I1STOfbY1ycR8jGz8DaMeLCdXiI6aDteEdRNNzpdk= +github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ= +github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4= +github.com/go-openapi/swag v0.23.1 h1:lpsStH0n2ittzTnbaSloVZLuB5+fvSY/+hnagBjSNZU= +github.com/go-openapi/swag v0.23.1/go.mod h1:STZs8TbRvEQQKUA+JZNAm3EWlgaOBGpyFDqQnDHMef0= github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= @@ -99,6 +104,8 @@ github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= +github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= github.com/google/gnostic-models v0.7.0 h1:qwTtogB15McXDaNqTZdzPJRHvaVJlAl+HVQnLmJEJxo= github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= @@ -111,13 +118,17 @@ github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.7.0 
h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db h1:097atOisP2aRj7vFgYQBbFN4U4JNXUNYpxael3UzMyo= -github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= +github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= +github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/pprof v0.0.0-20250923004556-9e5a51aed1e8 h1:ZI8gCoCjGzPsum4L21jHdQs8shFBIQih1TM9Rd/c+EQ= +github.com/google/pprof v0.0.0-20250923004556-9e5a51aed1e8/go.mod h1:I6V7YzU0XDpsHqbsyrghnFZLO1gwK6NPTNvmetQIk9U= github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 h1:5ZPtiqj0JL5oKWmcsq4VMaAW5ukBEgSGXEN89zeH1Jo= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3/go.mod h1:ndYquD05frm2vACXE1nsccT4oJzjhw2arTS2cpUD1PI= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 h1:8Tjv8EJ+pM1xP8mK6egEbD1OgnVTyacbefKhmbLhIhU= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2/go.mod h1:pkJQ2tZHJ0aFOVEEot6oZmaVEZcRme73eIFmhiVuRWs= +github.com/imdario/mergo v0.3.16 h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4= +github.com/imdario/mergo v0.3.16/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+hD27ysY= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= @@ -126,17 +137,14 @@ github.com/kisielk/errcheck v1.5.0/go.mod 
h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= -github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= -github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= -github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= -github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= -github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/mailru/easyjson v0.9.0 h1:PrnmzHw7262yW8sTBwxi1PdJA3Iw/EKBa8psRf7d9a4= +github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUto+XFtTU= github.com/moby/locker v1.0.1 h1:fOXqR41zeveg4fFODix+1Ch4mj/gT0NE1XJbp/epuBg= github.com/moby/locker v1.0.1/go.mod h1:S7SDdo5zpBK84bzzVlKr2V0hz+7x9hWbYC/kq7oQppc= github.com/moby/sys/mountinfo v0.7.1 h1:/tTvQaSJRr2FshkhXiIpux6fQ2Zvc4j7tAhMTStAG2g= @@ -157,10 +165,10 @@ github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFd github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= 
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/onsi/ginkgo/v2 v2.22.0 h1:Yed107/8DjTr0lKCNt7Dn8yQ6ybuDRQoMGrNFKzMfHg= -github.com/onsi/ginkgo/v2 v2.22.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C+NkhLQo= -github.com/onsi/gomega v1.36.1 h1:bJDPBO7ibjxcbHMgSCoo4Yj18UWbKDlLwX1x9sybDcw= -github.com/onsi/gomega v1.36.1/go.mod h1:PvZbdDc8J6XJEpDK4HCuRBm8a6Fzp9/DmhC9C7yFlog= +github.com/onsi/ginkgo/v2 v2.27.3 h1:ICsZJ8JoYafeXFFlFAG75a7CxMsJHwgKwtO+82SE9L8= +github.com/onsi/ginkgo/v2 v2.27.3/go.mod h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo= +github.com/onsi/gomega v1.38.3 h1:eTX+W6dobAYfFeGC2PV6RwXRu/MyT+cQguijutvkpSM= +github.com/onsi/gomega v1.38.3/go.mod h1:ZCU1pkQcXDO5Sl9/VVEGlDyp+zm0m1cmeG5TOzLgdh4= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug= @@ -171,23 +179,24 @@ github.com/opencontainers/selinux v1.13.1 h1:A8nNeceYngH9Ow++M+VVEwJVpdFmrlxsN22 github.com/opencontainers/selinux v1.13.1/go.mod h1:S10WXZ/osk2kWOYKy1x2f/eXF5ZHJoUs8UU/2caNRbg= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= github.com/prometheus/client_golang 
v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= -github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= -github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= -github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= -github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= +github.com/prometheus/common v0.67.5 h1:pIgK94WWlQt1WLwAC5j2ynLaBRDiinoAb86HZHTUGI4= +github.com/prometheus/common v0.67.5/go.mod h1:SjE/0MzDEEAyrdr5Gqc6G+sXI67maCxzaT3A2+HqjUw= +github.com/prometheus/procfs v0.17.0 h1:FuLQ+05u4ZI+SS/w9+BWEM2TXiHKsUQ9TADiRH7DuK0= +github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUOVhe0wYB2zw= github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= -github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY= -github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk= +github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod 
h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= @@ -210,14 +219,14 @@ go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.60.0 h1:x7wzEgXfnzJcHDwStJT+mxOz4etr2EcexjqhBvmoakw= go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.60.0/go.mod h1:rg+RlpR5dKwaS95IyyZqj5Wd4E13lk/msnTS0Xl9lJM= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 h1:F7Jx+6hwnZ41NSFTO5q4LYDtJRXBf2PD0rNBkeB/lus= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0/go.mod h1:UHB22Z8QsdRDrnAtX4PntOl36ajSxcdUMt1sF7Y6E7Q= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 h1:RbKq8BG0FI8OiXhBfcRtqqHcZcka+gU3cskNuf05R18= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0/go.mod h1:h06DGIukJOevXaj/xrNjhi/2098RZzcLTbc0jDAUbsg= go.opentelemetry.io/otel v1.41.0 h1:YlEwVsGAlCvczDILpUXpIpPSL/VPugt7zHThEMLce1c= go.opentelemetry.io/otel v1.41.0/go.mod h1:Yt4UwgEKeT05QbLwbyHXEwhnjxNO6D8L5PQP51/46dE= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.34.0 h1:OeNbIYk/2C15ckl7glBlOBp5+WlYsOElzTNmiPW/x60= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.34.0/go.mod h1:7Bept48yIeqxP2OZ9/AqIpYS94h2or0aB4FypJTc8ZM= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.34.0 h1:tgJ0uaNS4c98WRNUEx5U3aDlrDOI5Rs+1Vifcw4DJ8U= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.34.0/go.mod h1:U7HYyW0zt/a9x5J1Kjs+r1f/d4ZHnYFclhYY2+YbeoE= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 h1:GqRJVj7UmLjCVyVJ3ZFLdPRmhDUp2zFmQe3RHIOsw24= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0/go.mod h1:ri3aaHSmCTVYu2AWv44YMauwAQc0aqI9gHKIcSbI1pU= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 h1:lwI4Dc5leUqENgGuQImwLo4WnuXFPetmPpkLi2IrX54= 
+go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0/go.mod h1:Kz/oCE7z5wuyhPxsXDuaPteSWqjSBD5YaSdbxZYGbGk= go.opentelemetry.io/otel/metric v1.41.0 h1:rFnDcs4gRzBcsO9tS8LCpgR0dxg4aaxWlJxCno7JlTQ= go.opentelemetry.io/otel/metric v1.41.0/go.mod h1:xPvCwd9pU0VN8tPZYzDZV/BMj9CM9vs00GuBjeKhJps= go.opentelemetry.io/otel/sdk v1.40.0 h1:KHW/jUzgo6wsPh9At46+h4upjtccTmuZCFAc9OJ71f8= @@ -226,8 +235,8 @@ go.opentelemetry.io/otel/sdk/metric v1.40.0 h1:mtmdVqgQkeRxHgRv4qhyJduP3fYJRMX4A go.opentelemetry.io/otel/sdk/metric v1.40.0/go.mod h1:4Z2bGMf0KSK3uRjlczMOeMhKU2rhUqdWNoKcYrtcBPg= go.opentelemetry.io/otel/trace v1.41.0 h1:Vbk2co6bhj8L59ZJ6/xFTskY+tGAbOnCtQGVVa9TIN0= go.opentelemetry.io/otel/trace v1.41.0/go.mod h1:U1NU4ULCoxeDKc09yCWdWe+3QoyweJcISEVa1RBzOis= -go.opentelemetry.io/proto/otlp v1.5.0 h1:xJvq7gMzB31/d406fB8U5CBdyQGw4P399D1aQWU/3i4= -go.opentelemetry.io/proto/otlp v1.5.0/go.mod h1:keN8WnHxOy8PG0rQZjJJ5A2ebUoafqWp0eVQ4yIXvJ4= +go.opentelemetry.io/proto/otlp v1.7.1 h1:gTOMpGDb0WTBOP8JaO72iL3auEZhVmAQg4ipjOVAtj4= +go.opentelemetry.io/proto/otlp v1.7.1/go.mod h1:b2rVh6rfI/s2pHWNlB7ILJcRALpcNDzKhACevjI+ZnE= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= @@ -247,6 +256,8 @@ golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvx golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.32.0 h1:9F4d3PHLljb6x//jOyokMv3eX+YDeepZSEo3mFJy93c= +golang.org/x/mod v0.32.0/go.mod h1:SgipZ/3h2Ci89DlEtEXWUk/HteuRin+HHhN+WbNhguU= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 
golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -256,8 +267,8 @@ golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLL golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.48.0 h1:zyQRTTrjc33Lhh0fBgT/H3oZq9WuvRR5gPC70xpDiQU= -golang.org/x/net v0.48.0/go.mod h1:+ndRgGjkh8FGtu1w1FGbEC31if4VrNVMuKTgcAAnQRY= +golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o= +golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.34.0 h1:hqK/t4AKgbqWkdkcAeI8XLmbK+4m4G5YeQRrmiotGlw= golang.org/x/oauth2 v0.34.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= @@ -275,16 +286,16 @@ golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20211025201205-69cdffdb9359/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ= -golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= -golang.org/x/term v0.38.0 h1:PQ5pkm/rLO6HnxFR7N2lJHOZX6Kez5Y1gDSJla6jo7Q= -golang.org/x/term v0.38.0/go.mod h1:bSEAKrOT1W+VSu9TSCMtoGEOUcKxOKgl3LE5QEF/xVg= +golang.org/x/sys v0.41.0 
h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k= +golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/term v0.40.0 h1:36e4zGLqU4yhjlmxEaagx2KuYbJq3EwY8K943ZsHcvg= +golang.org/x/term v0.40.0/go.mod h1:w2P8uVp06p2iyKKuvXIm7N/y0UCRt3UfJTfZ7oOpglM= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.32.0 h1:ZD01bjUt1FQ9WJ0ClOL5vxgxOI/sVCNgX1YtKwcY0mU= -golang.org/x/text v0.32.0/go.mod h1:o/rUWzghvpD5TXrTIBuJU77MTaN0ljMWE47kxGJQ7jY= -golang.org/x/time v0.12.0 h1:ScB/8o8olJvc+CQPWrK3fPZNfh7qgwCrY0zJmoEQLSE= -golang.org/x/time v0.12.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= +golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk= +golang.org/x/text v0.34.0/go.mod h1:homfLqTYRFyVYemLBFl5GgL/DWEiH5wcsQ5gSh1yziA= +golang.org/x/time v0.13.0 h1:eUlYslOIt32DgYD6utsuUeHs4d7AsEYLuIAdg7FlYgI= +golang.org/x/time v0.13.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= @@ -293,12 +304,14 @@ golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBn golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.39.0 h1:ik4ho21kwuQln40uelmciQPp9SipgNDdrafrYA4TmQQ= -golang.org/x/tools v0.39.0/go.mod h1:JnefbkDPyD8UU2kI5fuf8ZX4/yUeh9W877ZeBONxUqQ= 
+golang.org/x/tools v0.41.0 h1:a9b8iMweWG+S0OBnlU36rzLp20z1Rp10w+IY2czHTQc= +golang.org/x/tools v0.41.0/go.mod h1:XSY6eDqxVNiYgezAVqqCeihT4j1U2CCsqvH3WhQpnlg= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw= +gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= @@ -346,8 +359,8 @@ honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWh honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= k8s.io/api v0.34.3 h1:D12sTP257/jSH2vHV2EDYrb16bS7ULlHpdNdNhEw2S4= k8s.io/api v0.34.3/go.mod h1:PyVQBF886Q5RSQZOim7DybQjAbVs8g7gwJNhGtY5MBk= -k8s.io/apiextensions-apiserver v0.34.1 h1:NNPBva8FNAPt1iSVwIE0FsdrVriRXMsaWFMqJbII2CI= -k8s.io/apiextensions-apiserver v0.34.1/go.mod h1:hP9Rld3zF5Ay2Of3BeEpLAToP+l4s5UlxiHfqRaRcMc= +k8s.io/apiextensions-apiserver v0.34.3 h1:p10fGlkDY09eWKOTeUSioxwLukJnm+KuDZdrW71y40g= +k8s.io/apiextensions-apiserver v0.34.3/go.mod h1:aujxvqGFRdb/cmXYfcRTeppN7S2XV/t7WMEc64zB5A0= k8s.io/apimachinery v0.34.3 h1:/TB+SFEiQvN9HPldtlWOTp0hWbJ+fjU+wkxysf/aQnE= k8s.io/apimachinery v0.34.3/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw= k8s.io/client-go v0.34.3 h1:wtYtpzy/OPNYf7WyNBTj3iUA0XaBHVqhv4Iv3tbrF5A= @@ -364,10 +377,12 @@ 
k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 h1:Y3gxNAuB0OBLImH611+UDZ k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912/go.mod h1:kdmbQkyfwUagLfXIad1y2TdrjPFWp2Q89B3qkRwf/pQ= k8s.io/kubelet v0.34.3 h1:8QRev2FmasZ05yCC774qn6ULche72PYM7AQv0CVt9CM= k8s.io/kubelet v0.34.3/go.mod h1:pMgblr+nVQ02UkyaTcgqzS3AIYVQkjlMFg1Pd5rGC1Q= -k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzkbzn+gDM4X9T4Ck= -k8s.io/utils v0.0.0-20251002143259-bc988d571ff4/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +k8s.io/utils v0.0.0-20260108192941-914a6e750570 h1:JT4W8lsdrGENg9W+YwwdLJxklIuKWdRm+BC+xt33FOY= +k8s.io/utils v0.0.0-20260108192941-914a6e750570/go.mod h1:xDxuJ0whA3d0I4mf/C4ppKHxXynQ+fxnkmQH0vTHnuk= sigs.k8s.io/controller-runtime v0.22.4 h1:GEjV7KV3TY8e+tJ2LCTxUTanW4z/FmNB7l327UfMq9A= sigs.k8s.io/controller-runtime v0.22.4/go.mod h1:+QX1XUpTXN4mLoblf4tqr5CQcyHPAki2HLXqQMY6vh8= +sigs.k8s.io/gateway-api-inference-extension v1.2.0 h1:7H+ijrUImnW2ubcTakNgV723xDIdQx1Umv4vDVB+tTk= +sigs.k8s.io/gateway-api-inference-extension v1.2.0/go.mod h1:/HWeqxuOMjFM56YwJ2Spt3qceK7Spz4hk6ZfXYgE9a8= sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= diff --git a/deploy/snapshot/internal/controller/controller.go b/deploy/snapshot/internal/controller/controller.go index 9f4c5d27d3f2..80eee39a39fc 100644 --- a/deploy/snapshot/internal/controller/controller.go +++ b/deploy/snapshot/internal/controller/controller.go @@ -16,7 +16,6 @@ import ( "github.com/go-logr/logr" "github.com/google/uuid" - batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -25,7 +24,6 @@ import ( "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" 
"k8s.io/client-go/tools/cache" - "k8s.io/client-go/util/retry" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/executor" snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime" @@ -104,43 +102,8 @@ func (w *NodeController) Run(ctx context.Context) error { var syncFuncs []cache.InformerSynced - // Checkpoint informer - checkpointSelector := labels.SelectorFromSet(labels.Set{ - snapshotprotocol.CheckpointSourceLabel: "true", - }).String() - - ckptFactoryOpts := append([]informers.SharedInformerOption{ - informers.WithTweakListOptions(func(opts *metav1.ListOptions) { - opts.LabelSelector = checkpointSelector - }), - }, nsOptions...) - - ckptFactory := informers.NewSharedInformerFactoryWithOptions( - w.clientset, 30*time.Second, ckptFactoryOpts..., - ) - - ckptInformer := ckptFactory.Core().V1().Pods().Informer() - if _, err := ckptInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ - AddFunc: func(obj interface{}) { - pod, ok := podFromInformerObj(obj) - if !ok { - return - } - w.reconcileCheckpointPod(ctx, pod) - }, - UpdateFunc: func(_, newObj interface{}) { - pod, ok := podFromInformerObj(newObj) - if !ok { - return - } - w.reconcileCheckpointPod(ctx, pod) - }, - }); err != nil { - return fmt.Errorf("failed to add checkpoint informer handler: %w", err) - } - go ckptFactory.Start(w.stopCh) - syncFuncs = append(syncFuncs, ckptInformer.HasSynced) - + // Capture is driven by the SnapshotContent controller-runtime reconciler; this + // client-go controller only handles the restore path. // Restore pods carry a checkpoint ID but are not checkpoint sources. restoreSel, err := labels.Parse(snapshotprotocol.CheckpointIDLabel + ",!" 
+ snapshotprotocol.CheckpointSourceLabel) if err != nil { @@ -190,111 +153,6 @@ func (w *NodeController) Run(ctx context.Context) error { return nil } -func (w *NodeController) reconcileCheckpointPod(ctx context.Context, pod *corev1.Pod) { - if pod.Spec.NodeName != w.config.NodeName { - return - } - - podKey := fmt.Sprintf("%s/%s", pod.Namespace, pod.Name) - - checkpointID, ok := pod.Labels[snapshotprotocol.CheckpointIDLabel] - if !ok || checkpointID == "" { - w.log.Info("Pod has checkpoint label but no checkpoint-id label", "pod", podKey) - return - } - - job, err := getCheckpointJob(ctx, w.clientset, pod) - if err != nil { - w.log.Error(err, "Failed to resolve checkpoint job", "pod", podKey) - return - } - - jobStatus := job.Annotations[snapshotprotocol.CheckpointStatusAnnotation] - if jobStatus == snapshotprotocol.CheckpointStatusFailed { - return - } - - for i := range pod.Status.ContainerStatuses { - failed := &pod.Status.ContainerStatuses[i] - term := failed.State.Terminated - if term == nil || term.ExitCode == 0 { - continue - } - message := fmt.Sprintf("Checkpoint container %q terminated with exit code %d", failed.Name, term.ExitCode) - if term.Reason != "" { - message = fmt.Sprintf("%s: %s", message, term.Reason) - } - opLog := w.log.WithValues("pod", podKey, "checkpoint_id", checkpointID, "container", failed.Name) - opLog.Info("Checkpoint pod container failed", "exit_code", term.ExitCode, "reason", term.Reason) - emitPodEvent(ctx, w.clientset, opLog, pod, "snapshot", corev1.EventTypeWarning, "CheckpointFailed", message) - if err := annotateJob(ctx, w.clientset, opLog, job, map[string]string{ - snapshotprotocol.CheckpointStatusAnnotation: snapshotprotocol.CheckpointStatusFailed, - }); err != nil { - opLog.Error(err, "Failed to mark checkpoint job failed") - } - reason := fmt.Sprintf("checkpoint container %s failed", failed.Name) - for _, status := range pod.Status.ContainerStatuses { - if status.State.Running == nil || status.ContainerID == "" { - continue 
- } - containerID := snapshotruntime.StripCRIScheme(status.ContainerID) - resolveCtx, cancel := context.WithTimeout(ctx, containerResolveAttemptTimeout) - pid, _, err := w.runtime.ResolveContainer(resolveCtx, containerID) - cancel() - if err != nil { - opLog.Error(err, "Failed to resolve running checkpoint container", "container", status.Name) - continue - } - if err := snapshotruntime.SendSignalToPID(opLog, pid, syscall.SIGKILL, reason); err != nil { - opLog.Error(err, "Failed to signal running checkpoint container", "container", status.Name) - } - } - return - } - - if jobStatus == snapshotprotocol.CheckpointStatusCompleted { - return - } - - // Checkpoint contract: exactly one target container per job. - targets, err := snapshotprotocol.TargetContainersFromAnnotations(pod.Annotations, 1, 1) - if err != nil { - w.log.Error(err, "Checkpoint pod missing target-containers annotation", "pod", podKey) - return - } - containerName := targets[0] - if !isContainerReady(pod, containerName) { - return - } - - if !w.tryAcquire(podKey) { - return - } - - acquiredLease, err := acquireCheckpointLease(ctx, w.clientset, w.log, job, w.holderID) - if err != nil { - w.release(podKey) - w.log.Error(err, "Failed to acquire checkpoint lease", "pod", podKey, "checkpoint_id", checkpointID) - return - } - if !acquiredLease { - w.release(podKey) - return - } - - startedAt := time.Now() - w.log.Info("Checkpoint target detected, triggering checkpoint", "pod", podKey, "checkpoint_id", checkpointID) - emitPodEvent(ctx, w.clientset, w.log, pod, "snapshot", corev1.EventTypeNormal, "CheckpointRequested", fmt.Sprintf("Checkpoint requested: %s", checkpointID)) - - go func() { - if err := w.runCheckpoint(ctx, pod, job, checkpointID, containerName, podKey, startedAt); err != nil { - opLog := w.log.WithValues("pod", podKey, "checkpoint_id", checkpointID) - opLog.Error(err, "Checkpoint controller worker failed") - emitPodEvent(ctx, w.clientset, opLog, pod, "snapshot", corev1.EventTypeWarning, 
"CheckpointWorkerFailed", err.Error()) - } - }() -} - func (w *NodeController) reconcileRestorePod(ctx context.Context, pod *corev1.Pod) { if pod.Spec.NodeName != w.config.NodeName { return @@ -523,200 +381,6 @@ func (w *NodeController) startRestoreForContainer( }() } -// runCheckpoint runs the full checkpoint workflow for a pod: -// 1. Hold and renew the checkpoint lease -// 2. Resolve the container ID and host PID -// 3. Call executor.Checkpoint (inspect → configure → CUDA lock/checkpoint → CRIU dump → rootfs diff) -// 4. Write a snapshot-complete sentinel into the pod's snapshot-control -// volume on success (observed by the workload via polling), or SIGKILL -// on failure (unrecoverable CUDA-locked process) -// 5. Mark job as completed or failed -func (w *NodeController) runCheckpoint(ctx context.Context, pod *corev1.Pod, job *batchv1.Job, checkpointID, containerName, podKey string, startedAt time.Time) error { - releasePodOnExit := true - defer func() { - if releasePodOnExit { - w.release(podKey) - } - }() - log := w.log.WithValues("pod", podKey, "checkpoint_id", checkpointID) - leaseCtx, stopLease := context.WithCancelCause(ctx) - defer stopLease(nil) - - releaseLeaseOnExit := true - defer func() { - if !releaseLeaseOnExit { - return - } - releaseCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) - defer cancel() - if err := releaseCheckpointLease(releaseCtx, w.clientset, log, job, w.holderID); err != nil { - log.Error(err, "Failed to release checkpoint lease") - } - }() - - go w.renewCheckpointLease(leaseCtx, log, job, stopLease) - - setCheckpointStatus := func(value string) (bool, error) { - if value != snapshotprotocol.CheckpointStatusCompleted { - if err := annotateJob(ctx, w.clientset, log, job, map[string]string{ - snapshotprotocol.CheckpointStatusAnnotation: value, - }); err != nil { - releasePodOnExit = false - releaseLeaseOnExit = false - return false, fmt.Errorf("failed to persist terminal checkpoint status %q: %w", value, err) 
- } - return true, nil - } - - updated := false - jobClient := w.clientset.BatchV1().Jobs(job.Namespace) - err := retry.RetryOnConflict(retry.DefaultRetry, func() error { - current, err := jobClient.Get(ctx, job.Name, metav1.GetOptions{}) - if err != nil { - return fmt.Errorf("failed to get current checkpoint job %s/%s: %w", job.Namespace, job.Name, err) - } - if current.Annotations[snapshotprotocol.CheckpointStatusAnnotation] == snapshotprotocol.CheckpointStatusFailed { - updated = false - return nil - } - if current.Annotations == nil { - current.Annotations = map[string]string{} - } - current.Annotations[snapshotprotocol.CheckpointStatusAnnotation] = value - if _, err := jobClient.Update(ctx, current, metav1.UpdateOptions{}); err != nil { - return err - } - updated = true - return nil - }) - if err != nil { - releasePodOnExit = false - releaseLeaseOnExit = false - return false, fmt.Errorf("failed to persist terminal checkpoint status %q: %w", value, err) - } - if !updated { - log.Info("Skipping checkpoint completion because checkpoint job is already failed", - "job", fmt.Sprintf("%s/%s", job.Namespace, job.Name), - ) - } - return updated, nil - } - - // Resolve the target container ID from pod status. 
- var containerID string - for _, cs := range pod.Status.ContainerStatuses { - if cs.Name == containerName { - containerID = snapshotruntime.StripCRIScheme(cs.ContainerID) - break - } - } - if containerID == "" { - emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "CheckpointFailed", fmt.Sprintf("Could not resolve container %q ID", containerName)) - if _, statusErr := setCheckpointStatus(snapshotprotocol.CheckpointStatusFailed); statusErr != nil { - return statusErr - } - return nil - } - - // Resolve the container's host PID (needed for signaling after checkpoint) - containerPID, _, err := w.runtime.ResolveContainer(ctx, containerID) - if err != nil { - log.Error(err, "Failed to resolve container") - emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "CheckpointFailed", fmt.Sprintf("Container resolve failed: %v", err)) - if _, statusErr := setCheckpointStatus(snapshotprotocol.CheckpointStatusFailed); statusErr != nil { - return statusErr - } - return nil - } - - checkpointLocation, err := w.checkpointLocationsFromPod(pod, checkpointID, containerPID) - if err != nil { - log.Error(err, "Checkpoint pod is missing storage metadata") - emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "CheckpointFailed", err.Error()) - if _, statusErr := setCheckpointStatus(snapshotprotocol.CheckpointStatusFailed); statusErr != nil { - return statusErr - } - return nil - } - if err := w.validatePodMountContainerPID(ctx, containerID, containerPID); err != nil { - log.Error(err, "Checkpoint container changed before storage access") - emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "CheckpointFailed", err.Error()) - if _, statusErr := setCheckpointStatus(snapshotprotocol.CheckpointStatusFailed); statusErr != nil { - return statusErr - } - return nil - } - - // Step 1: Run the checkpoint orchestrator - req := executor.CheckpointRequest{ - ContainerID: containerID, - 
ContainerName: containerName, - CheckpointID: checkpointID, - CheckpointLocation: checkpointLocation.HostPath, - StartedAt: startedAt, - NodeName: w.config.NodeName, - PodName: pod.Name, - PodNamespace: pod.Namespace, - Clientset: w.clientset, - } - if err := executor.Checkpoint(leaseCtx, w.runtime, log, req, w.config); err != nil { - if cause := context.Cause(leaseCtx); cause != nil && cause != context.Canceled { - err = fmt.Errorf("checkpoint lease lost: %w", cause) - } - log.Error(err, "Checkpoint failed") - emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "CheckpointFailed", err.Error()) - // SIGKILL on failure: process is unrecoverable (CUDA locked), terminate immediately - if signalErr := snapshotruntime.SendSignalToPID(log, containerPID, syscall.SIGKILL, "checkpoint failed"); signalErr != nil { - log.Error(signalErr, "Failed to signal checkpoint failure to runtime process") - } - if _, statusErr := setCheckpointStatus(snapshotprotocol.CheckpointStatusFailed); statusErr != nil { - return statusErr - } - return nil - } - - info, err := os.Stat(checkpointLocation.HostPath) - if err != nil || !info.IsDir() { - if err == nil { - err = fmt.Errorf("published checkpoint path %s is not a directory", checkpointLocation.HostPath) - } else { - err = fmt.Errorf("published checkpoint path %s is missing: %w", checkpointLocation.HostPath, err) - } - log.Error(err, "Checkpoint failed verification") - emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "CheckpointFailed", err.Error()) - if signalErr := snapshotruntime.SendSignalToPID(log, containerPID, syscall.SIGKILL, "checkpoint verification failed"); signalErr != nil { - log.Error(signalErr, "Failed to signal checkpoint verification failure to runtime process") - } - if _, statusErr := setCheckpointStatus(snapshotprotocol.CheckpointStatusFailed); statusErr != nil { - return statusErr - } - return nil - } - // Step 2: Sentinel on success. 
Workload observes via polling on the - // snapshot-control volume; containerPID is a PID inside the container's - // mount namespace, which is all the /host/proc//root write path - // requires. The Succeeded event is emitted only after the sentinel has - // been written and the terminal status has been persisted so failures don't - // produce conflicting Succeeded+Failed events for the same operation. - if err := snapshotruntime.WriteControlSentinel(containerPID, snapshotprotocol.SnapshotCompleteFile); err != nil { - log.Error(err, "Failed to write snapshot-complete sentinel") - emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "CheckpointFailed", err.Error()) - if _, statusErr := setCheckpointStatus(snapshotprotocol.CheckpointStatusFailed); statusErr != nil { - return statusErr - } - return nil - } - - updated, err := setCheckpointStatus(snapshotprotocol.CheckpointStatusCompleted) - if err != nil { - return err - } - if updated { - emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeNormal, "CheckpointSucceeded", fmt.Sprintf("Checkpoint completed: %s", checkpointID)) - } - return nil -} - // runRestore runs the full restore workflow for one target container: // 1. Annotate the pod with restore in_progress // 2. 
Call executor.Restore (inspect placeholder → nsrestore inside namespace) diff --git a/deploy/snapshot/internal/controller/controller_test.go b/deploy/snapshot/internal/controller/controller_test.go index 19a7632e6c49..d955b18e77c9 100644 --- a/deploy/snapshot/internal/controller/controller_test.go +++ b/deploy/snapshot/internal/controller/controller_test.go @@ -11,8 +11,6 @@ import ( "github.com/go-logr/logr/testr" specs "github.com/opencontainers/runtime-spec/specs-go" - batchv1 "k8s.io/api/batch/v1" - coordinationv1 "k8s.io/api/coordination/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -52,8 +50,8 @@ func (r *fakeRuntime) ResolveContainerByPod(ctx context.Context, pod, ns, ctr st func (r *fakeRuntime) Close() error { return nil } // makeTestController creates a NodeController with a fake k8s client and nil executors. -// The fake clientset is empty so any goroutine launched by runCheckpoint/runRestore -// will fail on the first annotatePod call and exit cleanly. +// The fake clientset is empty so any goroutine launched by the restore path will fail on +// the first annotatePod call and exit cleanly. 
func makeTestController(t *testing.T, objs ...runtime.Object) *NodeController { t.Helper() return &NodeController{ @@ -87,23 +85,6 @@ func sawEventReason(clientset *fake.Clientset, reason string) bool { return false } -func makeLease(namespace, name, holder string, renewTime time.Time) *coordinationv1.Lease { - leaseDurationSeconds := int32(checkpointLeaseDuration.Seconds()) - renewMicroTime := metav1.NewMicroTime(renewTime) - return &coordinationv1.Lease{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: namespace, - }, - Spec: coordinationv1.LeaseSpec{ - HolderIdentity: &holder, - LeaseDurationSeconds: &leaseDurationSeconds, - AcquireTime: &renewMicroTime, - RenewTime: &renewMicroTime, - }, - } -} - func makePod(name, namespace, nodeName string, phase corev1.PodPhase, ready bool, labels, annotations map[string]string) *corev1.Pod { var conditions []corev1.PodCondition if ready { @@ -315,230 +296,6 @@ func TestRestoreCheckpointReady(t *testing.T) { }) } -func TestReconcileCheckpointPod(t *testing.T) { - tests := []struct { - name string - nodeName string - phase corev1.PodPhase - ready bool - hash string - annotation string - lease *coordinationv1.Lease - preSeed bool // pre-populate inFlight to test deduplication - want bool // true = pod passes filtering and triggers checkpoint - }{ - { - name: "happy path", - nodeName: testNodeName, - phase: corev1.PodRunning, - ready: true, - hash: "abc123", - want: true, - }, - { - name: "wrong node", - nodeName: "other-node", - phase: corev1.PodRunning, - ready: true, - hash: "abc123", - want: false, - }, - { - name: "not running", - nodeName: testNodeName, - phase: corev1.PodPending, - ready: false, - hash: "abc123", - want: false, - }, - { - name: "running but not ready", - nodeName: testNodeName, - phase: corev1.PodRunning, - ready: false, - hash: "abc123", - want: false, - }, - { - name: "missing hash label", - nodeName: testNodeName, - phase: corev1.PodRunning, - ready: true, - hash: "", - want: false, - }, - 
{ - name: "already completed", - nodeName: testNodeName, - phase: corev1.PodRunning, - ready: true, - hash: "abc123", - annotation: "completed", - want: false, - }, - { - name: "already failed", - nodeName: testNodeName, - phase: corev1.PodRunning, - ready: true, - hash: "abc123", - annotation: "failed", - want: false, - }, - { - name: "active lease held elsewhere", - nodeName: testNodeName, - phase: corev1.PodRunning, - ready: true, - hash: "abc123", - lease: makeLease("default", "checkpoint-job", "other-holder", time.Now()), - want: false, - }, - { - name: "expired lease can be reclaimed", - nodeName: testNodeName, - phase: corev1.PodRunning, - ready: true, - hash: "abc123", - lease: makeLease("default", "checkpoint-job", "other-holder", time.Now().Add(-checkpointLeaseDuration-time.Second)), - want: true, - }, - { - name: "duplicate in-flight", - nodeName: testNodeName, - phase: corev1.PodRunning, - ready: true, - hash: "abc123", - preSeed: true, - want: false, - }, - } - - for _, tc := range tests { - t.Run(tc.name, func(t *testing.T) { - labels := map[string]string{ - snapshotprotocol.CheckpointSourceLabel: "true", - "batch.kubernetes.io/job-name": "checkpoint-job", - } - if tc.hash != "" { - labels[snapshotprotocol.CheckpointIDLabel] = tc.hash - } - - job := &batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{ - Name: "checkpoint-job", - Namespace: "default", - }, - } - if tc.annotation != "" { - job.Annotations = map[string]string{ - snapshotprotocol.CheckpointStatusAnnotation: tc.annotation, - } - } - - pod := makePod("test-pod", "default", tc.nodeName, tc.phase, tc.ready, labels, nil) - objs := []runtime.Object{job} - if tc.lease != nil { - objs = append(objs, tc.lease) - } - - w := makeTestController(t, objs...) 
- ctx := context.Background() - - if tc.preSeed { - w.inFlight["default/test-pod"] = struct{}{} - } - - w.reconcileCheckpointPod(ctx, pod) - - triggered := sawEventReason(w.clientset.(*fake.Clientset), "CheckpointRequested") - - if triggered != tc.want { - t.Errorf("triggered = %v, want %v (inFlight=%d, preSeed=%v, actions=%#v)", triggered, tc.want, len(w.inFlight), tc.preSeed, w.clientset.(*fake.Clientset).Actions()) - } - - // Let the background goroutine (if any) finish before the test ends - if tc.want { - time.Sleep(50 * time.Millisecond) - } - }) - } -} - -func TestReconcileCheckpointPodFailsWhenAnyRegularContainerFails(t *testing.T) { - for _, jobStatus := range []string{"", snapshotprotocol.CheckpointStatusCompleted} { - t.Run("job status "+jobStatus, func(t *testing.T) { - labels := map[string]string{ - snapshotprotocol.CheckpointSourceLabel: "true", - snapshotprotocol.CheckpointIDLabel: "abc123", - "batch.kubernetes.io/job-name": "checkpoint-job", - } - job := &batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{ - Name: "checkpoint-job", - Namespace: "default", - Annotations: map[string]string{}, - }, - } - if jobStatus != "" { - job.Annotations[snapshotprotocol.CheckpointStatusAnnotation] = jobStatus - } - pod := makePod("test-pod", "default", testNodeName, corev1.PodRunning, false, labels, nil) - pod.Spec.Containers = append(pod.Spec.Containers, corev1.Container{Name: "helper"}) - pod.Status.ContainerStatuses = []corev1.ContainerStatus{ - { - Name: "main", - Ready: true, - State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}}, - ContainerID: "containerd://main-id", - }, - { - Name: "helper", - State: corev1.ContainerState{ - Terminated: &corev1.ContainerStateTerminated{ExitCode: 1, Reason: "Error"}, - }, - ContainerID: "containerd://helper-id", - }, - } - - w := makeTestController(t, job) - rt := &fakeRuntime{} - w.runtime = rt - w.reconcileCheckpointPod(context.Background(), pod) - - updated, err := 
w.clientset.BatchV1().Jobs("default").Get(context.Background(), "checkpoint-job", metav1.GetOptions{}) - if err != nil { - t.Fatalf("failed to get checkpoint job: %v", err) - } - if got := updated.Annotations[snapshotprotocol.CheckpointStatusAnnotation]; got != snapshotprotocol.CheckpointStatusFailed { - t.Fatalf("checkpoint status annotation = %q, want %q", got, snapshotprotocol.CheckpointStatusFailed) - } - - var sawFailureEvent bool - for _, action := range w.clientset.(*fake.Clientset).Actions() { - create, ok := action.(clientgotesting.CreateAction) - if !ok || create.GetResource().Resource != "events" { - continue - } - event, ok := create.GetObject().(*corev1.Event) - if ok && event.Reason == "CheckpointFailed" && strings.Contains(event.Message, `container "helper"`) { - sawFailureEvent = true - break - } - } - if !sawFailureEvent { - t.Fatalf("expected CheckpointFailed event for failed regular container; actions=%#v", w.clientset.(*fake.Clientset).Actions()) - } - if len(w.inFlight) != 0 { - t.Fatalf("failed checkpoint pod should not start snapshot worker, got inFlight=%v", w.inFlight) - } - if len(rt.resolvedContainerIDs) != 1 || rt.resolvedContainerIDs[0] != "main-id" { - t.Fatalf("expected to resolve remaining running container before failing job, got %v", rt.resolvedContainerIDs) - } - }) - } -} - func TestReconcileRestorePod(t *testing.T) { tests := []struct { name string @@ -951,65 +708,3 @@ func TestPollForContainerIDSkipsWhenRestoreAttemptAlreadyHeld(t *testing.T) { } } -func TestRunCheckpointKeepsLeaseAndInFlightOnTerminalStatusPatchFailure(t *testing.T) { - pod := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pod", - Namespace: "default", - Labels: map[string]string{ - "batch.kubernetes.io/job-name": "checkpoint-job", - }, - }, - } - job := &batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{ - Name: "checkpoint-job", - Namespace: "default", - }, - } - lease := makeLease("default", "checkpoint-job", "test-holder", time.Now()) - - clientset 
:= fake.NewClientset(pod.DeepCopy(), job, lease) - patchCalls := 0 - clientset.PrependReactor("patch", "jobs", func(clientgotesting.Action) (bool, runtime.Object, error) { - patchCalls++ - return true, nil, errors.New("terminal patch failed") - }) - - w := &NodeController{ - config: &types.AgentConfig{ - NodeName: testNodeName, - Storage: types.StorageSpec{ - Type: snapshotprotocol.StorageTypePVC, - BasePath: t.TempDir(), - }, - }, - clientset: clientset, - runtime: &fakeRuntime{}, - log: testr.New(t), - holderID: "test-holder", - inFlight: map[string]struct{}{ - "default/test-pod": {}, - }, - stopCh: make(chan struct{}), - } - - err := w.runCheckpoint(context.Background(), pod, job, "abc123", "main", "default/test-pod", time.Now()) - if err == nil { - t.Fatal("expected terminal checkpoint status update to fail") - } - if _, ok := w.inFlight["default/test-pod"]; !ok { - t.Fatal("checkpoint terminal status failure should keep pod in-flight") - } - if patchCalls != 1 { - t.Fatalf("patchCalls = %d, want %d", patchCalls, 1) - } - - remainingLease, err := clientset.CoordinationV1().Leases("default").Get(context.Background(), "checkpoint-job", metav1.GetOptions{}) - if err != nil { - t.Fatalf("expected checkpoint lease to remain after terminal status patch failure: %v", err) - } - if remainingLease.Spec.HolderIdentity == nil || *remainingLease.Spec.HolderIdentity != "test-holder" { - t.Fatalf("unexpected remaining lease holder: %#v", remainingLease.Spec.HolderIdentity) - } -} diff --git a/deploy/snapshot/internal/controller/manager.go b/deploy/snapshot/internal/controller/manager.go new file mode 100644 index 000000000000..ec71bb89d45e --- /dev/null +++ b/deploy/snapshot/internal/controller/manager.go @@ -0,0 +1,74 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +package controller + +import ( + "fmt" + + "github.com/google/uuid" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/fields" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/client-go/kubernetes" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/cache" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/metrics/server" + + nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" + snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime" + "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types" + snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol" +) + +// NewSnapshotContentManager builds the per-node controller-runtime Manager that drives +// checkpoint capture. Its cache is scoped to this node — SnapshotContent by the +// nvidia.com/snapshot-node mirror label, pods by their spec.nodeName field (so the agent +// does not open a second cluster-wide pod watch). Leader election is off, and the +// SnapshotContent reconciler is registered with the production executor-backed driver. 
+func NewSnapshotContentManager(cfg *types.AgentConfig, rt snapshotruntime.Runtime) (ctrl.Manager, error) {
+	// Both cache selectors below are built from cfg.NodeName. With an empty name the label
+	// selector would match work orders labelled with an empty node and the field selector
+	// would match pods that have no node assigned, so reject it before starting anything.
+	if cfg == nil || cfg.NodeName == "" {
+		return nil, fmt.Errorf("create snapshot-content manager: agent config with a node name is required")
+	}
+
+	scheme := runtime.NewScheme()
+	utilruntime.Must(clientgoscheme.AddToScheme(scheme))
+	utilruntime.Must(nvidiacomv1alpha1.AddToScheme(scheme))
+
+	nodeContentSelector := labels.SelectorFromSet(labels.Set{snapshotprotocol.SnapshotNodeLabel: cfg.NodeName})
+	nodePodSelector := fields.OneTermEqualSelector("spec.nodeName", cfg.NodeName)
+
+	// Use the error-returning loader: this constructor already reports failures to its
+	// caller, so it must not terminate the process the way GetConfigOrDie does.
+	restConfig, err := ctrl.GetConfig()
+	if err != nil {
+		return nil, fmt.Errorf("load kubernetes client config: %w", err)
+	}
+
+	mgr, err := ctrl.NewManager(restConfig, ctrl.Options{
+		Scheme:         scheme,
+		LeaderElection: false,
+		Metrics:        server.Options{BindAddress: "0"},
+		Cache: cache.Options{
+			ByObject: map[client.Object]cache.ByObject{
+				&nvidiacomv1alpha1.SnapshotContent{}: {Label: nodeContentSelector},
+				&corev1.Pod{}:                        {Field: nodePodSelector},
+			},
+		},
+	})
+	if err != nil {
+		return nil, fmt.Errorf("create snapshot-content manager: %w", err)
+	}
+
+	// A typed clientset is built from the same rest config; the reconciler and the
+	// executor-backed checkpointer both receive it.
+	clientset, err := kubernetes.NewForConfig(restConfig)
+	if err != nil {
+		return nil, fmt.Errorf("create kubernetes client for lease coordination: %w", err)
+	}
+
+	reconciler := &SnapshotContentReconciler{
+		Client:    mgr.GetClient(),
+		Clientset: clientset,
+		Config:    cfg,
+		NodeName:  cfg.NodeName,
+		// A fresh UUID per process makes the holder identity unique across agent restarts.
+		HolderID:     "snapshot-agent/" + uuid.NewString(),
+		Checkpointer: newExecutorCheckpointer(clientset, rt, cfg, cfg.NodeName),
+	}
+	if err := reconciler.SetupWithManager(mgr); err != nil {
+		return nil, fmt.Errorf("set up SnapshotContent reconciler: %w", err)
+	}
+	return mgr, nil
+}
diff --git a/deploy/snapshot/internal/controller/nodecheckpointer.go b/deploy/snapshot/internal/controller/nodecheckpointer.go
new file mode 100644
index 000000000000..b4aa88c2a180
--- /dev/null
+++ b/deploy/snapshot/internal/controller/nodecheckpointer.go
@@ -0,0 +1,124 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package controller
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"syscall"
+	"time"
+
+	"github.com/go-logr/logr"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/client-go/kubernetes"
+
+	"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/executor"
+	snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime"
+	"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
+	snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol"
+)
+
+// CheckpointParams carries everything the node driver needs to dump one container.
+// It is a plain value bundle: the reconciler fills it in and hands it to a
+// NodeCheckpointer.
+type CheckpointParams struct {
+	// Pod is the live source pod (already provenance-verified by the reconciler).
+	// The production checkpointer dereferences it, so it must not be nil.
+	Pod *corev1.Pod
+	// ContainerName is the single target container to checkpoint.
+	ContainerName string
+	// CheckpointID is the stable artifact identity.
+	CheckpointID string
+	// HostPath is the agent-resolved destination directory for the dump.
+	HostPath string
+	// ContainerPath is the destination as seen inside the workload container's mount
+	// namespace (equal to HostPath under agentMount storage).
+	// NOTE(review): the production checkpointer in this file does not read this field.
+	ContainerPath string
+	// StartedAt is the timestamp used for timing the operation. The reconciler's dump
+	// worker sets it immediately before invoking the checkpointer.
+	StartedAt time.Time
+}
+
+// NodeCheckpointer performs the CRIU dump for a single SnapshotContent work order. The
+// concrete implementation wraps executor.Checkpoint; unit tests substitute a fake.
+type NodeCheckpointer interface {
+	// Checkpoint runs the dump and verifies the produced artifact. It returns an error
+	// on any failure; on success the artifact exists at params.HostPath.
+	Checkpoint(ctx context.Context, params CheckpointParams) error
+}
+
+// executorCheckpointer is the production NodeCheckpointer backed by executor.Checkpoint.
+type executorCheckpointer struct {
+	// clientset is forwarded to the executor in each checkpoint request.
+	clientset kubernetes.Interface
+	// runtime resolves the target container and is passed through to the executor.
+	runtime snapshotruntime.Runtime
+	// config is the agent configuration handed to executor.Checkpoint.
+	config *types.AgentConfig
+	// nodeName is recorded on each checkpoint request.
+	nodeName string
+}
+
+// newExecutorCheckpointer builds the production node checkpointer.
+func newExecutorCheckpointer(clientset kubernetes.Interface, rt snapshotruntime.Runtime, cfg *types.AgentConfig, nodeName string) *executorCheckpointer {
+	return &executorCheckpointer{clientset: clientset, runtime: rt, config: cfg, nodeName: nodeName}
+}
+
+// Checkpoint resolves the target container, runs executor.Checkpoint to the destination,
+// verifies the artifact directory, and writes the snapshot-complete sentinel.
+//
+// Failures before the dump starts (container ID or PID cannot be resolved) return an
+// error without signalling anything. Once the PID is known, every later failure (the
+// dump itself, the directory verification, or the sentinel write) SIGKILLs that PID
+// before the error is returned. params.Pod must be non-nil; it is dereferenced here.
+func (ec *executorCheckpointer) Checkpoint(ctx context.Context, params CheckpointParams) error {
+	log := logr.FromContextOrDiscard(ctx)
+
+	containerID := containerIDForName(params.Pod, params.ContainerName)
+	if containerID == "" {
+		return fmt.Errorf("could not resolve container %q ID", params.ContainerName)
+	}
+
+	// Only the PID is needed here; the second return value is discarded.
+	containerPID, _, err := ec.runtime.ResolveContainer(ctx, containerID)
+	if err != nil {
+		return fmt.Errorf("resolve container %q: %w", params.ContainerName, err)
+	}
+
+	req := executor.CheckpointRequest{
+		ContainerID:        containerID,
+		ContainerName:      params.ContainerName,
+		CheckpointID:       params.CheckpointID,
+		CheckpointLocation: params.HostPath,
+		StartedAt:          params.StartedAt,
+		NodeName:           ec.nodeName,
+		PodName:            params.Pod.Name,
+		PodNamespace:       params.Pod.Namespace,
+		Clientset:          ec.clientset,
+	}
+	if err := executor.Checkpoint(ctx, ec.runtime, log, req, ec.config); err != nil {
+		ec.kill(log, containerPID, "checkpoint failed")
+		return fmt.Errorf("checkpoint: %w", err)
+	}
+
+	// Verify the artifact independently of the executor's own result: the destination
+	// must exist and be a directory.
+	info, statErr := os.Stat(params.HostPath)
+	if statErr != nil || !info.IsDir() {
+		ec.kill(log, containerPID, "checkpoint verification failed")
+		if statErr != nil {
+			return fmt.Errorf("verify checkpoint path %s: %w", params.HostPath, statErr)
+		}
+		return fmt.Errorf("verify checkpoint path %s: not a directory", params.HostPath)
+	}
+
+	// The sentinel is written only after the artifact has been verified.
+	if err := snapshotruntime.WriteControlSentinel(containerPID, snapshotprotocol.SnapshotCompleteFile); err != nil {
+		ec.kill(log, containerPID, "checkpoint sentinel failed")
+		return fmt.Errorf("write snapshot-complete sentinel: %w", err)
+	}
+	return nil
+}
+
+// kill signals the CUDA-locked process so it does not hang after a failed dump.
+// It sends SIGKILL to pid; a failure to deliver the signal is logged and otherwise
+// ignored because the caller is already returning the original error.
+func (ec *executorCheckpointer) kill(log logr.Logger, pid int, reason string) {
+	if err := snapshotruntime.SendSignalToPID(log, pid, syscall.SIGKILL, reason); err != nil {
+		log.Error(err, "Failed to signal checkpoint process", "reason", reason)
+	}
+}
+
+// containerIDForName returns the CRI-stripped ID reported in the pod's container
+// statuses for the container with the given name, or "" when no status entry carries
+// that name. It matches on name only and does not inspect the container's state.
+func containerIDForName(pod *corev1.Pod, containerName string) string {
+	for _, cs := range pod.Status.ContainerStatuses {
+		if cs.Name == containerName {
+			return snapshotruntime.StripCRIScheme(cs.ContainerID)
+		}
+	}
+	return ""
+}
diff --git a/deploy/snapshot/internal/controller/snapshotcontent_reconciler.go b/deploy/snapshot/internal/controller/snapshotcontent_reconciler.go
new file mode 100644
index 000000000000..6314716a395a
--- /dev/null
+++ b/deploy/snapshot/internal/controller/snapshotcontent_reconciler.go
@@ -0,0 +1,300 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package controller
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+	"sync"
+	"time"
+
+	corev1 "k8s.io/api/core/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/api/meta"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+
+	nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
+	"github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
+	snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol"
+)
+
+// quiesceRequeueInterval is how often the reconciler re-checks a work order that cannot
+// proceed yet: source pod not found, target container not Ready, or lease held elsewhere.
+const quiesceRequeueInterval = 2 * time.Second
+
+// SnapshotContentReconciler is the per-node CSI-style driver. It picks up SnapshotContent
+// work orders for its node, dumps the source container, and writes only
+// SnapshotContent.status (the Ready/Failed conditions). It holds no finalizer.
+type SnapshotContentReconciler struct {
+	client.Client
+	// Clientset is the typed Kubernetes client supplied by the manager setup.
+	Clientset kubernetes.Interface
+	// Config supplies the storage type and base path used to resolve the destination.
+	Config *types.AgentConfig
+	// NodeName is this agent's node; work orders for any other node are ignored.
+	NodeName string
+	// HolderID identifies this agent process (presumably as the lease holder; confirm
+	// against the lease helpers).
+	HolderID string
+	// Checkpointer performs the actual dump; tests substitute a fake.
+	Checkpointer NodeCheckpointer
+
+	// inFlight holds the keys of work orders currently owned by a reconcile pass or a
+	// dump worker. It is guarded by inFlightMu.
+	inFlight   map[string]struct{}
+	inFlightMu sync.Mutex
+}
+
+// Reconcile drives one SnapshotContent through provenance checks, quiesce, dump, and the
+// terminal status write. It never mutates spec and writes status via Status().Patch only.
+//
+// The dump itself runs in a background worker. Reconcile returns as soon as the worker
+// has been started, and the worker writes the terminal status. A work order that is
+// already in flight is left entirely to its owner, so exactly one actor writes its status.
+func (scr *SnapshotContentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	logger := log.FromContext(ctx)
+
+	content := &nvidiacomv1alpha1.SnapshotContent{}
+	if err := scr.Get(ctx, req.NamespacedName, content); err != nil {
+		return ctrl.Result{}, client.IgnoreNotFound(err)
+	}
+
+	// Work orders addressed to another node are not ours to touch.
+	if content.Spec.Source.NodeName != scr.NodeName {
+		return ctrl.Result{}, nil
+	}
+
+	// Idempotency: terminal status means the work is done.
+	if isContentTerminal(content) {
+		return ctrl.Result{}, nil
+	}
+
+	checkpointID, version, err := storageCoordsFromContent(content)
+	if err != nil {
+		return scr.writeFailed(ctx, content, "MissingStorageCoords", err)
+	}
+
+	destination, err := scr.resolveDestination(checkpointID, version)
+	if err != nil {
+		return scr.writeFailed(ctx, content, "InvalidDestination", err)
+	}
+
+	// Claim the in-flight slot before looking at the artifact. While a dump worker owns
+	// this work order the artifact directory can already exist although the worker has
+	// not yet finished verifying it or writing status; treating that as a resume here
+	// would race the worker's own terminal status write.
+	key := req.NamespacedName.String()
+	if !scr.tryAcquire(key) {
+		return ctrl.Result{}, nil
+	}
+	// The slot is released on every return path except the one that hands ownership to
+	// the dump worker, which releases it itself.
+	releaseInFlight := true
+	defer func() {
+		if releaseInFlight {
+			scr.release(key)
+		}
+	}()
+
+	// Resume: a present artifact with unwritten status means a prior dump finished but the
+	// status write did not. The artifact dir exists only after the executor's atomic rename,
+	// so its presence means a completed dump; read via the agent's mounted volume, never
+	// /host/proc//root (there is no live PID on resume).
+	if artifactPresent(destination) {
+		return scr.writeReady(ctx, content)
+	}
+
+	pod, result, err := scr.resolveSourcePod(ctx, content)
+	if err != nil || pod == nil {
+		return result, err
+	}
+
+	// Exactly one target container is requested (min 1, max 1); a successful call is
+	// assumed to return a single-element slice.
+	targets, err := snapshotprotocol.TargetContainersFromAnnotations(pod.Annotations, 1, 1)
+	if err != nil {
+		return scr.writeFailed(ctx, content, "MissingTargetContainer", err)
+	}
+	containerName := targets[0]
+	if !isContainerReady(pod, containerName) {
+		logger.V(1).Info("Source container not ready, requeueing to quiesce", "pod", pod.Name, "container", containerName)
+		return ctrl.Result{RequeueAfter: quiesceRequeueInterval}, nil
+	}
+
+	leaseKey := client.ObjectKey{Namespace: content.Spec.SnapshotRef.Namespace, Name: content.Name}
+	acquired, err := scr.acquireLease(ctx, leaseKey)
+	if err != nil {
+		return ctrl.Result{}, err
+	}
+	if !acquired {
+		return ctrl.Result{RequeueAfter: quiesceRequeueInterval}, nil
+	}
+
+	// Ownership of the in-flight slot and the lease passes to the worker from here on.
+	releaseInFlight = false
+	go scr.runCheckpoint(ctx, content, pod, containerName, checkpointID, destination, leaseKey, key)
+	return ctrl.Result{}, nil
+}
+
+// runCheckpoint executes
the dump under a renewed lease and writes the terminal status.
+func (scr *SnapshotContentReconciler) runCheckpoint(
+	ctx context.Context,
+	content *nvidiacomv1alpha1.SnapshotContent,
+	pod *corev1.Pod,
+	containerName, checkpointID, destination string,
+	leaseKey client.ObjectKey,
+	inFlightKey string,
+) {
+	logger := log.FromContext(ctx)
+	// Registered first, so it runs last: the in-flight slot is freed only after the lease
+	// has been released.
+	defer scr.release(inFlightKey)
+
+	leaseCtx, stopLease := context.WithCancel(ctx)
+	defer stopLease()
+	go scr.renewLease(leaseCtx, leaseKey)
+	defer func() {
+		// Deferred calls run last-in-first-out, so without this explicit cancel the lease
+		// would be released while the renewer's context is still live. Cancel it first so
+		// no renewal is started after the release. Calling the cancel func again from the
+		// outer defer is a no-op.
+		stopLease()
+
+		// Use a fresh, bounded context: ctx may already be cancelled when we get here.
+		releaseCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+		defer cancel()
+		if err := scr.releaseLease(releaseCtx, leaseKey); err != nil {
+			logger.Error(err, "Failed to release checkpoint lease", "lease", leaseKey.String())
+		}
+	}()
+
+	params := CheckpointParams{
+		Pod:           pod,
+		ContainerName: containerName,
+		CheckpointID:  checkpointID,
+		HostPath:      destination,
+		// The same path is passed for both views of the destination.
+		ContainerPath: destination,
+		StartedAt:     time.Now(),
+	}
+	// The dump runs under leaseCtx, so cancelling the lease context also cancels the dump.
+	if err := scr.Checkpointer.Checkpoint(leaseCtx, params); err != nil {
+		logger.Error(err, "Checkpoint failed", "content", content.Name)
+		if _, werr := scr.writeFailed(ctx, content, "CheckpointFailed", err); werr != nil {
+			logger.Error(werr, "Failed to write SnapshotContent failed status", "content", content.Name)
+		}
+		return
+	}
+
+	// A failed status write is only logged; this worker has no caller to return it to.
+	if _, err := scr.writeReady(ctx, content); err != nil {
+		logger.Error(err, "Failed to write SnapshotContent ready status", "content", content.Name)
+	}
+}
+
+// resolveSourcePod loads the source pod and enforces UID provenance and pod liveness.
+// It returns (nil, result, err) when the caller should return result/err instead of dumping.
+func (scr *SnapshotContentReconciler) resolveSourcePod(ctx context.Context, content *nvidiacomv1alpha1.SnapshotContent) (*corev1.Pod, ctrl.Result, error) {
+	wantUID := content.Spec.Source.PodRef.UID
+	lookup := client.ObjectKey{
+		Namespace: content.Spec.SnapshotRef.Namespace,
+		Name:      content.Spec.Source.PodRef.Name,
+	}
+
+	source := &corev1.Pod{}
+	switch getErr := scr.Get(ctx, lookup, source); {
+	case apierrors.IsNotFound(getErr):
+		// The pod is not visible (yet); poll again rather than failing the work order.
+		return nil, ctrl.Result{RequeueAfter: quiesceRequeueInterval}, nil
+	case getErr != nil:
+		return nil, ctrl.Result{}, getErr
+	}
+
+	// Provenance: a recorded UID must match the live pod. An empty recorded UID skips
+	// the comparison.
+	if wantUID != "" && source.UID != wantUID {
+		mismatch := fmt.Errorf("source pod %q UID %q does not match work order UID %q", source.Name, source.UID, wantUID)
+		result, err := scr.writeFailed(ctx, content, "StalePodReference", mismatch)
+		return nil, result, err
+	}
+
+	// Liveness: a pod that is being deleted or has reached a terminal phase cannot be dumped.
+	finished := source.Status.Phase == corev1.PodFailed || source.Status.Phase == corev1.PodSucceeded
+	if source.DeletionTimestamp != nil || finished {
+		gone := fmt.Errorf("source pod %q is no longer running (phase %s)", source.Name, source.Status.Phase)
+		result, err := scr.writeFailed(ctx, content, "SourcePodGone", gone)
+		return nil, result, err
+	}
+	return source, ctrl.Result{}, nil
+}
+
+// resolveDestination maps the checkpoint ID and artifact version to the artifact
+// directory, using the storage type and base path from the agent config. The resolved
+// location is rejected unless it is absolute and already in cleaned form.
+func (scr *SnapshotContentReconciler) resolveDestination(checkpointID, version string) (string, error) {
+	storage := snapshotprotocol.Storage{
+		Type:     scr.Config.Storage.Type,
+		BasePath: scr.Config.Storage.BasePath,
+	}
+	resolved, err := snapshotprotocol.ResolveCheckpointStorage(checkpointID, version, storage)
+	if err != nil {
+		return "", err
+	}
+	if dir := resolved.Location; filepath.IsAbs(dir) && filepath.Clean(dir) == dir {
+		return dir, nil
+	}
+	return "", fmt.Errorf("checkpoint location must be an absolute, clean path: %q", resolved.Location)
+}
+
+// writeReady patches status with the Ready condition.
+func (scr *SnapshotContentReconciler) writeReady(ctx context.Context, content *nvidiacomv1alpha1.SnapshotContent) (ctrl.Result, error) {
+	// Snapshot the object before mutating it so the merge patch carries only the new condition.
+	before := content.DeepCopy()
+	ready := metav1.Condition{
+		Type:    nvidiacomv1alpha1.SnapshotConditionReady,
+		Status:  metav1.ConditionTrue,
+		Reason:  "Captured",
+		Message: "Checkpoint captured and verified",
+	}
+	meta.SetStatusCondition(&content.Status.Conditions, ready)
+
+	err := scr.Status().Patch(ctx, content, client.MergeFrom(before))
+	if err != nil {
+		return ctrl.Result{}, fmt.Errorf("patch SnapshotContent ready status: %w", err)
+	}
+	return ctrl.Result{}, nil
+}
+
+// writeFailed records a terminal failure on the work order: it sets the Failed condition
+// to True with the given reason and the cause's message, and sends it as a status patch.
+func (scr *SnapshotContentReconciler) writeFailed(ctx context.Context, content *nvidiacomv1alpha1.SnapshotContent, reason string, cause error) (ctrl.Result, error) {
+	// Snapshot the object before mutating it so the merge patch carries only the new condition.
+	before := content.DeepCopy()
+	failed := metav1.Condition{
+		Type:    nvidiacomv1alpha1.SnapshotConditionFailed,
+		Status:  metav1.ConditionTrue,
+		Reason:  reason,
+		Message: cause.Error(),
+	}
+	meta.SetStatusCondition(&content.Status.Conditions, failed)
+
+	err := scr.Status().Patch(ctx, content, client.MergeFrom(before))
+	if err != nil {
+		return ctrl.Result{}, fmt.Errorf("patch SnapshotContent failed status: %w", err)
+	}
+	return ctrl.Result{}, nil
+}
+
+// tryAcquire marks key as in flight and reports true, or reports false without changing
+// anything when key is already marked. The set is created on first use.
+func (scr *SnapshotContentReconciler) tryAcquire(key string) bool {
+	scr.inFlightMu.Lock()
+	defer scr.inFlightMu.Unlock()
+
+	// Reading from a nil map is safe, so the lookup can precede the lazy allocation.
+	if _, busy := scr.inFlight[key]; busy {
+		return false
+	}
+	if scr.inFlight == nil {
+		scr.inFlight = make(map[string]struct{})
+	}
+	scr.inFlight[key] = struct{}{}
+	return true
+}
+
+// release clears the in-flight mark for key; releasing a key that is not marked is a no-op.
+func (scr *SnapshotContentReconciler) release(key string) {
+	scr.inFlightMu.Lock()
+	delete(scr.inFlight, key)
+	scr.inFlightMu.Unlock()
+}
+
+// SetupWithManager registers the reconciler.
It watches SnapshotContent only: no predicate and
+// no extra watch is registered here. Node scoping is expected to come from the manager's
+// label-scoped cache, with the spec.source.nodeName check in Reconcile as the backstop.
+func (scr *SnapshotContentReconciler) SetupWithManager(mgr ctrl.Manager) error {
+	builder := ctrl.NewControllerManagedBy(mgr).For(&nvidiacomv1alpha1.SnapshotContent{})
+	return builder.Complete(scr)
+}
+
+// isContentTerminal reports whether the work order carries a Ready or a Failed condition
+// whose status is True.
+func isContentTerminal(content *nvidiacomv1alpha1.SnapshotContent) bool {
+	conditions := content.Status.Conditions
+	return meta.IsStatusConditionTrue(conditions, nvidiacomv1alpha1.SnapshotConditionReady) ||
+		meta.IsStatusConditionTrue(conditions, nvidiacomv1alpha1.SnapshotConditionFailed)
+}
+
+// storageCoordsFromContent extracts the storage coordinates from the work order: the
+// checkpoint ID from its label and the artifact version from its annotation. Both values
+// are whitespace-trimmed. An absent or blank checkpoint ID is an error. An absent version
+// annotation selects the default version, while one that is present but blank is an error.
+func storageCoordsFromContent(content *nvidiacomv1alpha1.SnapshotContent) (string, string, error) {
+	id := strings.TrimSpace(content.Labels[snapshotprotocol.CheckpointIDLabel])
+	if id == "" {
+		return "", "", fmt.Errorf("missing %s label", snapshotprotocol.CheckpointIDLabel)
+	}
+
+	rawVersion, present := content.Annotations[snapshotprotocol.CheckpointArtifactVersionAnnotation]
+	if !present {
+		rawVersion = snapshotprotocol.DefaultCheckpointArtifactVersion
+	}
+	if trimmed := strings.TrimSpace(rawVersion); trimmed != "" {
+		return id, trimmed, nil
+	}
+	return "", "", fmt.Errorf("blank %s annotation", snapshotprotocol.CheckpointArtifactVersionAnnotation)
+}
+
+// artifactPresent reports whether a completed checkpoint directory already exists on disk.
+func artifactPresent(destination string) bool {
+	// Any stat error (including "does not exist") counts as "not present".
+	if info, err := os.Stat(destination); err == nil {
+		return info.IsDir()
+	}
+	return false
+}
+
diff --git a/deploy/snapshot/internal/controller/snapshotcontent_reconciler_test.go b/deploy/snapshot/internal/controller/snapshotcontent_reconciler_test.go
new file mode 100644
index 000000000000..1a96782854b0
--- /dev/null
+++ b/deploy/snapshot/internal/controller/snapshotcontent_reconciler_test.go
@@ -0,0 +1,238 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package controller
+
+import (
+	"context"
+	"errors"
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/meta"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/types"
+	k8sfake "k8s.io/client-go/kubernetes/fake"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	crfake "sigs.k8s.io/controller-runtime/pkg/client/fake"
+
+	nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
+	snapshottypes "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types"
+	snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol"
+)
+
+// fakeCheckpointer records calls and returns a configured error.
+type fakeCheckpointer struct { + called bool + err error +} + +func (fc *fakeCheckpointer) Checkpoint(_ context.Context, _ CheckpointParams) error { + fc.called = true + return fc.err +} + +func contentScheme(t *testing.T) *runtime.Scheme { + t.Helper() + s := runtime.NewScheme() + require.NoError(t, nvidiacomv1alpha1.AddToScheme(s)) + require.NoError(t, corev1.AddToScheme(s)) + return s +} + +func makeContentReconciler(t *testing.T, checkpointer NodeCheckpointer, objs ...client.Object) *SnapshotContentReconciler { + t.Helper() + s := contentScheme(t) + return &SnapshotContentReconciler{ + Client: crfake.NewClientBuilder().WithScheme(s).WithObjects(objs...). + WithStatusSubresource(&nvidiacomv1alpha1.SnapshotContent{}).Build(), + Clientset: k8sfake.NewClientset(), + Config: &snapshottypes.AgentConfig{NodeName: "node-a", Storage: snapshottypes.StorageSpec{Type: "pvc", BasePath: t.TempDir()}}, + NodeName: "node-a", + HolderID: "snapshot-agent/test", + Checkpointer: checkpointer, + inFlight: make(map[string]struct{}), + } +} + +func makeWorkOrder(name, node, checkpointID string) *nvidiacomv1alpha1.SnapshotContent { + return &nvidiacomv1alpha1.SnapshotContent{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: map[string]string{snapshotprotocol.CheckpointIDLabel: checkpointID}, + Annotations: map[string]string{snapshotprotocol.CheckpointArtifactVersionAnnotation: "1"}, + }, + Spec: nvidiacomv1alpha1.SnapshotContentSpec{ + SnapshotRef: nvidiacomv1alpha1.SnapshotReference{Namespace: "inference", Name: "snapshot-" + checkpointID}, + Source: nvidiacomv1alpha1.SnapshotContentSource{PodRef: nvidiacomv1alpha1.PodReference{Name: "worker-0", UID: types.UID("pod-uid")}, NodeName: node}, + }, + } +} + +func reconcileContent(t *testing.T, r *SnapshotContentReconciler, name string) ctrl.Result { + t.Helper() + res, err := r.Reconcile(context.Background(), ctrl.Request{NamespacedName: types.NamespacedName{Name: name}}) + require.NoError(t, err) + return res +} + +func 
getContent(t *testing.T, r *SnapshotContentReconciler, name string) *nvidiacomv1alpha1.SnapshotContent { + t.Helper() + c := &nvidiacomv1alpha1.SnapshotContent{} + require.NoError(t, r.Get(context.Background(), types.NamespacedName{Name: name}, c)) + return c +} + +func TestSnapshotContentReconciler_IgnoresOtherNode(t *testing.T) { + content := makeWorkOrder("snapshotcontent-x", "node-b", "x") + fc := &fakeCheckpointer{} + r := makeContentReconciler(t, fc, content) + + reconcileContent(t, r, content.Name) + assert.False(t, fc.called) + got := getContent(t, r, content.Name) + assert.Empty(t, got.Status.Conditions) +} + +func TestSnapshotContentReconciler_InFlightGuard(t *testing.T) { + content := makeWorkOrder("snapshotcontent-x", "node-a", "x") + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "worker-0", Namespace: "inference", UID: types.UID("pod-uid")}, + Spec: corev1.PodSpec{NodeName: "node-a"}, + Status: corev1.PodStatus{Phase: corev1.PodRunning}, + } + r := makeContentReconciler(t, &fakeCheckpointer{}, content, pod) + // Pre-mark the work order in-flight; the reconcile must short-circuit. 
+ r.inFlight["/snapshotcontent-x"] = struct{}{} + + res := reconcileContent(t, r, content.Name) + assert.Zero(t, res.RequeueAfter) + got := getContent(t, r, content.Name) + assert.Empty(t, got.Status.Conditions) +} + +func TestSnapshotContentReconciler_MissingCheckpointIDFails(t *testing.T) { + content := makeWorkOrder("snapshotcontent-x", "node-a", "x") + delete(content.Labels, snapshotprotocol.CheckpointIDLabel) + r := makeContentReconciler(t, &fakeCheckpointer{}, content) + + reconcileContent(t, r, content.Name) + got := getContent(t, r, content.Name) + cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) + require.NotNil(t, cond) + assert.Equal(t, "MissingStorageCoords", cond.Reason) +} + +func TestSnapshotContentReconciler_ResumeWritesReady(t *testing.T) { + content := makeWorkOrder("snapshotcontent-abc", "node-a", "abc") + r := makeContentReconciler(t, &fakeCheckpointer{}, content) + // Pre-create the artifact directory at the resolved destination. 
+ dest := filepath.Join(r.Config.Storage.BasePath, "abc", "versions", "1") + require.NoError(t, os.MkdirAll(dest, 0o755)) + + reconcileContent(t, r, content.Name) + got := getContent(t, r, content.Name) + cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionReady) + require.NotNil(t, cond) +} + +func TestSnapshotContentReconciler_PodNotFoundBacksOff(t *testing.T) { + content := makeWorkOrder("snapshotcontent-x", "node-a", "x") + r := makeContentReconciler(t, &fakeCheckpointer{}, content) // no pod + + res := reconcileContent(t, r, content.Name) + assert.Positive(t, res.RequeueAfter) + got := getContent(t, r, content.Name) + assert.Empty(t, got.Status.Conditions) +} + +func TestSnapshotContentReconciler_StalePodUIDFails(t *testing.T) { + content := makeWorkOrder("snapshotcontent-x", "node-a", "x") + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "worker-0", Namespace: "inference", UID: types.UID("different-uid")}, + Spec: corev1.PodSpec{NodeName: "node-a"}, + Status: corev1.PodStatus{Phase: corev1.PodRunning}, + } + r := makeContentReconciler(t, &fakeCheckpointer{}, content, pod) + + reconcileContent(t, r, content.Name) + got := getContent(t, r, content.Name) + cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) + require.NotNil(t, cond) + assert.Equal(t, "StalePodReference", cond.Reason) +} + +func TestSnapshotContentReconciler_PodFailedFails(t *testing.T) { + content := makeWorkOrder("snapshotcontent-x", "node-a", "x") + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "worker-0", Namespace: "inference", UID: types.UID("pod-uid")}, + Spec: corev1.PodSpec{NodeName: "node-a"}, + Status: corev1.PodStatus{Phase: corev1.PodFailed}, + } + r := makeContentReconciler(t, &fakeCheckpointer{}, content, pod) + + reconcileContent(t, r, content.Name) + got := getContent(t, r, content.Name) + cond := meta.FindStatusCondition(got.Status.Conditions, 
nvidiacomv1alpha1.SnapshotConditionFailed) + require.NotNil(t, cond) + assert.Equal(t, "SourcePodGone", cond.Reason) +} + +func TestSnapshotContentReconciler_NotReadyQuiesceRequeue(t *testing.T) { + content := makeWorkOrder("snapshotcontent-x", "node-a", "x") + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "worker-0", Namespace: "inference", UID: types.UID("pod-uid"), + Annotations: map[string]string{snapshotprotocol.TargetContainersAnnotation: "main"}, + }, + Spec: corev1.PodSpec{NodeName: "node-a"}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{{Name: "main", Ready: false}}, + }, + } + r := makeContentReconciler(t, &fakeCheckpointer{}, content, pod) + + res := reconcileContent(t, r, content.Name) + assert.Positive(t, res.RequeueAfter) + got := getContent(t, r, content.Name) + assert.Empty(t, got.Status.Conditions) +} + +func TestSnapshotContentReconciler_RunCheckpointWritesReadyOnSuccess(t *testing.T) { + content := makeWorkOrder("snapshotcontent-abc", "node-a", "abc") + fc := &fakeCheckpointer{} + r := makeContentReconciler(t, fc, content) + pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "worker-0", Namespace: "inference", UID: types.UID("pod-uid")}} + leaseKey := client.ObjectKey{Namespace: "inference", Name: content.Name} + + r.runCheckpoint(context.Background(), content, pod, "main", "abc", + filepath.Join(r.Config.Storage.BasePath, "abc", "versions", "1"), leaseKey, "/snapshotcontent-abc") + + assert.True(t, fc.called) + got := getContent(t, r, content.Name) + cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionReady) + require.NotNil(t, cond) +} + +func TestSnapshotContentReconciler_RunCheckpointWritesFailedOnError(t *testing.T) { + content := makeWorkOrder("snapshotcontent-abc", "node-a", "abc") + fc := &fakeCheckpointer{err: errors.New("criu boom")} + r := makeContentReconciler(t, fc, content) + pod := &corev1.Pod{ObjectMeta: 
metav1.ObjectMeta{Name: "worker-0", Namespace: "inference", UID: types.UID("pod-uid")}} + leaseKey := client.ObjectKey{Namespace: "inference", Name: content.Name} + + r.runCheckpoint(context.Background(), content, pod, "main", "abc", + filepath.Join(r.Config.Storage.BasePath, "abc", "versions", "1"), leaseKey, "/snapshotcontent-abc") + + got := getContent(t, r, content.Name) + cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) + require.NotNil(t, cond) + assert.Equal(t, "CheckpointFailed", cond.Reason) +} diff --git a/deploy/snapshot/internal/controller/util.go b/deploy/snapshot/internal/controller/util.go index 58dca5ca4631..c791b03f7e4d 100644 --- a/deploy/snapshot/internal/controller/util.go +++ b/deploy/snapshot/internal/controller/util.go @@ -7,7 +7,6 @@ import ( "time" "github.com/go-logr/logr" - batchv1 "k8s.io/api/batch/v1" coordinationv1 "k8s.io/api/coordination/v1" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -15,6 +14,8 @@ import ( ktypes "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/cache" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" ) const ( @@ -82,164 +83,111 @@ func annotatePod(ctx context.Context, clientset kubernetes.Interface, log logr.L return err } -func getCheckpointJob(ctx context.Context, clientset kubernetes.Interface, pod *corev1.Pod) (*batchv1.Job, error) { - jobName := pod.Labels["batch.kubernetes.io/job-name"] - if jobName == "" { - return nil, fmt.Errorf("pod %s/%s has no batch.kubernetes.io/job-name label", pod.Namespace, pod.Name) - } - - job, err := clientset.BatchV1().Jobs(pod.Namespace).Get(ctx, jobName, metav1.GetOptions{}) - if err != nil { - return nil, fmt.Errorf("failed to get checkpoint job %s/%s: %w", pod.Namespace, jobName, err) - } - return job, nil -} - -func acquireCheckpointLease(ctx context.Context, clientset kubernetes.Interface, log logr.Logger, job 
*batchv1.Job, holderIdentity string) (bool, error) { - leaseName := job.Name +// acquireLease acquires or renews a checkpoint lease at an arbitrary namespace/name key, +// returning false when another live holder owns it. +func (scr *SnapshotContentReconciler) acquireLease(ctx context.Context, key client.ObjectKey) (bool, error) { now := metav1.NewMicroTime(time.Now()) leaseDurationSeconds := int32(checkpointLeaseDuration.Seconds()) - leaseClient := clientset.CoordinationV1().Leases(job.Namespace) - existingLease, err := leaseClient.Get(ctx, leaseName, metav1.GetOptions{}) + leaseClient := scr.Clientset.CoordinationV1().Leases(key.Namespace) + existing, err := leaseClient.Get(ctx, key.Name, metav1.GetOptions{}) if err != nil { if !apierrors.IsNotFound(err) { - return false, fmt.Errorf("failed to get checkpoint lease %s/%s: %w", job.Namespace, leaseName, err) + return false, fmt.Errorf("get checkpoint lease %s: %w", key.String(), err) } - lease := &coordinationv1.Lease{ - ObjectMeta: metav1.ObjectMeta{ - Name: leaseName, - Namespace: job.Namespace, - }, + ObjectMeta: metav1.ObjectMeta{Name: key.Name, Namespace: key.Namespace}, Spec: coordinationv1.LeaseSpec{ - HolderIdentity: &holderIdentity, + HolderIdentity: &scr.HolderID, LeaseDurationSeconds: &leaseDurationSeconds, AcquireTime: &now, RenewTime: &now, }, } - if _, err := leaseClient.Create(ctx, lease, metav1.CreateOptions{}); err != nil { if apierrors.IsAlreadyExists(err) { return false, nil } - return false, fmt.Errorf("failed to create checkpoint lease %s/%s: %w", job.Namespace, leaseName, err) + return false, fmt.Errorf("create checkpoint lease %s: %w", key.String(), err) } return true, nil } - if !checkpointLeaseExpired(existingLease, now.Time) && - existingLease.Spec.HolderIdentity != nil && - *existingLease.Spec.HolderIdentity != holderIdentity { + if !checkpointLeaseExpired(existing, now.Time) && + existing.Spec.HolderIdentity != nil && + *existing.Spec.HolderIdentity != scr.HolderID { return false, nil } 
- - existingLease.Spec.HolderIdentity = &holderIdentity - existingLease.Spec.LeaseDurationSeconds = &leaseDurationSeconds - if existingLease.Spec.AcquireTime == nil || checkpointLeaseExpired(existingLease, now.Time) { - existingLease.Spec.AcquireTime = &now + existing.Spec.HolderIdentity = &scr.HolderID + existing.Spec.LeaseDurationSeconds = &leaseDurationSeconds + if existing.Spec.AcquireTime == nil || checkpointLeaseExpired(existing, now.Time) { + existing.Spec.AcquireTime = &now } - existingLease.Spec.RenewTime = &now - - if _, err := leaseClient.Update(ctx, existingLease, metav1.UpdateOptions{}); err != nil { + existing.Spec.RenewTime = &now + if _, err := leaseClient.Update(ctx, existing, metav1.UpdateOptions{}); err != nil { if apierrors.IsConflict(err) { - log.V(1).Info("Checkpoint lease update conflicted", "lease", fmt.Sprintf("%s/%s", job.Namespace, leaseName)) return false, nil } - return false, fmt.Errorf("failed to update checkpoint lease %s/%s: %w", job.Namespace, leaseName, err) + return false, fmt.Errorf("update checkpoint lease %s: %w", key.String(), err) } - return true, nil } -func renewCheckpointLease(ctx context.Context, clientset kubernetes.Interface, job *batchv1.Job, holderIdentity string) error { - leaseName := job.Name - leaseClient := clientset.CoordinationV1().Leases(job.Namespace) - lease, err := leaseClient.Get(ctx, leaseName, metav1.GetOptions{}) +// renewLease periodically renews the lease until ctx is cancelled. +func (scr *SnapshotContentReconciler) renewLease(ctx context.Context, key client.ObjectKey) { + ticker := time.NewTicker(checkpointLeaseRenewInterval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + if err := scr.renewLeaseOnce(ctx, key); err != nil { + log.FromContext(ctx).Error(err, "Failed to renew checkpoint lease", "lease", key.String()) + return + } + } + } +} + +// renewLeaseOnce bumps the lease renew time, failing if this holder no longer owns it. 
+func (scr *SnapshotContentReconciler) renewLeaseOnce(ctx context.Context, key client.ObjectKey) error { + leaseClient := scr.Clientset.CoordinationV1().Leases(key.Namespace) + lease, err := leaseClient.Get(ctx, key.Name, metav1.GetOptions{}) if err != nil { - return fmt.Errorf("failed to get checkpoint lease %s/%s for renewal: %w", job.Namespace, leaseName, err) + return fmt.Errorf("get checkpoint lease %s for renewal: %w", key.String(), err) } - if lease.Spec.HolderIdentity == nil || *lease.Spec.HolderIdentity != holderIdentity { - return fmt.Errorf("checkpoint lease %s/%s is no longer held by %q", job.Namespace, leaseName, holderIdentity) + if lease.Spec.HolderIdentity == nil || *lease.Spec.HolderIdentity != scr.HolderID { + return fmt.Errorf("checkpoint lease %s is no longer held by %q", key.String(), scr.HolderID) } - now := metav1.NewMicroTime(time.Now()) leaseDurationSeconds := int32(checkpointLeaseDuration.Seconds()) lease.Spec.LeaseDurationSeconds = &leaseDurationSeconds lease.Spec.RenewTime = &now - if _, err := leaseClient.Update(ctx, lease, metav1.UpdateOptions{}); err != nil { - return fmt.Errorf("failed to renew checkpoint lease %s/%s: %w", job.Namespace, leaseName, err) + return fmt.Errorf("renew checkpoint lease %s: %w", key.String(), err) } return nil } -func releaseCheckpointLease(ctx context.Context, clientset kubernetes.Interface, log logr.Logger, job *batchv1.Job, holderIdentity string) error { - leaseName := job.Name - leaseClient := clientset.CoordinationV1().Leases(job.Namespace) - lease, err := leaseClient.Get(ctx, leaseName, metav1.GetOptions{}) +// releaseLease deletes the lease if this holder owns it. 
+func (scr *SnapshotContentReconciler) releaseLease(ctx context.Context, key client.ObjectKey) error { + leaseClient := scr.Clientset.CoordinationV1().Leases(key.Namespace) + lease, err := leaseClient.Get(ctx, key.Name, metav1.GetOptions{}) if err != nil { if apierrors.IsNotFound(err) { return nil } - return fmt.Errorf("failed to get checkpoint lease %s/%s for release: %w", job.Namespace, leaseName, err) + return fmt.Errorf("get checkpoint lease %s for release: %w", key.String(), err) } - - if lease.Spec.HolderIdentity == nil || *lease.Spec.HolderIdentity != holderIdentity { - log.V(1).Info("Skipping checkpoint lease release because another holder owns it", - "lease", fmt.Sprintf("%s/%s", job.Namespace, leaseName), - "holder", holderIdentity, - ) + if lease.Spec.HolderIdentity == nil || *lease.Spec.HolderIdentity != scr.HolderID { return nil } - - if err := leaseClient.Delete(ctx, leaseName, metav1.DeleteOptions{}); err != nil && !apierrors.IsNotFound(err) { - return fmt.Errorf("failed to delete checkpoint lease %s/%s: %w", job.Namespace, leaseName, err) + if err := leaseClient.Delete(ctx, key.Name, metav1.DeleteOptions{}); err != nil && !apierrors.IsNotFound(err) { + return fmt.Errorf("delete checkpoint lease %s: %w", key.String(), err) } return nil } - -func (w *NodeController) renewCheckpointLease(ctx context.Context, log logr.Logger, job *batchv1.Job, stopLease context.CancelCauseFunc) { - ticker := time.NewTicker(checkpointLeaseRenewInterval) - defer ticker.Stop() - - for { - select { - case <-ctx.Done(): - return - case <-ticker.C: - if err := renewCheckpointLease(ctx, w.clientset, job, w.holderID); err != nil { - log.Error(err, "Failed to renew checkpoint lease") - stopLease(fmt.Errorf("checkpoint lease renewal failed: %w", err)) - return - } - } - } -} - -func annotateJob(ctx context.Context, clientset kubernetes.Interface, log logr.Logger, job *batchv1.Job, annotations map[string]string) error { - patchBytes, err := json.Marshal(map[string]any{ - 
"metadata": map[string]any{ - "annotations": annotations, - }, - }) - if err != nil { - return fmt.Errorf("failed to build job annotation patch payload: %w", err) - } - - _, err = clientset.BatchV1().Jobs(job.Namespace).Patch( - ctx, job.Name, ktypes.MergePatchType, patchBytes, metav1.PatchOptions{}, - ) - if err != nil { - log.Error(err, "Failed to annotate checkpoint job", - "job", fmt.Sprintf("%s/%s", job.Namespace, job.Name), - "annotations", annotations, - ) - } - return err -} - func emitPodEvent(ctx context.Context, clientset kubernetes.Interface, log logr.Logger, pod *corev1.Pod, component, eventType, reason, message string) { event := &corev1.Event{ ObjectMeta: metav1.ObjectMeta{ From 65208a9bd92666af22e7e9bd2d3240421ae0a0f7 Mon Sep 17 00:00:00 2001 From: Ron Kahn Date: Mon, 15 Jun 2026 19:22:12 +0300 Subject: [PATCH 05/14] feat(operator): add Snapshot status condition helpers Signed-off-by: Ron Kahn --- deploy/operator/api/v1alpha1/snapshot_types.go | 11 +++++++++++ deploy/operator/api/v1alpha1/snapshotcontent_types.go | 11 +++++++++++ 2 files changed, 22 insertions(+) diff --git a/deploy/operator/api/v1alpha1/snapshot_types.go b/deploy/operator/api/v1alpha1/snapshot_types.go index 0918fddba06a..5ecc9d162763 100644 --- a/deploy/operator/api/v1alpha1/snapshot_types.go +++ b/deploy/operator/api/v1alpha1/snapshot_types.go @@ -18,6 +18,7 @@ package v1alpha1 import ( + "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" ) @@ -32,6 +33,16 @@ const ( SnapshotConditionFailed = "Failed" ) +// IsSnapshotSucceeded reports whether the Snapshot's Ready condition is True. +func IsSnapshotSucceeded(s *Snapshot) bool { + return meta.IsStatusConditionTrue(s.Status.Conditions, SnapshotConditionReady) +} + +// IsSnapshotFailed reports whether the Snapshot's Failed condition is True. 
+func IsSnapshotFailed(s *Snapshot) bool { + return meta.IsStatusConditionTrue(s.Status.Conditions, SnapshotConditionFailed) +} + // SnapshotSpec defines the desired state of Snapshot. // // Minimal "trigger" shape: it names what to capture (an existing pod) and the diff --git a/deploy/operator/api/v1alpha1/snapshotcontent_types.go b/deploy/operator/api/v1alpha1/snapshotcontent_types.go index 863a61cf3cb6..79274590d587 100644 --- a/deploy/operator/api/v1alpha1/snapshotcontent_types.go +++ b/deploy/operator/api/v1alpha1/snapshotcontent_types.go @@ -18,10 +18,21 @@ package v1alpha1 import ( + "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" ) +// IsSnapshotContentSucceeded reports whether the SnapshotContent's Ready condition is True. +func IsSnapshotContentSucceeded(c *SnapshotContent) bool { + return meta.IsStatusConditionTrue(c.Status.Conditions, SnapshotConditionReady) +} + +// IsSnapshotContentFailed reports whether the SnapshotContent's Failed condition is True. +func IsSnapshotContentFailed(c *SnapshotContent) bool { + return meta.IsStatusConditionTrue(c.Status.Conditions, SnapshotConditionFailed) +} + // SnapshotContentSpec defines the desired state of SnapshotContent. It is // populated by the SnapshotReconciler (operator) at creation time and is // immutable thereafter. 
From 3e955292974da0adf0b93d7ba0599ecab84d91aa Mon Sep 17 00:00:00 2001 From: Ron Kahn Date: Mon, 15 Jun 2026 19:22:13 +0300 Subject: [PATCH 06/14] refactor(operator): apply Component C PR review feedback Signed-off-by: Ron Kahn --- .../operator/templates/manager-rbac.yaml | 1 - deploy/operator/config/rbac/role.yaml | 1 - .../controller/dynamocheckpoint_controller.go | 59 ++++--- .../controller/snapshot_reconciler.go | 149 ++++++++---------- .../controller/snapshot_reconciler_test.go | 12 +- 5 files changed, 103 insertions(+), 119 deletions(-) diff --git a/deploy/helm/charts/platform/components/operator/templates/manager-rbac.yaml b/deploy/helm/charts/platform/components/operator/templates/manager-rbac.yaml index 988a2a42364a..0b9c0ef7e899 100644 --- a/deploy/helm/charts/platform/components/operator/templates/manager-rbac.yaml +++ b/deploy/helm/charts/platform/components/operator/templates/manager-rbac.yaml @@ -348,7 +348,6 @@ rules: - dynamographdeploymentrequests/finalizers - dynamographdeployments/finalizers - dynamomodels/finalizers - - snapshotcontents/finalizers - snapshots/finalizers verbs: - update diff --git a/deploy/operator/config/rbac/role.yaml b/deploy/operator/config/rbac/role.yaml index 45a0892273a3..678282e51e9d 100644 --- a/deploy/operator/config/rbac/role.yaml +++ b/deploy/operator/config/rbac/role.yaml @@ -245,7 +245,6 @@ rules: - dynamographdeploymentrequests/finalizers - dynamographdeployments/finalizers - dynamomodels/finalizers - - snapshotcontents/finalizers - snapshots/finalizers verbs: - update diff --git a/deploy/operator/internal/controller/dynamocheckpoint_controller.go b/deploy/operator/internal/controller/dynamocheckpoint_controller.go index 3199b5a213a8..38949fc9d1b1 100644 --- a/deploy/operator/internal/controller/dynamocheckpoint_controller.go +++ b/deploy/operator/internal/controller/dynamocheckpoint_controller.go @@ -30,6 +30,7 @@ import ( "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 
"k8s.io/client-go/tools/record" + "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" @@ -356,8 +357,6 @@ func (r *CheckpointReconciler) handleCreating(ctx context.Context, ckpt *nvidiac // hang guards) onto the DynamoCheckpoint phase. Completion cascades up from SnapshotContent // → Snapshot → DynamoCheckpoint, so this never reads the Job's terminal annotation. func (r *CheckpointReconciler) observeSnapshot(ctx context.Context, ckpt *nvidiacomv1alpha1.DynamoCheckpoint, job *batchv1.Job, checkpointID string) (ctrl.Result, error) { - logger := log.FromContext(ctx) - snap := &nvidiacomv1alpha1.Snapshot{} if err := r.Get(ctx, client.ObjectKey{Namespace: ckpt.Namespace, Name: snapshotName(checkpointID)}, snap); err != nil { if apierrors.IsNotFound(err) { @@ -367,26 +366,14 @@ func (r *CheckpointReconciler) observeSnapshot(ctx context.Context, ckpt *nvidia } // Read Snapshot.status only once it is bound; an unbound Snapshot is still being - // set up by the SnapshotReconciler. + // set up by the SnapshotReconciler. Observe the terminal state and delegate the + // status write to a dedicated function. 
if snap.Status.BoundSnapshotContentName != nil { - if cond := meta.FindStatusCondition(snap.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionReady); cond != nil && cond.Status == metav1.ConditionTrue { - logger.Info("Snapshot ready", "snapshot", snap.Name) - r.Recorder.Event(ckpt, corev1.EventTypeNormal, "CheckpointReady", cond.Message) - now := metav1.Now() - ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhaseReady - ckpt.Status.CheckpointID = checkpointID - ckpt.Status.CreatedAt = &now - ckpt.Status.Message = "" - meta.SetStatusCondition(&ckpt.Status.Conditions, metav1.Condition{ - Type: string(nvidiacomv1alpha1.DynamoCheckpointConditionJobCompleted), - Status: metav1.ConditionTrue, - Reason: "SnapshotReady", - Message: cond.Message, - }) - return ctrl.Result{}, r.Status().Update(ctx, ckpt) + if nvidiacomv1alpha1.IsSnapshotSucceeded(snap) { + return r.markCheckpointReady(ctx, ckpt, checkpointID, snapshotConditionMessage(snap, nvidiacomv1alpha1.SnapshotConditionReady)) } - if cond := meta.FindStatusCondition(snap.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed); cond != nil && cond.Status == metav1.ConditionTrue { - return r.failCreating(ctx, ckpt, "SnapshotFailed", cond.Message) + if nvidiacomv1alpha1.IsSnapshotFailed(snap) { + return r.failCreating(ctx, ckpt, "SnapshotFailed", snapshotConditionMessage(snap, nvidiacomv1alpha1.SnapshotConditionFailed)) } } @@ -422,6 +409,31 @@ func (r *CheckpointReconciler) failCreating(ctx context.Context, ckpt *nvidiacom return ctrl.Result{}, r.Status().Update(ctx, ckpt) } +// markCheckpointReady marks the DynamoCheckpoint Ready after its bound Snapshot succeeded. 
+func (r *CheckpointReconciler) markCheckpointReady(ctx context.Context, ckpt *nvidiacomv1alpha1.DynamoCheckpoint, checkpointID, message string) (ctrl.Result, error) { + log.FromContext(ctx).Info("Checkpoint ready", "checkpointID", checkpointID) + r.Recorder.Event(ckpt, corev1.EventTypeNormal, "CheckpointReady", message) + ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhaseReady + ckpt.Status.CheckpointID = checkpointID + ckpt.Status.CreatedAt = ptr.To(metav1.Now()) + ckpt.Status.Message = "" + meta.SetStatusCondition(&ckpt.Status.Conditions, metav1.Condition{ + Type: string(nvidiacomv1alpha1.DynamoCheckpointConditionJobCompleted), + Status: metav1.ConditionTrue, + Reason: "SnapshotReady", + Message: message, + }) + return ctrl.Result{}, r.Status().Update(ctx, ckpt) +} + +// snapshotConditionMessage returns the message of the named Snapshot condition, or "". +func snapshotConditionMessage(snap *nvidiacomv1alpha1.Snapshot, condType string) string { + if cond := meta.FindStatusCondition(snap.Status.Conditions, condType); cond != nil { + return cond.Message + } + return "" +} + // checkpointJobFailed reports whether the Job has a True JobFailed condition. func checkpointJobFailed(job *batchv1.Job) (bool, string) { for _, condition := range job.Status.Conditions { @@ -520,10 +532,11 @@ func (r *CheckpointReconciler) SetupWithManager(mgr ctrl.Manager) error { GenericFunc: func(ge event.GenericEvent) bool { return true }, })). Owns(&nvidiacomv1alpha1.Snapshot{}, builder.WithPredicates(predicate.Funcs{ - // Status mirror cascades up via Snapshot status updates only; ignore - // create/delete so the mirror cannot storm. + // Ignore create (we just created it). Watch update (status mirror) and + // delete (re-enqueue to recreate / unblock). Delete is safe: reconcile + // exits at the deletion-timestamp guard before reaching observeSnapshot. 
CreateFunc: func(ce event.CreateEvent) bool { return false }, - DeleteFunc: func(de event.DeleteEvent) bool { return false }, + DeleteFunc: func(de event.DeleteEvent) bool { return true }, UpdateFunc: func(ue event.UpdateEvent) bool { return true }, GenericFunc: func(ge event.GenericEvent) bool { return false }, })). diff --git a/deploy/operator/internal/controller/snapshot_reconciler.go b/deploy/operator/internal/controller/snapshot_reconciler.go index 6377960b306a..22f4b0e7e515 100644 --- a/deploy/operator/internal/controller/snapshot_reconciler.go +++ b/deploy/operator/internal/controller/snapshot_reconciler.go @@ -33,13 +33,10 @@ import ( "k8s.io/client-go/tools/cache" "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" - "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/reconcile" nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" @@ -47,7 +44,8 @@ import ( ) const ( - // snapshotFinalizer guards SnapshotContent cleanup before a Snapshot is removed. + // snapshotFinalizer is set on the Snapshot so its bound SnapshotContent is + // deleted before the Snapshot is removed. snapshotFinalizer = "nvidia.com/snapshot-content-cleanup" // snapshotContentFieldManager is the Server-Side Apply field owner for SnapshotContents. 
@@ -81,7 +79,6 @@ type SnapshotReconciler struct { // +kubebuilder:rbac:groups=nvidia.com,resources=snapshots/finalizers,verbs=update // +kubebuilder:rbac:groups=nvidia.com,resources=snapshotcontents,verbs=create;get;list;watch;update;patch;delete // +kubebuilder:rbac:groups=nvidia.com,resources=snapshotcontents/status,verbs=get;update;patch -// +kubebuilder:rbac:groups=nvidia.com,resources=snapshotcontents/finalizers,verbs=update // +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch // Reconcile drives a Snapshot through binding, status mirroring, and cascade deletion. @@ -105,14 +102,18 @@ func (sr *SnapshotReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( return ctrl.Result{}, nil } - pod, err := sr.resolveSourcePod(ctx, snap) + pod, err := sr.getSourcePod(ctx, snap) if err != nil { - if errors.Is(err, errSnapshotPodUnscheduled) || apierrors.IsNotFound(err) { - logger.V(1).Info("Source pod not ready, backing off", "snapshot", snap.Name, "reason", err.Error()) + if apierrors.IsNotFound(err) { + logger.V(1).Info("Source pod not found, backing off", "snapshot", snap.Name) return ctrl.Result{RequeueAfter: jitteredBackoff(snapshotPodResolveBackoffBase)}, nil } return ctrl.Result{}, err } + if err := validateSourcePod(pod); err != nil { + logger.V(1).Info("Source pod not ready, backing off", "snapshot", snap.Name, "reason", err.Error()) + return ctrl.Result{RequeueAfter: jitteredBackoff(snapshotPodResolveBackoffBase)}, nil + } contentName := snapshotContentName(snap.Spec.CheckpointID) if errs := validation.IsDNS1123Subdomain(contentName); len(errs) > 0 || len(contentName) > maxResourceNameLength { @@ -120,61 +121,57 @@ func (sr *SnapshotReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( fmt.Errorf("composed SnapshotContent name %q is invalid: too long or not a DNS subdomain", contentName)) } - bound, err := sr.findBoundContent(ctx, contentName) + content, err := sr.ensureSnapshotContent(ctx, snap, contentName, pod) if err != nil 
{ return ctrl.Result{}, err } - if bound != nil && bound.Spec.Source.NodeName != pod.Spec.NodeName { + // A freshly-created content always matches; only a pre-existing content whose + // source pod was rescheduled to another node mismatches (spec is immutable). + if content.Spec.Source.NodeName != pod.Spec.NodeName { return sr.failSnapshot(ctx, snap, "PodRescheduled", fmt.Errorf("source pod moved from node %q to %q; CRIU checkpoint cannot survive migration", - bound.Spec.Source.NodeName, pod.Spec.NodeName)) - } - - if err := sr.ensureSnapshotContent(ctx, snap, contentName, pod); err != nil { - return ctrl.Result{}, err + content.Spec.Source.NodeName, pod.Spec.NodeName)) } - if err := sr.bindAndMirror(ctx, snap, contentName); err != nil { - return ctrl.Result{}, err - } - return ctrl.Result{}, nil + return sr.propagateStatus(ctx, snap, content) } -// resolveSourcePod loads the source pod and requires it be scheduled to a node. -func (sr *SnapshotReconciler) resolveSourcePod(ctx context.Context, snap *nvidiacomv1alpha1.Snapshot) (*corev1.Pod, error) { +// getSourcePod loads the source pod referenced by the Snapshot. +func (sr *SnapshotReconciler) getSourcePod(ctx context.Context, snap *nvidiacomv1alpha1.Snapshot) (*corev1.Pod, error) { pod := &corev1.Pod{} key := client.ObjectKey{Namespace: snap.Namespace, Name: snap.Spec.Source.PodRef.Name} if err := sr.Get(ctx, key, pod); err != nil { return nil, err } + return pod, nil +} + +// validateSourcePod requires the pod to be scheduled to a node. +func validateSourcePod(pod *corev1.Pod) error { if pod.Spec.NodeName == "" { - return nil, errSnapshotPodUnscheduled + return errSnapshotPodUnscheduled } - return pod, nil + return nil } -// findBoundContent returns the bound SnapshotContent if it already exists, or nil. 
-func (sr *SnapshotReconciler) findBoundContent(ctx context.Context, contentName string) (*nvidiacomv1alpha1.SnapshotContent, error) { - content := &nvidiacomv1alpha1.SnapshotContent{} - if err := sr.Get(ctx, client.ObjectKey{Name: contentName}, content); err != nil { - if apierrors.IsNotFound(err) { - return nil, nil - } +// ensureSnapshotContent returns the existing SnapshotContent or, when absent, creates it +// via a single Server-Side Apply carrying source, the node mirror label, and storage-coord +// metadata. The returned object is the source of truth for the reschedule guard. +func (sr *SnapshotReconciler) ensureSnapshotContent(ctx context.Context, snap *nvidiacomv1alpha1.Snapshot, contentName string, pod *corev1.Pod) (*nvidiacomv1alpha1.SnapshotContent, error) { + existing := &nvidiacomv1alpha1.SnapshotContent{} + if err := sr.Get(ctx, client.ObjectKey{Name: contentName}, existing); err == nil { + return existing, nil + } else if !apierrors.IsNotFound(err) { return nil, err } - return content, nil -} -// ensureSnapshotContent applies the SnapshotContent work order via a single Server-Side -// Apply carrying source, the node mirror label, storage-coord metadata, and the finalizer. -func (sr *SnapshotReconciler) ensureSnapshotContent(ctx context.Context, snap *nvidiacomv1alpha1.Snapshot, contentName string, pod *corev1.Pod) error { content := sr.buildSnapshotContent(snap, contentName, pod) if err := sr.Patch(ctx, content, client.Apply, client.FieldOwner(snapshotContentFieldManager), client.ForceOwnership); err != nil { sr.Recorder.Event(snap, corev1.EventTypeWarning, "SnapshotContentCreateFailed", err.Error()) - return fmt.Errorf("apply SnapshotContent %q: %w", contentName, err) + return nil, fmt.Errorf("apply SnapshotContent %q: %w", contentName, err) } - return nil + return content, nil } // buildSnapshotContent constructs the desired cluster-scoped SnapshotContent for a Snapshot. 
@@ -193,7 +190,6 @@ func (sr *SnapshotReconciler) buildSnapshotContent(snap *nvidiacomv1alpha1.Snaps Annotations: map[string]string{ snapshotprotocol.CheckpointArtifactVersionAnnotation: snapshotprotocol.ArtifactVersion(snap.Annotations[snapshotprotocol.CheckpointArtifactVersionAnnotation]), }, - Finalizers: []string{snapshotFinalizer}, }, Spec: nvidiacomv1alpha1.SnapshotContentSpec{ SnapshotRef: nvidiacomv1alpha1.SnapshotReference{ @@ -209,38 +205,35 @@ func (sr *SnapshotReconciler) buildSnapshotContent(snap *nvidiacomv1alpha1.Snaps } } -// bindAndMirror records the binding and mirrors the SnapshotContent's terminal status to -// the Snapshot, defaulting to a Pending condition until the agent writes a result. -func (sr *SnapshotReconciler) bindAndMirror(ctx context.Context, snap *nvidiacomv1alpha1.Snapshot, contentName string) error { - content := &nvidiacomv1alpha1.SnapshotContent{} - if err := sr.Get(ctx, client.ObjectKey{Name: contentName}, content); err != nil { - return client.IgnoreNotFound(err) - } - +// propagateStatus records the binding and mirrors the SnapshotContent's terminal status to +// the Snapshot, defaulting to a Pending condition until the agent writes a result. It +// receives the content resolved earlier in the reconcile, so it never re-Gets it. 
+func (sr *SnapshotReconciler) propagateStatus(ctx context.Context, snap *nvidiacomv1alpha1.Snapshot, content *nvidiacomv1alpha1.SnapshotContent) (ctrl.Result, error) { changed := false - if snap.Status.BoundSnapshotContentName == nil || *snap.Status.BoundSnapshotContentName != contentName { - snap.Status.BoundSnapshotContentName = &contentName + if snap.Status.BoundSnapshotContentName == nil || *snap.Status.BoundSnapshotContentName != content.Name { + name := content.Name + snap.Status.BoundSnapshotContentName = &name changed = true } - ready := meta.FindStatusCondition(content.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionReady) - failed := meta.FindStatusCondition(content.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) switch { - case ready != nil && ready.Status == metav1.ConditionTrue: - changed = sr.setCondition(snap, nvidiacomv1alpha1.SnapshotConditionReady, metav1.ConditionTrue, ready.Reason, ready.Message) || changed - case failed != nil && failed.Status == metav1.ConditionTrue: - changed = sr.setCondition(snap, nvidiacomv1alpha1.SnapshotConditionFailed, metav1.ConditionTrue, failed.Reason, failed.Message) || changed + case nvidiacomv1alpha1.IsSnapshotContentSucceeded(content): + cond := meta.FindStatusCondition(content.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionReady) + changed = sr.setCondition(snap, nvidiacomv1alpha1.SnapshotConditionReady, metav1.ConditionTrue, cond.Reason, cond.Message) || changed + case nvidiacomv1alpha1.IsSnapshotContentFailed(content): + cond := meta.FindStatusCondition(content.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) + changed = sr.setCondition(snap, nvidiacomv1alpha1.SnapshotConditionFailed, metav1.ConditionTrue, cond.Reason, cond.Message) || changed default: changed = sr.setCondition(snap, nvidiacomv1alpha1.SnapshotConditionReady, metav1.ConditionFalse, "Pending", "Waiting for node agent to capture the checkpoint") || changed } if !changed { - return nil + return 
ctrl.Result{}, nil } if err := sr.Status().Update(ctx, snap); err != nil { - return fmt.Errorf("update snapshot status: %w", err) + return ctrl.Result{}, fmt.Errorf("update snapshot status: %w", err) } - return nil + return ctrl.Result{}, nil } // setCondition sets a status condition and reports whether it changed. @@ -263,37 +256,25 @@ func (sr *SnapshotReconciler) failSnapshot(ctx context.Context, snap *nvidiacomv return ctrl.Result{}, nil } -// handleDelete cascades deletion to the bound SnapshotContent, waits for it to be gone, -// then drops the Snapshot finalizer. +// handleDelete cascades deletion to the bound SnapshotContent and blocks (requeues) until +// it is gone before dropping the Snapshot finalizer. The SnapshotContent carries no +// finalizer of its own, so the Delete takes effect immediately. func (sr *SnapshotReconciler) handleDelete(ctx context.Context, snap *nvidiacomv1alpha1.Snapshot) (ctrl.Result, error) { if !controllerutil.ContainsFinalizer(snap, snapshotFinalizer) { return ctrl.Result{}, nil } contentName := snapshotContentName(snap.Spec.CheckpointID) - content := &nvidiacomv1alpha1.SnapshotContent{} - err := sr.Get(ctx, client.ObjectKey{Name: contentName}, content) - switch { - case err == nil: - // Clear the controller finalizer first so the subsequent Delete is not - // blocked, then issue the Delete. The reconcile requeues until the - // SnapshotContent is fully gone. 
- if controllerutil.ContainsFinalizer(content, snapshotFinalizer) { - controllerutil.RemoveFinalizer(content, snapshotFinalizer) - if updErr := sr.Update(ctx, content); updErr != nil && !apierrors.IsNotFound(updErr) { - return ctrl.Result{}, fmt.Errorf("clear SnapshotContent %q finalizer: %w", contentName, updErr) - } - } - if content.GetDeletionTimestamp().IsZero() { - if delErr := sr.Delete(ctx, content); delErr != nil && !apierrors.IsNotFound(delErr) { - return ctrl.Result{}, fmt.Errorf("delete SnapshotContent %q: %w", contentName, delErr) - } - } + content := &nvidiacomv1alpha1.SnapshotContent{ObjectMeta: metav1.ObjectMeta{Name: contentName}} + if err := sr.Delete(ctx, content); err != nil && !apierrors.IsNotFound(err) { + return ctrl.Result{}, fmt.Errorf("delete SnapshotContent %q: %w", contentName, err) + } + + // Block until the content is confirmed gone before releasing the Snapshot. + if err := sr.Get(ctx, client.ObjectKey{Name: contentName}, &nvidiacomv1alpha1.SnapshotContent{}); err == nil { return ctrl.Result{RequeueAfter: snapshotContentDeleteRequeue}, nil - case apierrors.IsNotFound(err): - // SnapshotContent gone; drop the Snapshot finalizer. - default: - return ctrl.Result{}, fmt.Errorf("get SnapshotContent %q: %w", contentName, err) + } else if !apierrors.IsNotFound(err) { + return ctrl.Result{}, fmt.Errorf("confirm SnapshotContent %q deleted: %w", contentName, err) } controllerutil.RemoveFinalizer(snap, snapshotFinalizer) @@ -311,12 +292,6 @@ func (sr *SnapshotReconciler) SetupWithManager(mgr ctrl.Manager) error { Watches( &nvidiacomv1alpha1.SnapshotContent{}, handler.EnqueueRequestsFromMapFunc(snapshotContentToSnapshot), - builder.WithPredicates(predicate.Funcs{ - CreateFunc: func(event.CreateEvent) bool { return false }, - UpdateFunc: func(ue event.UpdateEvent) bool { return true }, - DeleteFunc: func(event.DeleteEvent) bool { return true }, - GenericFunc: func(event.GenericEvent) bool { return false }, - }), ). 
Complete(sr) } diff --git a/deploy/operator/internal/controller/snapshot_reconciler_test.go b/deploy/operator/internal/controller/snapshot_reconciler_test.go index 95bd75876677..0145cbaf77a2 100644 --- a/deploy/operator/internal/controller/snapshot_reconciler_test.go +++ b/deploy/operator/internal/controller/snapshot_reconciler_test.go @@ -116,7 +116,7 @@ func TestSnapshotReconciler_BuildsWorkOrderAndBinds(t *testing.T) { assert.Equal(t, "node-a", content.Labels[snapshotprotocol.SnapshotNodeLabel]) assert.Equal(t, "abc123", content.Labels[snapshotprotocol.CheckpointIDLabel]) assert.Equal(t, "3", content.Annotations[snapshotprotocol.CheckpointArtifactVersionAnnotation]) - assert.Contains(t, content.Finalizers, snapshotFinalizer) + assert.Empty(t, content.Finalizers) assert.Equal(t, "inference", content.Spec.SnapshotRef.Namespace) assert.Equal(t, snap.Name, content.Spec.SnapshotRef.Name) @@ -213,7 +213,7 @@ func TestSnapshotReconciler_CascadeDelete(t *testing.T) { snap := makeSnapshotForReconcile("abc123", "worker-0") snap.DeletionTimestamp = &now content := &nvidiacomv1alpha1.SnapshotContent{ - ObjectMeta: metav1.ObjectMeta{Name: "snapshotcontent-abc123", Finalizers: []string{snapshotFinalizer}}, + ObjectMeta: metav1.ObjectMeta{Name: "snapshotcontent-abc123"}, Spec: nvidiacomv1alpha1.SnapshotContentSpec{ SnapshotRef: nvidiacomv1alpha1.SnapshotReference{Namespace: "inference", Name: snap.Name}, Source: nvidiacomv1alpha1.SnapshotContentSource{PodRef: nvidiacomv1alpha1.PodReference{Name: "worker-0"}, NodeName: "node-a"}, @@ -221,14 +221,12 @@ func TestSnapshotReconciler_CascadeDelete(t *testing.T) { } r := makeSnapshotReconciler(s, snap, content) - // First pass deletes the content and clears its finalizer; it requeues. - res := reconcileSnapshot(t, r, snap.Name) - assert.Positive(t, res.RequeueAfter) + // The content carries no finalizer, so it is deleted immediately; one pass deletes + // the content and, once confirmed gone, drops the Snapshot finalizer. 
+ reconcileSnapshot(t, r, snap.Name) err := r.Get(context.Background(), types.NamespacedName{Name: "snapshotcontent-abc123"}, &nvidiacomv1alpha1.SnapshotContent{}) assert.True(t, apierrors.IsNotFound(err)) - // Second pass drops the Snapshot finalizer now that the content is gone. - reconcileSnapshot(t, r, snap.Name) gone := &nvidiacomv1alpha1.Snapshot{} err = r.Get(context.Background(), types.NamespacedName{Namespace: "inference", Name: snap.Name}, gone) if err == nil { From 927d989df10d794e1e2cb96eed3c2ee43778a1c6 Mon Sep 17 00:00:00 2001 From: Ron Kahn Date: Mon, 15 Jun 2026 23:09:15 +0300 Subject: [PATCH 07/14] refactor(snapshot): drop manager, capture from pod annotations Signed-off-by: Ron Kahn --- .../controller/snapshot_reconciler.go | 10 +- .../controller/snapshot_reconciler_test.go | 4 +- deploy/snapshot/cmd/agent/main.go | 34 +- deploy/snapshot/go.mod | 3 - deploy/snapshot/go.sum | 10 - .../internal/controller/controller.go | 84 ++++- .../internal/controller/controller_test.go | 6 + .../snapshot/internal/controller/manager.go | 74 ---- .../internal/controller/nodecheckpointer.go | 124 ------- .../internal/controller/snapshotcontent.go | 350 ++++++++++++++++++ .../controller/snapshotcontent_reconciler.go | 300 --------------- .../snapshotcontent_reconciler_test.go | 238 ------------ .../controller/snapshotcontent_test.go | 348 +++++++++++++++++ deploy/snapshot/internal/controller/util.go | 28 +- 14 files changed, 803 insertions(+), 810 deletions(-) delete mode 100644 deploy/snapshot/internal/controller/manager.go delete mode 100644 deploy/snapshot/internal/controller/nodecheckpointer.go create mode 100644 deploy/snapshot/internal/controller/snapshotcontent.go delete mode 100644 deploy/snapshot/internal/controller/snapshotcontent_reconciler.go delete mode 100644 deploy/snapshot/internal/controller/snapshotcontent_reconciler_test.go create mode 100644 deploy/snapshot/internal/controller/snapshotcontent_test.go diff --git 
a/deploy/operator/internal/controller/snapshot_reconciler.go b/deploy/operator/internal/controller/snapshot_reconciler.go index 22f4b0e7e515..973fe0b1f64f 100644 --- a/deploy/operator/internal/controller/snapshot_reconciler.go +++ b/deploy/operator/internal/controller/snapshot_reconciler.go @@ -154,9 +154,9 @@ func validateSourcePod(pod *corev1.Pod) error { return nil } -// ensureSnapshotContent returns the existing SnapshotContent or, when absent, creates it -// via a single Server-Side Apply carrying source, the node mirror label, and storage-coord -// metadata. The returned object is the source of truth for the reschedule guard. +// ensureSnapshotContent returns the existing SnapshotContent or, when absent, creates the +// trigger via a single Server-Side Apply carrying the source ref and the node mirror label. +// The returned object is the source of truth for the reschedule guard. func (sr *SnapshotReconciler) ensureSnapshotContent(ctx context.Context, snap *nvidiacomv1alpha1.Snapshot, contentName string, pod *corev1.Pod) (*nvidiacomv1alpha1.SnapshotContent, error) { existing := &nvidiacomv1alpha1.SnapshotContent{} if err := sr.Get(ctx, client.ObjectKey{Name: contentName}, existing); err == nil { @@ -185,10 +185,6 @@ func (sr *SnapshotReconciler) buildSnapshotContent(snap *nvidiacomv1alpha1.Snaps Name: contentName, Labels: map[string]string{ snapshotprotocol.SnapshotNodeLabel: pod.Spec.NodeName, - snapshotprotocol.CheckpointIDLabel: snap.Spec.CheckpointID, - }, - Annotations: map[string]string{ - snapshotprotocol.CheckpointArtifactVersionAnnotation: snapshotprotocol.ArtifactVersion(snap.Annotations[snapshotprotocol.CheckpointArtifactVersionAnnotation]), }, }, Spec: nvidiacomv1alpha1.SnapshotContentSpec{ diff --git a/deploy/operator/internal/controller/snapshot_reconciler_test.go b/deploy/operator/internal/controller/snapshot_reconciler_test.go index 0145cbaf77a2..f4e2f06429c6 100644 --- a/deploy/operator/internal/controller/snapshot_reconciler_test.go +++ 
b/deploy/operator/internal/controller/snapshot_reconciler_test.go @@ -114,8 +114,8 @@ func TestSnapshotReconciler_BuildsWorkOrderAndBinds(t *testing.T) { assert.Equal(t, types.UID("pod-uid-9"), content.Spec.Source.PodRef.UID) assert.Equal(t, "node-a", content.Spec.Source.NodeName) assert.Equal(t, "node-a", content.Labels[snapshotprotocol.SnapshotNodeLabel]) - assert.Equal(t, "abc123", content.Labels[snapshotprotocol.CheckpointIDLabel]) - assert.Equal(t, "3", content.Annotations[snapshotprotocol.CheckpointArtifactVersionAnnotation]) + assert.NotContains(t, content.Labels, snapshotprotocol.CheckpointIDLabel) + assert.NotContains(t, content.Annotations, snapshotprotocol.CheckpointArtifactVersionAnnotation) assert.Empty(t, content.Finalizers) assert.Equal(t, "inference", content.Spec.SnapshotRef.Namespace) assert.Equal(t, snap.Name, content.Spec.SnapshotRef.Name) diff --git a/deploy/snapshot/cmd/agent/main.go b/deploy/snapshot/cmd/agent/main.go index 0a3c77f73f7f..09d286cc840d 100644 --- a/deploy/snapshot/cmd/agent/main.go +++ b/deploy/snapshot/cmd/agent/main.go @@ -47,13 +47,11 @@ func main() { } }() - // rootCtx is cancelled on signal. The restore informer's lifetime is bound to - // informerCtx, which is only cancelled after the manager's Start returns, so the - // restore path keeps running until the capture manager has fully shut down. + // rootCtx is cancelled on signal. The single node controller drives both the + // restore (pod informer) and capture (SnapshotContent informer) paths and shuts + // down when rootCtx is cancelled. rootCtx, cancel := context.WithCancel(context.Background()) defer cancel() - informerCtx, stopInformer := context.WithCancel(context.Background()) - defer stopInformer() sigChan := make(chan os.Signal, 1) signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) @@ -69,33 +67,13 @@ func main() { "runtime", *runtimeType, ) - // Restore path: the existing node-local client-go controller. 
+ // The node controller handles both restore and capture paths. nodeController, err := controller.NewNodeController(cfg, rt, rootLog.WithName("controller")) if err != nil { fatal(agentLog, err, "Failed to create snapshot node controller") } - restoreDone := make(chan error, 1) - go func() { - agentLog.Info("Snapshot restore controller started") - restoreDone <- nodeController.Run(informerCtx) - }() - - // Capture path: the per-node SnapshotContent controller-runtime manager. - mgr, err := controller.NewSnapshotContentManager(cfg, rt) - if err != nil { - fatal(agentLog, err, "Failed to create snapshot-content manager") - } - - agentLog.Info("Starting snapshot-content manager") - startErr := mgr.Start(rootCtx) - - // Manager has returned; now tear down the restore informer. - stopInformer() - if restoreErr := <-restoreDone; restoreErr != nil { - agentLog.Error(restoreErr, "Snapshot restore controller exited with error") - } - if startErr != nil { - fatal(agentLog, startErr, "Snapshot-content manager exited with error") + if runErr := nodeController.Run(rootCtx); runErr != nil { + fatal(agentLog, runErr, "Snapshot node controller exited with error") } agentLog.Info("Agent stopped") diff --git a/deploy/snapshot/go.mod b/deploy/snapshot/go.mod index a4fa664d78c8..2b5a117b265e 100644 --- a/deploy/snapshot/go.mod +++ b/deploy/snapshot/go.mod @@ -54,7 +54,6 @@ require ( github.com/emicklei/go-restful/v3 v3.13.0 // indirect github.com/evanphx/json-patch/v5 v5.9.11 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect - github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/fxamacker/cbor/v2 v2.9.0 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-openapi/jsonpointer v0.21.2 // indirect @@ -62,7 +61,6 @@ require ( github.com/go-openapi/swag v0.23.1 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect - github.com/google/btree v1.1.3 // indirect 
github.com/google/gnostic-models v0.7.0 // indirect github.com/google/go-cmp v0.7.0 // indirect github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 // indirect @@ -110,7 +108,6 @@ require ( golang.org/x/term v0.40.0 // indirect golang.org/x/text v0.34.0 // indirect golang.org/x/time v0.13.0 // indirect - gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect google.golang.org/genproto v0.0.0-20231211222908-989df2bf70f3 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 // indirect diff --git a/deploy/snapshot/go.sum b/deploy/snapshot/go.sum index 847f410cedfe..0a89b9b5bdf2 100644 --- a/deploy/snapshot/go.sum +++ b/deploy/snapshot/go.sum @@ -60,14 +60,10 @@ github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymF github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= -github.com/evanphx/json-patch v5.7.0+incompatible h1:vgGkfT/9f8zE6tvSCe74nfpAVDQ2tG6yudJd8LBksgI= -github.com/evanphx/json-patch v5.7.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= -github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= -github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/fxamacker/cbor/v2 v2.9.0 
h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= @@ -104,8 +100,6 @@ github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= -github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= -github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= github.com/google/gnostic-models v0.7.0 h1:qwTtogB15McXDaNqTZdzPJRHvaVJlAl+HVQnLmJEJxo= github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= @@ -118,8 +112,6 @@ github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= -github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/pprof v0.0.0-20250923004556-9e5a51aed1e8 h1:ZI8gCoCjGzPsum4L21jHdQs8shFBIQih1TM9Rd/c+EQ= github.com/google/pprof v0.0.0-20250923004556-9e5a51aed1e8/go.mod h1:I6V7YzU0XDpsHqbsyrghnFZLO1gwK6NPTNvmetQIk9U= github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= @@ -310,8 +302,6 @@ golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8T golang.org/x/xerrors 
v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw= -gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= diff --git a/deploy/snapshot/internal/controller/controller.go b/deploy/snapshot/internal/controller/controller.go index 80eee39a39fc..376edcb2ca9a 100644 --- a/deploy/snapshot/internal/controller/controller.go +++ b/deploy/snapshot/internal/controller/controller.go @@ -20,11 +20,18 @@ import ( apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/client-go/dynamic" + "k8s.io/client-go/dynamic/dynamicinformer" "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" "k8s.io/client-go/tools/cache" + "sigs.k8s.io/controller-runtime/pkg/client" + nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/executor" snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types" @@ -32,13 +39,19 @@ import ( ) // NodeController watches local-node pods with checkpoint metadata and reconciles -// snapshot execution for checkpoint and restore requests. 
+// snapshot execution for checkpoint and restore requests. The restore path is +// driven by a client-go pod informer; the capture path is driven by a dynamic +// informer over SnapshotContent work orders filtered to this node, with typed +// reads/writes via an uncached controller-runtime client. type NodeController struct { - config *types.AgentConfig - clientset kubernetes.Interface - runtime snapshotruntime.Runtime - log logr.Logger - holderID string + config *types.AgentConfig + clientset kubernetes.Interface + client client.Client + dynClient dynamic.Interface + runtime snapshotruntime.Runtime + log logr.Logger + holderID string + checkpointFn func(ctx context.Context, params CheckpointParams) error inFlight map[string]struct{} inFlightMu sync.Mutex @@ -55,8 +68,15 @@ const ( containerResolveAttemptTimeout = 1 * time.Second restoreContainerResolveInterval = 50 * time.Millisecond restoreContainerResolveTimeout = 30 * time.Second + + // snapshotContentResyncInterval re-drives every SnapshotContent work order so a + // not-yet-Ready source pod is re-checked for quiesce without a busy loop. + snapshotContentResyncInterval = 10 * time.Second ) +// snapshotContentGVR is the cluster-scoped resource the capture informer watches. +var snapshotContentGVR = nvidiacomv1alpha1.GroupVersion.WithResource("snapshotcontents") + // NewNodeController creates the node-local controller that runs inside snapshot-agent. 
func NewNodeController( cfg *types.AgentConfig, @@ -73,15 +93,33 @@ func NewNodeController( return nil, fmt.Errorf("failed to create kubernetes client: %w", err) } - return &NodeController{ + scheme := runtime.NewScheme() + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(nvidiacomv1alpha1.AddToScheme(scheme)) + + typedClient, err := client.New(restConfig, client.Options{Scheme: scheme}) + if err != nil { + return nil, fmt.Errorf("failed to create typed client: %w", err) + } + + dynClient, err := dynamic.NewForConfig(restConfig) + if err != nil { + return nil, fmt.Errorf("failed to create dynamic client: %w", err) + } + + w := &NodeController{ config: cfg, clientset: clientset, + client: typedClient, + dynClient: dynClient, runtime: rt, log: log, holderID: "snapshot-agent/" + uuid.NewString(), inFlight: make(map[string]struct{}), stopCh: make(chan struct{}), - }, nil + } + w.checkpointFn = w.executorCheckpoint + return w, nil } // Run starts the local pod informers and processes checkpoint/restore events. @@ -102,8 +140,6 @@ func (w *NodeController) Run(ctx context.Context) error { var syncFuncs []cache.InformerSynced - // Capture is driven by the SnapshotContent controller-runtime reconciler; this - // client-go controller only handles the restore path. // Restore pods carry a checkpoint ID but are not checkpoint sources. restoreSel, err := labels.Parse(snapshotprotocol.CheckpointIDLabel + ",!" + snapshotprotocol.CheckpointSourceLabel) if err != nil { @@ -143,6 +179,34 @@ func (w *NodeController) Run(ctx context.Context) error { go restoreFactory.Start(w.stopCh) syncFuncs = append(syncFuncs, restoreInformer.HasSynced) + // Capture path: a dynamic informer over SnapshotContent work orders, filtered at + // the list/watch level to this node's mirror label. The node-label filter is the + // node scoping; reconcileSnapshotContent keeps a defensive nodeName check. 
+ nodeContentSelector := labels.SelectorFromSet(labels.Set{snapshotprotocol.SnapshotNodeLabel: w.config.NodeName}).String() + dynFactory := dynamicinformer.NewFilteredDynamicSharedInformerFactory( + w.dynClient, snapshotContentResyncInterval, metav1.NamespaceAll, + func(opts *metav1.ListOptions) { + opts.LabelSelector = nodeContentSelector + }, + ) + contentInformer := dynFactory.ForResource(snapshotContentGVR).Informer() + if _, err := contentInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: func(obj interface{}) { + if name, ok := contentNameFromInformerObj(obj); ok { + w.reconcileSnapshotContent(ctx, name) + } + }, + UpdateFunc: func(_, newObj interface{}) { + if name, ok := contentNameFromInformerObj(newObj); ok { + w.reconcileSnapshotContent(ctx, name) + } + }, + }); err != nil { + return fmt.Errorf("failed to add snapshot-content informer handler: %w", err) + } + go dynFactory.Start(w.stopCh) + syncFuncs = append(syncFuncs, contentInformer.HasSynced) + if !cache.WaitForCacheSync(w.stopCh, syncFuncs...) { return fmt.Errorf("failed to sync informer caches") } diff --git a/deploy/snapshot/internal/controller/controller_test.go b/deploy/snapshot/internal/controller/controller_test.go index d955b18e77c9..0ac4a0e6a0ae 100644 --- a/deploy/snapshot/internal/controller/controller_test.go +++ b/deploy/snapshot/internal/controller/controller_test.go @@ -30,12 +30,18 @@ const testContainerID = "test-container" type fakeRuntime struct { containerIDByPod string resolvedContainerIDs []string + // resolveContainerPID, when set, is returned by ResolveContainer with no error so the + // capture path can advance past container resolution. 
+ resolveContainerPID int } var _ snapshotruntime.Runtime = (*fakeRuntime)(nil) func (r *fakeRuntime) ResolveContainer(ctx context.Context, id string) (int, *specs.Spec, error) { r.resolvedContainerIDs = append(r.resolvedContainerIDs, id) + if r.resolveContainerPID > 0 { + return r.resolveContainerPID, nil, nil + } return 0, nil, errors.New("not implemented") } func (r *fakeRuntime) ResolveContainerIDByPod(ctx context.Context, pod, ns, ctr string) (string, error) { diff --git a/deploy/snapshot/internal/controller/manager.go b/deploy/snapshot/internal/controller/manager.go deleted file mode 100644 index ec71bb89d45e..000000000000 --- a/deploy/snapshot/internal/controller/manager.go +++ /dev/null @@ -1,74 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -package controller - -import ( - "fmt" - - "github.com/google/uuid" - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/fields" - "k8s.io/apimachinery/pkg/labels" - "k8s.io/apimachinery/pkg/runtime" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - "k8s.io/client-go/kubernetes" - clientgoscheme "k8s.io/client-go/kubernetes/scheme" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/cache" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/metrics/server" - - nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" - snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime" - "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types" - snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol" -) - -// NewSnapshotContentManager builds the per-node controller-runtime Manager that drives -// checkpoint capture. 
Its cache is scoped to this node — SnapshotContent by the -// nvidia.com/snapshot-node mirror label, pods by their spec.nodeName field (so the agent -// does not open a second cluster-wide pod watch). Leader election is off, and the -// SnapshotContent reconciler is registered with the production executor-backed driver. -func NewSnapshotContentManager(cfg *types.AgentConfig, rt snapshotruntime.Runtime) (ctrl.Manager, error) { - scheme := runtime.NewScheme() - utilruntime.Must(clientgoscheme.AddToScheme(scheme)) - utilruntime.Must(nvidiacomv1alpha1.AddToScheme(scheme)) - - nodeContentSelector := labels.SelectorFromSet(labels.Set{snapshotprotocol.SnapshotNodeLabel: cfg.NodeName}) - nodePodSelector := fields.OneTermEqualSelector("spec.nodeName", cfg.NodeName) - - restConfig := ctrl.GetConfigOrDie() - mgr, err := ctrl.NewManager(restConfig, ctrl.Options{ - Scheme: scheme, - LeaderElection: false, - Metrics: server.Options{BindAddress: "0"}, - Cache: cache.Options{ - ByObject: map[client.Object]cache.ByObject{ - &nvidiacomv1alpha1.SnapshotContent{}: {Label: nodeContentSelector}, - &corev1.Pod{}: {Field: nodePodSelector}, - }, - }, - }) - if err != nil { - return nil, fmt.Errorf("create snapshot-content manager: %w", err) - } - - clientset, err := kubernetes.NewForConfig(restConfig) - if err != nil { - return nil, fmt.Errorf("create kubernetes client for lease coordination: %w", err) - } - - reconciler := &SnapshotContentReconciler{ - Client: mgr.GetClient(), - Clientset: clientset, - Config: cfg, - NodeName: cfg.NodeName, - HolderID: "snapshot-agent/" + uuid.NewString(), - Checkpointer: newExecutorCheckpointer(clientset, rt, cfg, cfg.NodeName), - } - if err := reconciler.SetupWithManager(mgr); err != nil { - return nil, fmt.Errorf("set up SnapshotContent reconciler: %w", err) - } - return mgr, nil -} diff --git a/deploy/snapshot/internal/controller/nodecheckpointer.go b/deploy/snapshot/internal/controller/nodecheckpointer.go deleted file mode 100644 index 
b4aa88c2a180..000000000000 --- a/deploy/snapshot/internal/controller/nodecheckpointer.go +++ /dev/null @@ -1,124 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -package controller - -import ( - "context" - "fmt" - "os" - "syscall" - "time" - - "github.com/go-logr/logr" - corev1 "k8s.io/api/core/v1" - "k8s.io/client-go/kubernetes" - - "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/executor" - snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime" - "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types" - snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol" -) - -// CheckpointParams carries everything the node driver needs to dump one container. -type CheckpointParams struct { - // Pod is the live source pod (already provenance-verified by the reconciler). - Pod *corev1.Pod - // ContainerName is the single target container to checkpoint. - ContainerName string - // CheckpointID is the stable artifact identity. - CheckpointID string - // HostPath is the agent-resolved destination directory for the dump. - HostPath string - // ContainerPath is the destination as seen inside the workload container's mount - // namespace (equal to HostPath under agentMount storage). - ContainerPath string - // StartedAt marks when the reconciler observed the work order, for timing. - StartedAt time.Time -} - -// NodeCheckpointer performs the CRIU dump for a single SnapshotContent work order. The -// concrete implementation wraps executor.Checkpoint; unit tests substitute a fake. -type NodeCheckpointer interface { - // Checkpoint runs the dump and verifies the produced artifact. It returns an error - // on any failure; on success the artifact exists at params.HostPath. - Checkpoint(ctx context.Context, params CheckpointParams) error -} - -// executorCheckpointer is the production NodeCheckpointer backed by executor.Checkpoint. 
-type executorCheckpointer struct { - clientset kubernetes.Interface - runtime snapshotruntime.Runtime - config *types.AgentConfig - nodeName string -} - -// newExecutorCheckpointer builds the production node checkpointer. -func newExecutorCheckpointer(clientset kubernetes.Interface, rt snapshotruntime.Runtime, cfg *types.AgentConfig, nodeName string) *executorCheckpointer { - return &executorCheckpointer{clientset: clientset, runtime: rt, config: cfg, nodeName: nodeName} -} - -// Checkpoint resolves the target container, runs executor.Checkpoint to the destination, -// verifies the artifact directory, and writes the snapshot-complete sentinel. On dump or -// verification failure it SIGKILLs the CUDA-locked process before returning the error. -func (ec *executorCheckpointer) Checkpoint(ctx context.Context, params CheckpointParams) error { - log := logr.FromContextOrDiscard(ctx) - - containerID := containerIDForName(params.Pod, params.ContainerName) - if containerID == "" { - return fmt.Errorf("could not resolve container %q ID", params.ContainerName) - } - - containerPID, _, err := ec.runtime.ResolveContainer(ctx, containerID) - if err != nil { - return fmt.Errorf("resolve container %q: %w", params.ContainerName, err) - } - - req := executor.CheckpointRequest{ - ContainerID: containerID, - ContainerName: params.ContainerName, - CheckpointID: params.CheckpointID, - CheckpointLocation: params.HostPath, - StartedAt: params.StartedAt, - NodeName: ec.nodeName, - PodName: params.Pod.Name, - PodNamespace: params.Pod.Namespace, - Clientset: ec.clientset, - } - if err := executor.Checkpoint(ctx, ec.runtime, log, req, ec.config); err != nil { - ec.kill(log, containerPID, "checkpoint failed") - return fmt.Errorf("checkpoint: %w", err) - } - - info, statErr := os.Stat(params.HostPath) - if statErr != nil || !info.IsDir() { - ec.kill(log, containerPID, "checkpoint verification failed") - if statErr != nil { - return fmt.Errorf("verify checkpoint path %s: %w", params.HostPath, 
statErr) - } - return fmt.Errorf("verify checkpoint path %s: not a directory", params.HostPath) - } - - if err := snapshotruntime.WriteControlSentinel(containerPID, snapshotprotocol.SnapshotCompleteFile); err != nil { - ec.kill(log, containerPID, "checkpoint sentinel failed") - return fmt.Errorf("write snapshot-complete sentinel: %w", err) - } - return nil -} - -// kill signals the CUDA-locked process so it does not hang after a failed dump. -func (ec *executorCheckpointer) kill(log logr.Logger, pid int, reason string) { - if err := snapshotruntime.SendSignalToPID(log, pid, syscall.SIGKILL, reason); err != nil { - log.Error(err, "Failed to signal checkpoint process", "reason", reason) - } -} - -// containerIDForName returns the running container's CRI-stripped ID, or "" if absent. -func containerIDForName(pod *corev1.Pod, containerName string) string { - for _, cs := range pod.Status.ContainerStatuses { - if cs.Name == containerName { - return snapshotruntime.StripCRIScheme(cs.ContainerID) - } - } - return "" -} diff --git a/deploy/snapshot/internal/controller/snapshotcontent.go b/deploy/snapshot/internal/controller/snapshotcontent.go new file mode 100644 index 000000000000..91c72f30c567 --- /dev/null +++ b/deploy/snapshot/internal/controller/snapshotcontent.go @@ -0,0 +1,350 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +package controller + +import ( + "context" + "fmt" + "os" + "strings" + "syscall" + "time" + + "github.com/go-logr/logr" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/tools/cache" + "sigs.k8s.io/controller-runtime/pkg/client" + + nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" + "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/executor" + snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime" + snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol" +) + +// CheckpointParams carries everything the node driver needs to dump one container. +type CheckpointParams struct { + // Pod is the live source pod (already provenance-verified by the reconciler). + Pod *corev1.Pod + // ContainerName is the single target container to checkpoint. + ContainerName string + // ContainerID is the agent-resolved running container ID (CRI scheme stripped). + ContainerID string + // ContainerPID is the agent-resolved host PID of the running container. + ContainerPID int + // CheckpointID is the stable artifact identity. + CheckpointID string + // HostPath is the agent-resolved destination directory for the dump. + HostPath string + // ContainerPath is the destination as seen inside the workload container's mount + // namespace (equal to HostPath under agentMount storage). + ContainerPath string + // StartedAt marks when the controller observed the work order, for timing. + StartedAt time.Time +} + +// reconcileSnapshotContent drives one SnapshotContent work order through provenance +// checks, quiesce, dump, and the terminal status write. Capture parameters come from the +// source pod's labels/annotations, never from SnapshotContent metadata. It never mutates +// spec and writes status via Status().Patch only. 
There is no requeue mechanism here: a +// not-yet-Ready source pod is re-driven by the 10s SnapshotContent resync and pod events. +func (w *NodeController) reconcileSnapshotContent(ctx context.Context, name string) { + logger := w.log.WithValues("content", name) + + content := &nvidiacomv1alpha1.SnapshotContent{} + if err := w.client.Get(ctx, client.ObjectKey{Name: name}, content); err != nil { + if apierrors.IsNotFound(err) { + return + } + logger.Error(err, "Failed to get SnapshotContent") + return + } + + // Defense in depth: the informer label filter already scopes to this node. + if content.Spec.Source.NodeName != w.config.NodeName { + return + } + + // Idempotency: terminal status means the work is done. + if isContentTerminal(content) { + return + } + + key := content.Name + if !w.tryAcquire(key) { + return + } + releaseInFlight := true + defer func() { + if releaseInFlight { + w.release(key) + } + }() + + pod, ok := w.resolveSourcePod(ctx, content) + if !ok { + return + } + + // Capture parameters come from the source pod. The checkpoint ID is the pod label, and it + // must agree with the ID embedded in the work order name (snapshotcontent-). 
+ id := strings.TrimSpace(pod.Labels[snapshotprotocol.CheckpointIDLabel]) + if id == "" { + w.writeFailed(ctx, content, "MissingCheckpointID", + fmt.Errorf("source pod %q missing %s label", pod.Name, snapshotprotocol.CheckpointIDLabel)) + return + } + expected := strings.TrimPrefix(content.Name, "snapshotcontent-") + if id != expected { + w.writeFailed(ctx, content, "CheckpointIDMismatch", + fmt.Errorf("source pod checkpoint id %q does not match work order id %q", id, expected)) + return + } + + containerName, err := snapshotprotocol.TargetContainersFromAnnotations(pod.Annotations, 1, 1) + if err != nil { + w.writeFailed(ctx, content, "MissingTargetContainer", err) + return + } + if !isContainerReady(pod, containerName[0]) { + logger.V(1).Info("Source container not ready, awaiting quiesce", "pod", pod.Name, "container", containerName[0]) + return + } + + // Resolve the running container ID and host PID, then compute the destination from the + // pod's storage annotations. + containerID := containerIDForName(pod, containerName[0]) + if containerID == "" { + w.writeFailed(ctx, content, "ContainerNotResolved", + fmt.Errorf("could not resolve container %q ID", containerName[0])) + return + } + containerPID, _, err := w.runtime.ResolveContainer(ctx, containerID) + if err != nil { + w.writeFailed(ctx, content, "ContainerNotResolved", fmt.Errorf("resolve container %q: %w", containerName[0], err)) + return + } + loc, err := w.checkpointLocationsFromPod(pod, id, containerPID) + if err != nil { + w.writeFailed(ctx, content, "InvalidDestination", err) + return + } + if err := w.validatePodMountContainerPID(ctx, containerID, containerPID); err != nil { + w.writeFailed(ctx, content, "ContainerChanged", err) + return + } + + // Resume: a present artifact with unwritten status means a prior dump finished but the + // status write did not. The artifact dir exists only after the executor's atomic rename, + // so its presence means a completed dump. 
+ if artifactPresent(loc.HostPath) { + w.writeReady(ctx, content) + return + } + + leaseKey := client.ObjectKey{Namespace: content.Spec.SnapshotRef.Namespace, Name: content.Name} + acquired, err := w.acquireLease(ctx, leaseKey) + if err != nil { + logger.Error(err, "Failed to acquire checkpoint lease", "lease", leaseKey.String()) + return + } + if !acquired { + return + } + + releaseInFlight = false + go w.runCheckpoint(ctx, content, pod, containerName[0], containerID, containerPID, id, loc, leaseKey, key) +} + +// runCheckpoint executes the dump under a renewed lease and writes the terminal status. +// The container ID, host PID, and resolved locations are pre-resolved by the reconciler so +// the dump does not re-resolve them. +func (w *NodeController) runCheckpoint( + ctx context.Context, + content *nvidiacomv1alpha1.SnapshotContent, + pod *corev1.Pod, + containerName, containerID string, + containerPID int, + checkpointID string, + loc checkpointLocations, + leaseKey client.ObjectKey, + inFlightKey string, +) { + logger := w.log.WithValues("content", content.Name) + defer w.release(inFlightKey) + + leaseCtx, stopLease := context.WithCancel(ctx) + defer stopLease() + go w.renewLease(leaseCtx, leaseKey) + defer func() { + releaseCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if err := w.releaseLease(releaseCtx, leaseKey); err != nil { + logger.Error(err, "Failed to release checkpoint lease", "lease", leaseKey.String()) + } + }() + + params := CheckpointParams{ + Pod: pod, + ContainerName: containerName, + ContainerID: containerID, + ContainerPID: containerPID, + CheckpointID: checkpointID, + HostPath: loc.HostPath, + ContainerPath: loc.ContainerPath, + StartedAt: time.Now(), + } + if err := w.checkpointFn(leaseCtx, params); err != nil { + logger.Error(err, "Checkpoint failed") + w.writeFailed(ctx, content, "CheckpointFailed", err) + return + } + + w.writeReady(ctx, content) +} + +// resolveSourcePod loads the source pod 
and enforces UID provenance and pod liveness. +// It returns (nil, false) when the caller should stop (status already written or backoff). +func (w *NodeController) resolveSourcePod(ctx context.Context, content *nvidiacomv1alpha1.SnapshotContent) (*corev1.Pod, bool) { + pod := &corev1.Pod{} + key := client.ObjectKey{Namespace: content.Spec.SnapshotRef.Namespace, Name: content.Spec.Source.PodRef.Name} + if err := w.client.Get(ctx, key, pod); err != nil { + if apierrors.IsNotFound(err) { + // Pod not yet observed; the resync re-drives this work order. + return nil, false + } + w.log.Error(err, "Failed to get source pod", "content", content.Name, "pod", key.String()) + return nil, false + } + if content.Spec.Source.PodRef.UID != "" && pod.UID != content.Spec.Source.PodRef.UID { + w.writeFailed(ctx, content, "StalePodReference", + fmt.Errorf("source pod %q UID %q does not match work order UID %q", pod.Name, pod.UID, content.Spec.Source.PodRef.UID)) + return nil, false + } + if pod.DeletionTimestamp != nil || pod.Status.Phase == corev1.PodFailed || pod.Status.Phase == corev1.PodSucceeded { + w.writeFailed(ctx, content, "SourcePodGone", + fmt.Errorf("source pod %q is no longer running (phase %s)", pod.Name, pod.Status.Phase)) + return nil, false + } + return pod, true +} + +// writeReady patches status with the Ready condition. +func (w *NodeController) writeReady(ctx context.Context, content *nvidiacomv1alpha1.SnapshotContent) { + patch := client.MergeFrom(content.DeepCopy()) + meta.SetStatusCondition(&content.Status.Conditions, metav1.Condition{ + Type: nvidiacomv1alpha1.SnapshotConditionReady, + Status: metav1.ConditionTrue, + Reason: "Captured", + Message: "Checkpoint captured and verified", + }) + if err := w.client.Status().Patch(ctx, content, patch); err != nil { + w.log.Error(err, "Failed to write SnapshotContent ready status", "content", content.Name) + } +} + +// writeFailed patches status with the Failed condition. 
+func (w *NodeController) writeFailed(ctx context.Context, content *nvidiacomv1alpha1.SnapshotContent, reason string, cause error) { + patch := client.MergeFrom(content.DeepCopy()) + meta.SetStatusCondition(&content.Status.Conditions, metav1.Condition{ + Type: nvidiacomv1alpha1.SnapshotConditionFailed, + Status: metav1.ConditionTrue, + Reason: reason, + Message: cause.Error(), + }) + if err := w.client.Status().Patch(ctx, content, patch); err != nil { + w.log.Error(err, "Failed to write SnapshotContent failed status", "content", content.Name, "reason", reason) + } +} + +// executorCheckpoint is the production checkpointFn. The reconciler has already resolved the +// container ID and host PID. It runs executor.Checkpoint to the destination, verifies the +// artifact directory, and writes the snapshot-complete sentinel. On dump or verification +// failure it SIGKILLs the CUDA-locked process before returning the error. +func (w *NodeController) executorCheckpoint(ctx context.Context, params CheckpointParams) error { + log := logr.FromContextOrDiscard(ctx) + + req := executor.CheckpointRequest{ + ContainerID: params.ContainerID, + ContainerName: params.ContainerName, + CheckpointID: params.CheckpointID, + CheckpointLocation: params.HostPath, + StartedAt: params.StartedAt, + NodeName: w.config.NodeName, + PodName: params.Pod.Name, + PodNamespace: params.Pod.Namespace, + Clientset: w.clientset, + } + if err := executor.Checkpoint(ctx, w.runtime, log, req, w.config); err != nil { + w.killCheckpointProcess(log, params.ContainerPID, "checkpoint failed") + return fmt.Errorf("checkpoint: %w", err) + } + + info, statErr := os.Stat(params.HostPath) + if statErr != nil || !info.IsDir() { + w.killCheckpointProcess(log, params.ContainerPID, "checkpoint verification failed") + if statErr != nil { + return fmt.Errorf("verify checkpoint path %s: %w", params.HostPath, statErr) + } + return fmt.Errorf("verify checkpoint path %s: not a directory", params.HostPath) + } + + if err := 
snapshotruntime.WriteControlSentinel(params.ContainerPID, snapshotprotocol.SnapshotCompleteFile); err != nil { + w.killCheckpointProcess(log, params.ContainerPID, "checkpoint sentinel failed") + return fmt.Errorf("write snapshot-complete sentinel: %w", err) + } + return nil +} + +// killCheckpointProcess signals the CUDA-locked process so it does not hang after a failed dump. +func (w *NodeController) killCheckpointProcess(log logr.Logger, pid int, reason string) { + if err := snapshotruntime.SendSignalToPID(log, pid, syscall.SIGKILL, reason); err != nil { + log.Error(err, "Failed to signal checkpoint process", "reason", reason) + } +} + +// containerIDForName returns the running container's CRI-stripped ID, or "" if absent. +func containerIDForName(pod *corev1.Pod, containerName string) string { + for _, cs := range pod.Status.ContainerStatuses { + if cs.Name == containerName { + return snapshotruntime.StripCRIScheme(cs.ContainerID) + } + } + return "" +} + +// isContentTerminal reports whether the work order already has a terminal condition. +func isContentTerminal(content *nvidiacomv1alpha1.SnapshotContent) bool { + for _, t := range []string{nvidiacomv1alpha1.SnapshotConditionReady, nvidiacomv1alpha1.SnapshotConditionFailed} { + if cond := meta.FindStatusCondition(content.Status.Conditions, t); cond != nil && cond.Status == metav1.ConditionTrue { + return true + } + } + return false +} + +// artifactPresent reports whether a completed checkpoint directory already exists on disk. +func artifactPresent(destination string) bool { + info, err := os.Stat(destination) + return err == nil && info.IsDir() +} + +// contentNameFromInformerObj extracts the object name from a dynamic informer object, +// handling the DeletedFinalStateUnknown tombstone. 
+func contentNameFromInformerObj(obj interface{}) (string, bool) { + if accessor, err := meta.Accessor(obj); err == nil { + return accessor.GetName(), true + } + tombstone, ok := obj.(cache.DeletedFinalStateUnknown) + if !ok { + return "", false + } + accessor, err := meta.Accessor(tombstone.Obj) + if err != nil { + return "", false + } + return accessor.GetName(), true +} diff --git a/deploy/snapshot/internal/controller/snapshotcontent_reconciler.go b/deploy/snapshot/internal/controller/snapshotcontent_reconciler.go deleted file mode 100644 index 6314716a395a..000000000000 --- a/deploy/snapshot/internal/controller/snapshotcontent_reconciler.go +++ /dev/null @@ -1,300 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -package controller - -import ( - "context" - "fmt" - "os" - "path/filepath" - "strings" - "sync" - "time" - - corev1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/api/meta" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/log" - - nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" - "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types" - snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol" -) - -// quiesceRequeueInterval is how often the reconciler re-checks a not-yet-Ready source pod. -const quiesceRequeueInterval = 2 * time.Second - -// SnapshotContentReconciler is the per-node CSI-style driver. It picks up SnapshotContent -// work orders for its node, dumps the source container, and writes only -// SnapshotContent.status (snapshotHandle + Ready/Failed). It holds no finalizer. 
-type SnapshotContentReconciler struct { - client.Client - Clientset kubernetes.Interface - Config *types.AgentConfig - NodeName string - HolderID string - Checkpointer NodeCheckpointer - - inFlight map[string]struct{} - inFlightMu sync.Mutex -} - -// Reconcile drives one SnapshotContent through provenance checks, quiesce, dump, and the -// terminal status write. It never mutates spec and writes status via Status().Patch only. -func (scr *SnapshotContentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - logger := log.FromContext(ctx) - - content := &nvidiacomv1alpha1.SnapshotContent{} - if err := scr.Get(ctx, req.NamespacedName, content); err != nil { - return ctrl.Result{}, client.IgnoreNotFound(err) - } - - if content.Spec.Source.NodeName != scr.NodeName { - return ctrl.Result{}, nil - } - - // Idempotency: terminal status means the work is done. - if isContentTerminal(content) { - return ctrl.Result{}, nil - } - - checkpointID, version, err := storageCoordsFromContent(content) - if err != nil { - return scr.writeFailed(ctx, content, "MissingStorageCoords", err) - } - - destination, err := scr.resolveDestination(checkpointID, version) - if err != nil { - return scr.writeFailed(ctx, content, "InvalidDestination", err) - } - - // Resume: a present artifact with unwritten status means a prior dump finished but the - // status write did not. The artifact dir exists only after the executor's atomic rename, - // so its presence means a completed dump; read via the agent's mounted volume, never - // /host/proc//root (there is no live PID on resume). 
- if artifactPresent(destination) { - return scr.writeReady(ctx, content) - } - - key := req.NamespacedName.String() - if !scr.tryAcquire(key) { - return ctrl.Result{}, nil - } - releaseInFlight := true - defer func() { - if releaseInFlight { - scr.release(key) - } - }() - - pod, result, err := scr.resolveSourcePod(ctx, content) - if err != nil || pod == nil { - return result, err - } - - containerName, err := snapshotprotocol.TargetContainersFromAnnotations(pod.Annotations, 1, 1) - if err != nil { - return scr.writeFailed(ctx, content, "MissingTargetContainer", err) - } - if !isContainerReady(pod, containerName[0]) { - logger.V(1).Info("Source container not ready, requeueing to quiesce", "pod", pod.Name, "container", containerName[0]) - return ctrl.Result{RequeueAfter: quiesceRequeueInterval}, nil - } - - leaseKey := client.ObjectKey{Namespace: content.Spec.SnapshotRef.Namespace, Name: content.Name} - acquired, err := scr.acquireLease(ctx, leaseKey) - if err != nil { - return ctrl.Result{}, err - } - if !acquired { - return ctrl.Result{RequeueAfter: quiesceRequeueInterval}, nil - } - - releaseInFlight = false - go scr.runCheckpoint(ctx, content, pod, containerName[0], checkpointID, destination, leaseKey, key) - return ctrl.Result{}, nil -} - -// runCheckpoint executes the dump under a renewed lease and writes the terminal status. 
-func (scr *SnapshotContentReconciler) runCheckpoint( - ctx context.Context, - content *nvidiacomv1alpha1.SnapshotContent, - pod *corev1.Pod, - containerName, checkpointID, destination string, - leaseKey client.ObjectKey, - inFlightKey string, -) { - logger := log.FromContext(ctx) - defer scr.release(inFlightKey) - - leaseCtx, stopLease := context.WithCancel(ctx) - defer stopLease() - go scr.renewLease(leaseCtx, leaseKey) - defer func() { - releaseCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) - defer cancel() - if err := scr.releaseLease(releaseCtx, leaseKey); err != nil { - logger.Error(err, "Failed to release checkpoint lease", "lease", leaseKey.String()) - } - }() - - params := CheckpointParams{ - Pod: pod, - ContainerName: containerName, - CheckpointID: checkpointID, - HostPath: destination, - ContainerPath: destination, - StartedAt: time.Now(), - } - if err := scr.Checkpointer.Checkpoint(leaseCtx, params); err != nil { - logger.Error(err, "Checkpoint failed", "content", content.Name) - if _, werr := scr.writeFailed(ctx, content, "CheckpointFailed", err); werr != nil { - logger.Error(werr, "Failed to write SnapshotContent failed status", "content", content.Name) - } - return - } - - if _, err := scr.writeReady(ctx, content); err != nil { - logger.Error(err, "Failed to write SnapshotContent ready status", "content", content.Name) - } -} - -// resolveSourcePod loads the source pod and enforces UID provenance and pod liveness. -// It returns (nil, result, err) when the caller should return result/err instead of dumping. 
-func (scr *SnapshotContentReconciler) resolveSourcePod(ctx context.Context, content *nvidiacomv1alpha1.SnapshotContent) (*corev1.Pod, ctrl.Result, error) { - pod := &corev1.Pod{} - key := client.ObjectKey{Namespace: content.Spec.SnapshotRef.Namespace, Name: content.Spec.Source.PodRef.Name} - if err := scr.Get(ctx, key, pod); err != nil { - if apierrors.IsNotFound(err) { - return nil, ctrl.Result{RequeueAfter: quiesceRequeueInterval}, nil - } - return nil, ctrl.Result{}, err - } - if content.Spec.Source.PodRef.UID != "" && pod.UID != content.Spec.Source.PodRef.UID { - result, err := scr.writeFailed(ctx, content, "StalePodReference", - fmt.Errorf("source pod %q UID %q does not match work order UID %q", pod.Name, pod.UID, content.Spec.Source.PodRef.UID)) - return nil, result, err - } - if pod.DeletionTimestamp != nil || pod.Status.Phase == corev1.PodFailed || pod.Status.Phase == corev1.PodSucceeded { - result, err := scr.writeFailed(ctx, content, "SourcePodGone", - fmt.Errorf("source pod %q is no longer running (phase %s)", pod.Name, pod.Status.Phase)) - return nil, result, err - } - return pod, ctrl.Result{}, nil -} - -// resolveDestination computes the artifact directory on the agent's mounted volume. -func (scr *SnapshotContentReconciler) resolveDestination(checkpointID, version string) (string, error) { - resolved, err := snapshotprotocol.ResolveCheckpointStorage(checkpointID, version, snapshotprotocol.Storage{ - Type: scr.Config.Storage.Type, - BasePath: scr.Config.Storage.BasePath, - }) - if err != nil { - return "", err - } - location := resolved.Location - if !filepath.IsAbs(location) || filepath.Clean(location) != location { - return "", fmt.Errorf("checkpoint location must be an absolute, clean path: %q", location) - } - return location, nil -} - -// writeReady patches status with the Ready condition. 
-func (scr *SnapshotContentReconciler) writeReady(ctx context.Context, content *nvidiacomv1alpha1.SnapshotContent) (ctrl.Result, error) { - patch := client.MergeFrom(content.DeepCopy()) - meta.SetStatusCondition(&content.Status.Conditions, metav1.Condition{ - Type: nvidiacomv1alpha1.SnapshotConditionReady, - Status: metav1.ConditionTrue, - Reason: "Captured", - Message: "Checkpoint captured and verified", - }) - if err := scr.Status().Patch(ctx, content, patch); err != nil { - return ctrl.Result{}, fmt.Errorf("patch SnapshotContent ready status: %w", err) - } - return ctrl.Result{}, nil -} - -// writeFailed patches status with the Failed condition. -func (scr *SnapshotContentReconciler) writeFailed(ctx context.Context, content *nvidiacomv1alpha1.SnapshotContent, reason string, cause error) (ctrl.Result, error) { - patch := client.MergeFrom(content.DeepCopy()) - meta.SetStatusCondition(&content.Status.Conditions, metav1.Condition{ - Type: nvidiacomv1alpha1.SnapshotConditionFailed, - Status: metav1.ConditionTrue, - Reason: reason, - Message: cause.Error(), - }) - if err := scr.Status().Patch(ctx, content, patch); err != nil { - return ctrl.Result{}, fmt.Errorf("patch SnapshotContent failed status: %w", err) - } - return ctrl.Result{}, nil -} - -// tryAcquire claims the in-flight slot for a work order, returning false if already held. -func (scr *SnapshotContentReconciler) tryAcquire(key string) bool { - scr.inFlightMu.Lock() - defer scr.inFlightMu.Unlock() - if scr.inFlight == nil { - scr.inFlight = make(map[string]struct{}) - } - if _, held := scr.inFlight[key]; held { - return false - } - scr.inFlight[key] = struct{}{} - return true -} - -// release frees the in-flight slot for a work order. -func (scr *SnapshotContentReconciler) release(key string) { - scr.inFlightMu.Lock() - defer scr.inFlightMu.Unlock() - delete(scr.inFlight, key) -} - -// SetupWithManager registers the reconciler. 
The manager cache is label-scoped to this -// node, so a defense-in-depth nodeName predicate is enough; no extra watches are added. -func (scr *SnapshotContentReconciler) SetupWithManager(mgr ctrl.Manager) error { - return ctrl.NewControllerManagedBy(mgr). - For(&nvidiacomv1alpha1.SnapshotContent{}). - Complete(scr) -} - -// isContentTerminal reports whether the work order already has a terminal condition. -func isContentTerminal(content *nvidiacomv1alpha1.SnapshotContent) bool { - for _, t := range []string{nvidiacomv1alpha1.SnapshotConditionReady, nvidiacomv1alpha1.SnapshotConditionFailed} { - if cond := meta.FindStatusCondition(content.Status.Conditions, t); cond != nil && cond.Status == metav1.ConditionTrue { - return true - } - } - return false -} - -// storageCoordsFromContent reads the checkpoint ID (label) and artifact version -// (annotation) carried on the work order. A missing/blank checkpoint ID is fatal; the -// version falls back to the default only when the annotation is entirely absent. -func storageCoordsFromContent(content *nvidiacomv1alpha1.SnapshotContent) (string, string, error) { - checkpointID := strings.TrimSpace(content.Labels[snapshotprotocol.CheckpointIDLabel]) - if checkpointID == "" { - return "", "", fmt.Errorf("missing %s label", snapshotprotocol.CheckpointIDLabel) - } - version, ok := content.Annotations[snapshotprotocol.CheckpointArtifactVersionAnnotation] - if !ok { - version = snapshotprotocol.DefaultCheckpointArtifactVersion - } - version = strings.TrimSpace(version) - if version == "" { - return "", "", fmt.Errorf("blank %s annotation", snapshotprotocol.CheckpointArtifactVersionAnnotation) - } - return checkpointID, version, nil -} - -// artifactPresent reports whether a completed checkpoint directory already exists on disk. 
-func artifactPresent(destination string) bool { - info, err := os.Stat(destination) - return err == nil && info.IsDir() -} - diff --git a/deploy/snapshot/internal/controller/snapshotcontent_reconciler_test.go b/deploy/snapshot/internal/controller/snapshotcontent_reconciler_test.go deleted file mode 100644 index 1a96782854b0..000000000000 --- a/deploy/snapshot/internal/controller/snapshotcontent_reconciler_test.go +++ /dev/null @@ -1,238 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -package controller - -import ( - "context" - "errors" - "os" - "path/filepath" - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/meta" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/types" - k8sfake "k8s.io/client-go/kubernetes/fake" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - crfake "sigs.k8s.io/controller-runtime/pkg/client/fake" - - nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" - snapshottypes "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types" - snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol" -) - -// fakeCheckpointer records calls and returns a configured error. 
-type fakeCheckpointer struct { - called bool - err error -} - -func (fc *fakeCheckpointer) Checkpoint(_ context.Context, _ CheckpointParams) error { - fc.called = true - return fc.err -} - -func contentScheme(t *testing.T) *runtime.Scheme { - t.Helper() - s := runtime.NewScheme() - require.NoError(t, nvidiacomv1alpha1.AddToScheme(s)) - require.NoError(t, corev1.AddToScheme(s)) - return s -} - -func makeContentReconciler(t *testing.T, checkpointer NodeCheckpointer, objs ...client.Object) *SnapshotContentReconciler { - t.Helper() - s := contentScheme(t) - return &SnapshotContentReconciler{ - Client: crfake.NewClientBuilder().WithScheme(s).WithObjects(objs...). - WithStatusSubresource(&nvidiacomv1alpha1.SnapshotContent{}).Build(), - Clientset: k8sfake.NewClientset(), - Config: &snapshottypes.AgentConfig{NodeName: "node-a", Storage: snapshottypes.StorageSpec{Type: "pvc", BasePath: t.TempDir()}}, - NodeName: "node-a", - HolderID: "snapshot-agent/test", - Checkpointer: checkpointer, - inFlight: make(map[string]struct{}), - } -} - -func makeWorkOrder(name, node, checkpointID string) *nvidiacomv1alpha1.SnapshotContent { - return &nvidiacomv1alpha1.SnapshotContent{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Labels: map[string]string{snapshotprotocol.CheckpointIDLabel: checkpointID}, - Annotations: map[string]string{snapshotprotocol.CheckpointArtifactVersionAnnotation: "1"}, - }, - Spec: nvidiacomv1alpha1.SnapshotContentSpec{ - SnapshotRef: nvidiacomv1alpha1.SnapshotReference{Namespace: "inference", Name: "snapshot-" + checkpointID}, - Source: nvidiacomv1alpha1.SnapshotContentSource{PodRef: nvidiacomv1alpha1.PodReference{Name: "worker-0", UID: types.UID("pod-uid")}, NodeName: node}, - }, - } -} - -func reconcileContent(t *testing.T, r *SnapshotContentReconciler, name string) ctrl.Result { - t.Helper() - res, err := r.Reconcile(context.Background(), ctrl.Request{NamespacedName: types.NamespacedName{Name: name}}) - require.NoError(t, err) - return res -} - -func 
getContent(t *testing.T, r *SnapshotContentReconciler, name string) *nvidiacomv1alpha1.SnapshotContent { - t.Helper() - c := &nvidiacomv1alpha1.SnapshotContent{} - require.NoError(t, r.Get(context.Background(), types.NamespacedName{Name: name}, c)) - return c -} - -func TestSnapshotContentReconciler_IgnoresOtherNode(t *testing.T) { - content := makeWorkOrder("snapshotcontent-x", "node-b", "x") - fc := &fakeCheckpointer{} - r := makeContentReconciler(t, fc, content) - - reconcileContent(t, r, content.Name) - assert.False(t, fc.called) - got := getContent(t, r, content.Name) - assert.Empty(t, got.Status.Conditions) -} - -func TestSnapshotContentReconciler_InFlightGuard(t *testing.T) { - content := makeWorkOrder("snapshotcontent-x", "node-a", "x") - pod := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{Name: "worker-0", Namespace: "inference", UID: types.UID("pod-uid")}, - Spec: corev1.PodSpec{NodeName: "node-a"}, - Status: corev1.PodStatus{Phase: corev1.PodRunning}, - } - r := makeContentReconciler(t, &fakeCheckpointer{}, content, pod) - // Pre-mark the work order in-flight; the reconcile must short-circuit. 
- r.inFlight["/snapshotcontent-x"] = struct{}{} - - res := reconcileContent(t, r, content.Name) - assert.Zero(t, res.RequeueAfter) - got := getContent(t, r, content.Name) - assert.Empty(t, got.Status.Conditions) -} - -func TestSnapshotContentReconciler_MissingCheckpointIDFails(t *testing.T) { - content := makeWorkOrder("snapshotcontent-x", "node-a", "x") - delete(content.Labels, snapshotprotocol.CheckpointIDLabel) - r := makeContentReconciler(t, &fakeCheckpointer{}, content) - - reconcileContent(t, r, content.Name) - got := getContent(t, r, content.Name) - cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) - require.NotNil(t, cond) - assert.Equal(t, "MissingStorageCoords", cond.Reason) -} - -func TestSnapshotContentReconciler_ResumeWritesReady(t *testing.T) { - content := makeWorkOrder("snapshotcontent-abc", "node-a", "abc") - r := makeContentReconciler(t, &fakeCheckpointer{}, content) - // Pre-create the artifact directory at the resolved destination. 
- dest := filepath.Join(r.Config.Storage.BasePath, "abc", "versions", "1") - require.NoError(t, os.MkdirAll(dest, 0o755)) - - reconcileContent(t, r, content.Name) - got := getContent(t, r, content.Name) - cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionReady) - require.NotNil(t, cond) -} - -func TestSnapshotContentReconciler_PodNotFoundBacksOff(t *testing.T) { - content := makeWorkOrder("snapshotcontent-x", "node-a", "x") - r := makeContentReconciler(t, &fakeCheckpointer{}, content) // no pod - - res := reconcileContent(t, r, content.Name) - assert.Positive(t, res.RequeueAfter) - got := getContent(t, r, content.Name) - assert.Empty(t, got.Status.Conditions) -} - -func TestSnapshotContentReconciler_StalePodUIDFails(t *testing.T) { - content := makeWorkOrder("snapshotcontent-x", "node-a", "x") - pod := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{Name: "worker-0", Namespace: "inference", UID: types.UID("different-uid")}, - Spec: corev1.PodSpec{NodeName: "node-a"}, - Status: corev1.PodStatus{Phase: corev1.PodRunning}, - } - r := makeContentReconciler(t, &fakeCheckpointer{}, content, pod) - - reconcileContent(t, r, content.Name) - got := getContent(t, r, content.Name) - cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) - require.NotNil(t, cond) - assert.Equal(t, "StalePodReference", cond.Reason) -} - -func TestSnapshotContentReconciler_PodFailedFails(t *testing.T) { - content := makeWorkOrder("snapshotcontent-x", "node-a", "x") - pod := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{Name: "worker-0", Namespace: "inference", UID: types.UID("pod-uid")}, - Spec: corev1.PodSpec{NodeName: "node-a"}, - Status: corev1.PodStatus{Phase: corev1.PodFailed}, - } - r := makeContentReconciler(t, &fakeCheckpointer{}, content, pod) - - reconcileContent(t, r, content.Name) - got := getContent(t, r, content.Name) - cond := meta.FindStatusCondition(got.Status.Conditions, 
nvidiacomv1alpha1.SnapshotConditionFailed) - require.NotNil(t, cond) - assert.Equal(t, "SourcePodGone", cond.Reason) -} - -func TestSnapshotContentReconciler_NotReadyQuiesceRequeue(t *testing.T) { - content := makeWorkOrder("snapshotcontent-x", "node-a", "x") - pod := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "worker-0", Namespace: "inference", UID: types.UID("pod-uid"), - Annotations: map[string]string{snapshotprotocol.TargetContainersAnnotation: "main"}, - }, - Spec: corev1.PodSpec{NodeName: "node-a"}, - Status: corev1.PodStatus{ - Phase: corev1.PodRunning, - ContainerStatuses: []corev1.ContainerStatus{{Name: "main", Ready: false}}, - }, - } - r := makeContentReconciler(t, &fakeCheckpointer{}, content, pod) - - res := reconcileContent(t, r, content.Name) - assert.Positive(t, res.RequeueAfter) - got := getContent(t, r, content.Name) - assert.Empty(t, got.Status.Conditions) -} - -func TestSnapshotContentReconciler_RunCheckpointWritesReadyOnSuccess(t *testing.T) { - content := makeWorkOrder("snapshotcontent-abc", "node-a", "abc") - fc := &fakeCheckpointer{} - r := makeContentReconciler(t, fc, content) - pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "worker-0", Namespace: "inference", UID: types.UID("pod-uid")}} - leaseKey := client.ObjectKey{Namespace: "inference", Name: content.Name} - - r.runCheckpoint(context.Background(), content, pod, "main", "abc", - filepath.Join(r.Config.Storage.BasePath, "abc", "versions", "1"), leaseKey, "/snapshotcontent-abc") - - assert.True(t, fc.called) - got := getContent(t, r, content.Name) - cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionReady) - require.NotNil(t, cond) -} - -func TestSnapshotContentReconciler_RunCheckpointWritesFailedOnError(t *testing.T) { - content := makeWorkOrder("snapshotcontent-abc", "node-a", "abc") - fc := &fakeCheckpointer{err: errors.New("criu boom")} - r := makeContentReconciler(t, fc, content) - pod := &corev1.Pod{ObjectMeta: 
metav1.ObjectMeta{Name: "worker-0", Namespace: "inference", UID: types.UID("pod-uid")}} - leaseKey := client.ObjectKey{Namespace: "inference", Name: content.Name} - - r.runCheckpoint(context.Background(), content, pod, "main", "abc", - filepath.Join(r.Config.Storage.BasePath, "abc", "versions", "1"), leaseKey, "/snapshotcontent-abc") - - got := getContent(t, r, content.Name) - cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) - require.NotNil(t, cond) - assert.Equal(t, "CheckpointFailed", cond.Reason) -} diff --git a/deploy/snapshot/internal/controller/snapshotcontent_test.go b/deploy/snapshot/internal/controller/snapshotcontent_test.go new file mode 100644 index 000000000000..7f30ac8f0512 --- /dev/null +++ b/deploy/snapshot/internal/controller/snapshotcontent_test.go @@ -0,0 +1,348 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +package controller + +import ( + "context" + "errors" + "os" + "path/filepath" + "sync" + "testing" + "time" + + "github.com/go-logr/logr" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + k8sfake "k8s.io/client-go/kubernetes/fake" + "sigs.k8s.io/controller-runtime/pkg/client" + crfake "sigs.k8s.io/controller-runtime/pkg/client/fake" + + nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" + snapshottypes "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types" + snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol" +) + +// fakeCheckpointer records calls behind the checkpointFn seam and returns a configured error. 
+type fakeCheckpointer struct { + mu sync.Mutex + called bool + params CheckpointParams + err error +} + +// fn is the checkpointFn seam the NodeController invokes for the dump. +func (fc *fakeCheckpointer) fn(_ context.Context, params CheckpointParams) error { + fc.mu.Lock() + defer fc.mu.Unlock() + fc.called = true + fc.params = params + return fc.err +} + +// wasCalled reports whether the seam was invoked. +func (fc *fakeCheckpointer) wasCalled() bool { + fc.mu.Lock() + defer fc.mu.Unlock() + return fc.called +} + +// lastParams returns the params from the most recent seam invocation. +func (fc *fakeCheckpointer) lastParams() CheckpointParams { + fc.mu.Lock() + defer fc.mu.Unlock() + return fc.params +} + +// contentScheme builds a scheme with the SnapshotContent and core types registered. +func contentScheme(t *testing.T) *runtime.Scheme { + t.Helper() + s := runtime.NewScheme() + require.NoError(t, nvidiacomv1alpha1.AddToScheme(s)) + require.NoError(t, corev1.AddToScheme(s)) + return s +} + +// makeNodeController builds a NodeController wired to a fake typed client, runtime, and seam. +func makeNodeController(t *testing.T, fc *fakeCheckpointer, objs ...client.Object) *NodeController { + t.Helper() + s := contentScheme(t) + w := &NodeController{ + config: &snapshottypes.AgentConfig{NodeName: "node-a", Storage: snapshottypes.StorageSpec{Type: "pvc", BasePath: t.TempDir()}}, + clientset: k8sfake.NewClientset(), + client: crfake.NewClientBuilder().WithScheme(s).WithObjects(objs...). + WithStatusSubresource(&nvidiacomv1alpha1.SnapshotContent{}).Build(), + runtime: &fakeRuntime{}, + log: logr.Discard(), + holderID: "snapshot-agent/test", + inFlight: make(map[string]struct{}), + } + w.checkpointFn = fc.fn + return w +} + +// makeWorkOrder builds a SnapshotContent work order pinned to a node and checkpoint id. +// Capture parameters now live on the source pod, so the work order carries only the node +// label and spec. 
+func makeWorkOrder(name, node, checkpointID string) *nvidiacomv1alpha1.SnapshotContent { + return &nvidiacomv1alpha1.SnapshotContent{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: map[string]string{snapshotprotocol.SnapshotNodeLabel: node}, + }, + Spec: nvidiacomv1alpha1.SnapshotContentSpec{ + SnapshotRef: nvidiacomv1alpha1.SnapshotReference{Namespace: "inference", Name: "snapshot-" + checkpointID}, + Source: nvidiacomv1alpha1.SnapshotContentSource{PodRef: nvidiacomv1alpha1.PodReference{Name: "worker-0", UID: types.UID("pod-uid")}, NodeName: node}, + }, + } +} + +// makeSourcePod builds a ready source pod that carries the capture parameters the agent reads: +// the checkpoint-id label, the target-container annotation, and the storage/version annotations +// checkpointLocationsFromPod needs. +func makeSourcePod(checkpointID string) *corev1.Pod { + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "worker-0", + Namespace: "inference", + UID: types.UID("pod-uid"), + Labels: map[string]string{snapshotprotocol.CheckpointIDLabel: checkpointID}, + Annotations: map[string]string{ + snapshotprotocol.TargetContainersAnnotation: "main", + snapshotprotocol.CheckpointArtifactVersionAnnotation: "1", + }, + }, + Spec: corev1.PodSpec{NodeName: "node-a"}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + {Name: "main", Ready: true, ContainerID: "containerd://abc123"}, + }, + }, + } +} + +// getContent reads a SnapshotContent back from the fake client. 
+func getContent(t *testing.T, w *NodeController, name string) *nvidiacomv1alpha1.SnapshotContent { + t.Helper() + c := &nvidiacomv1alpha1.SnapshotContent{} + require.NoError(t, w.client.Get(context.Background(), types.NamespacedName{Name: name}, c)) + return c +} + +func TestReconcileSnapshotContent_IgnoresOtherNode(t *testing.T) { + content := makeWorkOrder("snapshotcontent-x", "node-b", "x") + fc := &fakeCheckpointer{} + w := makeNodeController(t, fc, content) + + w.reconcileSnapshotContent(context.Background(), content.Name) + assert.False(t, fc.wasCalled()) + got := getContent(t, w, content.Name) + assert.Empty(t, got.Status.Conditions) +} + +func TestReconcileSnapshotContent_InFlightGuard(t *testing.T) { + content := makeWorkOrder("snapshotcontent-x", "node-a", "x") + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "worker-0", Namespace: "inference", UID: types.UID("pod-uid")}, + Spec: corev1.PodSpec{NodeName: "node-a"}, + Status: corev1.PodStatus{Phase: corev1.PodRunning}, + } + w := makeNodeController(t, &fakeCheckpointer{}, content, pod) + // Pre-mark the work order in-flight; the reconcile must short-circuit. 
+ w.inFlight["snapshotcontent-x"] = struct{}{} + + w.reconcileSnapshotContent(context.Background(), content.Name) + got := getContent(t, w, content.Name) + assert.Empty(t, got.Status.Conditions) +} + +func TestReconcileSnapshotContent_MissingCheckpointIDFails(t *testing.T) { + content := makeWorkOrder("snapshotcontent-x", "node-a", "x") + pod := makeSourcePod("x") + delete(pod.Labels, snapshotprotocol.CheckpointIDLabel) + w := makeNodeController(t, &fakeCheckpointer{}, content, pod) + + w.reconcileSnapshotContent(context.Background(), content.Name) + got := getContent(t, w, content.Name) + cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) + require.NotNil(t, cond) + assert.Equal(t, "MissingCheckpointID", cond.Reason) +} + +func TestReconcileSnapshotContent_CheckpointIDMismatchFails(t *testing.T) { + // Work order name embeds "abc" but the source pod label says "xyz". + content := makeWorkOrder("snapshotcontent-abc", "node-a", "abc") + pod := makeSourcePod("xyz") + w := makeNodeController(t, &fakeCheckpointer{}, content, pod) + + w.reconcileSnapshotContent(context.Background(), content.Name) + got := getContent(t, w, content.Name) + cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) + require.NotNil(t, cond) + assert.Equal(t, "CheckpointIDMismatch", cond.Reason) +} + +func TestReconcileSnapshotContent_ResumeWritesReady(t *testing.T) { + content := makeWorkOrder("snapshotcontent-abc", "node-a", "abc") + pod := makeSourcePod("abc") + fc := &fakeCheckpointer{} + w := makeNodeController(t, fc, content, pod) + w.runtime = &fakeRuntime{resolveContainerPID: 4242} + // Pre-create the artifact directory at the resolved destination so the resume check fires. 
+ dest := filepath.Join(w.config.Storage.BasePath, "abc", "versions", "1") + require.NoError(t, os.MkdirAll(dest, 0o755)) + + w.reconcileSnapshotContent(context.Background(), content.Name) + assert.False(t, fc.wasCalled()) + got := getContent(t, w, content.Name) + cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionReady) + require.NotNil(t, cond) +} + +func TestReconcileSnapshotContent_PodMountResolvesContainerPID(t *testing.T) { + content := makeWorkOrder("snapshotcontent-abc", "node-a", "abc") + pod := makeSourcePod("abc") + fc := &fakeCheckpointer{} + w := makeNodeController(t, fc, content, pod) + w.config.Storage.AccessMode = snapshottypes.StorageAccessModePodMount + rt := &fakeRuntime{resolveContainerPID: 4242} + w.runtime = rt + + w.reconcileSnapshotContent(context.Background(), content.Name) + + // podMount mode resolves the container PID and feeds it through checkpointLocationsFromPod + // (a zero PID would fail there with a different reason). The subsequent live-PID validation + // fails in a unit test because /host/proc/<pid> does not exist, which proves the non-zero + // PID flowed through to validatePodMountContainerPID. 
+ assert.Contains(t, rt.resolvedContainerIDs, "abc123") + assert.False(t, fc.wasCalled()) + got := getContent(t, w, content.Name) + cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) + require.NotNil(t, cond) + assert.Equal(t, "ContainerChanged", cond.Reason) +} + +func TestReconcileSnapshotContent_PodNotFoundNoOp(t *testing.T) { + content := makeWorkOrder("snapshotcontent-x", "node-a", "x") + w := makeNodeController(t, &fakeCheckpointer{}, content) // no pod + + w.reconcileSnapshotContent(context.Background(), content.Name) + got := getContent(t, w, content.Name) + assert.Empty(t, got.Status.Conditions) +} + +func TestReconcileSnapshotContent_StalePodUIDFails(t *testing.T) { + content := makeWorkOrder("snapshotcontent-x", "node-a", "x") + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "worker-0", Namespace: "inference", UID: types.UID("different-uid")}, + Spec: corev1.PodSpec{NodeName: "node-a"}, + Status: corev1.PodStatus{Phase: corev1.PodRunning}, + } + w := makeNodeController(t, &fakeCheckpointer{}, content, pod) + + w.reconcileSnapshotContent(context.Background(), content.Name) + got := getContent(t, w, content.Name) + cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) + require.NotNil(t, cond) + assert.Equal(t, "StalePodReference", cond.Reason) +} + +func TestReconcileSnapshotContent_PodFailedFails(t *testing.T) { + content := makeWorkOrder("snapshotcontent-x", "node-a", "x") + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "worker-0", Namespace: "inference", UID: types.UID("pod-uid")}, + Spec: corev1.PodSpec{NodeName: "node-a"}, + Status: corev1.PodStatus{Phase: corev1.PodFailed}, + } + w := makeNodeController(t, &fakeCheckpointer{}, content, pod) + + w.reconcileSnapshotContent(context.Background(), content.Name) + got := getContent(t, w, content.Name) + cond := meta.FindStatusCondition(got.Status.Conditions, 
nvidiacomv1alpha1.SnapshotConditionFailed) + require.NotNil(t, cond) + assert.Equal(t, "SourcePodGone", cond.Reason) +} + +func TestReconcileSnapshotContent_NotReadyQuiesceNoOp(t *testing.T) { + content := makeWorkOrder("snapshotcontent-x", "node-a", "x") + pod := makeSourcePod("x") + pod.Status.ContainerStatuses[0].Ready = false + fc := &fakeCheckpointer{} + w := makeNodeController(t, fc, content, pod) + + w.reconcileSnapshotContent(context.Background(), content.Name) + assert.False(t, fc.wasCalled()) + got := getContent(t, w, content.Name) + assert.Empty(t, got.Status.Conditions) +} + +func TestReconcileSnapshotContent_CapturesFromPod(t *testing.T) { + content := makeWorkOrder("snapshotcontent-abc", "node-a", "abc") + pod := makeSourcePod("abc") + fc := &fakeCheckpointer{} + w := makeNodeController(t, fc, content, pod) + w.runtime = &fakeRuntime{resolveContainerPID: 7} + + w.reconcileSnapshotContent(context.Background(), content.Name) + require.Eventually(t, fc.wasCalled, time.Second, 5*time.Millisecond) + + // Capture parameters are read from the source pod, not from SnapshotContent metadata. + params := fc.lastParams() + assert.Equal(t, "abc", params.CheckpointID) + assert.Equal(t, "main", params.ContainerName) + assert.Equal(t, "abc123", params.ContainerID) + assert.Equal(t, 7, params.ContainerPID) + // agentMount: HostPath == ContainerPath == resolved destination. 
+ dest := filepath.Join(w.config.Storage.BasePath, "abc", "versions", "1") + assert.Equal(t, dest, params.HostPath) + assert.Equal(t, dest, params.ContainerPath) + + got := getContent(t, w, content.Name) + cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionReady) + require.NotNil(t, cond) +} + +func TestRunCheckpoint_WritesReadyOnSuccess(t *testing.T) { + content := makeWorkOrder("snapshotcontent-abc", "node-a", "abc") + fc := &fakeCheckpointer{} + w := makeNodeController(t, fc, content) + pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "worker-0", Namespace: "inference", UID: types.UID("pod-uid")}} + leaseKey := client.ObjectKey{Namespace: "inference", Name: content.Name} + loc := checkpointLocations{ + HostPath: filepath.Join(w.config.Storage.BasePath, "abc", "versions", "1"), + ContainerPath: filepath.Join(w.config.Storage.BasePath, "abc", "versions", "1"), + } + + w.runCheckpoint(context.Background(), content, pod, "main", "abc123", 7, "abc", loc, leaseKey, "snapshotcontent-abc") + + assert.True(t, fc.wasCalled()) + got := getContent(t, w, content.Name) + cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionReady) + require.NotNil(t, cond) +} + +func TestRunCheckpoint_WritesFailedOnError(t *testing.T) { + content := makeWorkOrder("snapshotcontent-abc", "node-a", "abc") + fc := &fakeCheckpointer{err: errors.New("criu boom")} + w := makeNodeController(t, fc, content) + pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "worker-0", Namespace: "inference", UID: types.UID("pod-uid")}} + leaseKey := client.ObjectKey{Namespace: "inference", Name: content.Name} + loc := checkpointLocations{ + HostPath: filepath.Join(w.config.Storage.BasePath, "abc", "versions", "1"), + ContainerPath: filepath.Join(w.config.Storage.BasePath, "abc", "versions", "1"), + } + + w.runCheckpoint(context.Background(), content, pod, "main", "abc123", 7, "abc", loc, leaseKey, "snapshotcontent-abc") + + got 
:= getContent(t, w, content.Name) + cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) + require.NotNil(t, cond) + assert.Equal(t, "CheckpointFailed", cond.Reason) +} diff --git a/deploy/snapshot/internal/controller/util.go b/deploy/snapshot/internal/controller/util.go index c791b03f7e4d..778ce9f161e4 100644 --- a/deploy/snapshot/internal/controller/util.go +++ b/deploy/snapshot/internal/controller/util.go @@ -85,11 +85,11 @@ func annotatePod(ctx context.Context, clientset kubernetes.Interface, log logr.L // acquireLease acquires or renews a checkpoint lease at an arbitrary namespace/name key, // returning false when another live holder owns it. -func (scr *SnapshotContentReconciler) acquireLease(ctx context.Context, key client.ObjectKey) (bool, error) { +func (w *NodeController) acquireLease(ctx context.Context, key client.ObjectKey) (bool, error) { now := metav1.NewMicroTime(time.Now()) leaseDurationSeconds := int32(checkpointLeaseDuration.Seconds()) - leaseClient := scr.Clientset.CoordinationV1().Leases(key.Namespace) + leaseClient := w.clientset.CoordinationV1().Leases(key.Namespace) existing, err := leaseClient.Get(ctx, key.Name, metav1.GetOptions{}) if err != nil { if !apierrors.IsNotFound(err) { @@ -98,7 +98,7 @@ func (scr *SnapshotContentReconciler) acquireLease(ctx context.Context, key clie lease := &coordinationv1.Lease{ ObjectMeta: metav1.ObjectMeta{Name: key.Name, Namespace: key.Namespace}, Spec: coordinationv1.LeaseSpec{ - HolderIdentity: &scr.HolderID, + HolderIdentity: &w.holderID, LeaseDurationSeconds: &leaseDurationSeconds, AcquireTime: &now, RenewTime: &now, @@ -115,10 +115,10 @@ func (scr *SnapshotContentReconciler) acquireLease(ctx context.Context, key clie if !checkpointLeaseExpired(existing, now.Time) && existing.Spec.HolderIdentity != nil && - *existing.Spec.HolderIdentity != scr.HolderID { + *existing.Spec.HolderIdentity != w.holderID { return false, nil } - existing.Spec.HolderIdentity = 
&scr.HolderID + existing.Spec.HolderIdentity = &w.holderID existing.Spec.LeaseDurationSeconds = &leaseDurationSeconds if existing.Spec.AcquireTime == nil || checkpointLeaseExpired(existing, now.Time) { existing.Spec.AcquireTime = &now @@ -134,7 +134,7 @@ func (scr *SnapshotContentReconciler) acquireLease(ctx context.Context, key clie } // renewLease periodically renews the lease until ctx is cancelled. -func (scr *SnapshotContentReconciler) renewLease(ctx context.Context, key client.ObjectKey) { +func (w *NodeController) renewLease(ctx context.Context, key client.ObjectKey) { ticker := time.NewTicker(checkpointLeaseRenewInterval) defer ticker.Stop() for { @@ -142,7 +142,7 @@ func (scr *SnapshotContentReconciler) renewLease(ctx context.Context, key client case <-ctx.Done(): return case <-ticker.C: - if err := scr.renewLeaseOnce(ctx, key); err != nil { + if err := w.renewLeaseOnce(ctx, key); err != nil { log.FromContext(ctx).Error(err, "Failed to renew checkpoint lease", "lease", key.String()) return } @@ -151,14 +151,14 @@ func (scr *SnapshotContentReconciler) renewLease(ctx context.Context, key client } // renewLeaseOnce bumps the lease renew time, failing if this holder no longer owns it. 
-func (scr *SnapshotContentReconciler) renewLeaseOnce(ctx context.Context, key client.ObjectKey) error { - leaseClient := scr.Clientset.CoordinationV1().Leases(key.Namespace) +func (w *NodeController) renewLeaseOnce(ctx context.Context, key client.ObjectKey) error { + leaseClient := w.clientset.CoordinationV1().Leases(key.Namespace) lease, err := leaseClient.Get(ctx, key.Name, metav1.GetOptions{}) if err != nil { return fmt.Errorf("get checkpoint lease %s for renewal: %w", key.String(), err) } - if lease.Spec.HolderIdentity == nil || *lease.Spec.HolderIdentity != scr.HolderID { - return fmt.Errorf("checkpoint lease %s is no longer held by %q", key.String(), scr.HolderID) + if lease.Spec.HolderIdentity == nil || *lease.Spec.HolderIdentity != w.holderID { + return fmt.Errorf("checkpoint lease %s is no longer held by %q", key.String(), w.holderID) } now := metav1.NewMicroTime(time.Now()) leaseDurationSeconds := int32(checkpointLeaseDuration.Seconds()) @@ -171,8 +171,8 @@ func (scr *SnapshotContentReconciler) renewLeaseOnce(ctx context.Context, key cl } // releaseLease deletes the lease if this holder owns it. 
-func (scr *SnapshotContentReconciler) releaseLease(ctx context.Context, key client.ObjectKey) error { - leaseClient := scr.Clientset.CoordinationV1().Leases(key.Namespace) +func (w *NodeController) releaseLease(ctx context.Context, key client.ObjectKey) error { + leaseClient := w.clientset.CoordinationV1().Leases(key.Namespace) lease, err := leaseClient.Get(ctx, key.Name, metav1.GetOptions{}) if err != nil { if apierrors.IsNotFound(err) { @@ -180,7 +180,7 @@ func (scr *SnapshotContentReconciler) releaseLease(ctx context.Context, key clie } return fmt.Errorf("get checkpoint lease %s for release: %w", key.String(), err) } - if lease.Spec.HolderIdentity == nil || *lease.Spec.HolderIdentity != scr.HolderID { + if lease.Spec.HolderIdentity == nil || *lease.Spec.HolderIdentity != w.holderID { return nil } if err := leaseClient.Delete(ctx, key.Name, metav1.DeleteOptions{}); err != nil && !apierrors.IsNotFound(err) { From 935c5b0b75dc77516ed13db81664051b9776b8c1 Mon Sep 17 00:00:00 2001 From: Ron Kahn Date: Mon, 15 Jun 2026 23:16:48 +0300 Subject: [PATCH 08/14] fix(operator): fail checkpoint on unbound Snapshot failure Signed-off-by: Ron Kahn --- .../controller/dynamocheckpoint_controller.go | 17 +++++++---------- .../dynamocheckpoint_controller_test.go | 15 +++++++++++++++ 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/deploy/operator/internal/controller/dynamocheckpoint_controller.go b/deploy/operator/internal/controller/dynamocheckpoint_controller.go index 38949fc9d1b1..154619048ef1 100644 --- a/deploy/operator/internal/controller/dynamocheckpoint_controller.go +++ b/deploy/operator/internal/controller/dynamocheckpoint_controller.go @@ -365,16 +365,13 @@ func (r *CheckpointReconciler) observeSnapshot(ctx context.Context, ckpt *nvidia return ctrl.Result{}, err } - // Read Snapshot.status only once it is bound; an unbound Snapshot is still being - // set up by the SnapshotReconciler. 
Observe the terminal state and delegate the - // status write to a dedicated function. - if snap.Status.BoundSnapshotContentName != nil { - if nvidiacomv1alpha1.IsSnapshotSucceeded(snap) { - return r.markCheckpointReady(ctx, ckpt, checkpointID, snapshotConditionMessage(snap, nvidiacomv1alpha1.SnapshotConditionReady)) - } - if nvidiacomv1alpha1.IsSnapshotFailed(snap) { - return r.failCreating(ctx, ckpt, "SnapshotFailed", snapshotConditionMessage(snap, nvidiacomv1alpha1.SnapshotConditionFailed)) - } + // A Snapshot can fail before it is bound (e.g. the SnapshotReconciler rejects the + // source pod), so always observe Failed. Ready is only meaningful once bound. + if nvidiacomv1alpha1.IsSnapshotFailed(snap) { + return r.failCreating(ctx, ckpt, "SnapshotFailed", snapshotConditionMessage(snap, nvidiacomv1alpha1.SnapshotConditionFailed)) + } + if snap.Status.BoundSnapshotContentName != nil && nvidiacomv1alpha1.IsSnapshotSucceeded(snap) { + return r.markCheckpointReady(ctx, ckpt, checkpointID, snapshotConditionMessage(snap, nvidiacomv1alpha1.SnapshotConditionReady)) } // Hang guard 1: the owned Job failed while the Snapshot is still non-terminal. 
diff --git a/deploy/operator/internal/controller/dynamocheckpoint_controller_test.go b/deploy/operator/internal/controller/dynamocheckpoint_controller_test.go index fcde4baded99..12e79664afae 100644 --- a/deploy/operator/internal/controller/dynamocheckpoint_controller_test.go +++ b/deploy/operator/internal/controller/dynamocheckpoint_controller_test.go @@ -897,6 +897,21 @@ func TestCheckpointReconciler_HandleCreating(t *testing.T) { assert.Contains(t, updated.Status.Message, "from agent") }) + t.Run("unbound Snapshot Failed transitions checkpoint to Failed", func(t *testing.T) { + ckpt := makeCreatingCkpt(testHash, defaultCheckpointJobName) + job := newCheckpointJob(defaultCheckpointJobName) + snap := ownedSnapshot(ckpt, nvidiacomv1alpha1.SnapshotConditionFailed) + snap.Status.BoundSnapshotContentName = nil // failed before binding + + r := makeCheckpointReconciler(s, ckpt, job, snap, newOwnedPod(podNameFromJob(job.Name), job)) + _, err := r.handleCreating(ctx, ckpt) + require.NoError(t, err) + + updated := &nvidiacomv1alpha1.DynamoCheckpoint{} + require.NoError(t, r.Get(ctx, types.NamespacedName{Name: testHash, Namespace: testNamespace}, updated)) + assert.Equal(t, nvidiacomv1alpha1.DynamoCheckpointPhaseFailed, updated.Status.Phase) + }) + t.Run("failed Job while Snapshot non-terminal transitions to Failed", func(t *testing.T) { ckpt := makeCreatingCkpt(testHash, defaultCheckpointJobName) job := newCheckpointJob(defaultCheckpointJobName) From 7d5659bb6a8df6c405039744ffea30feb1cbf3dc Mon Sep 17 00:00:00 2001 From: Ron Kahn Date: Mon, 15 Jun 2026 23:16:48 +0300 Subject: [PATCH 09/14] fix(snapshot): add helm RBAC document separator Signed-off-by: Ron Kahn --- deploy/helm/charts/snapshot/templates/role.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/deploy/helm/charts/snapshot/templates/role.yaml b/deploy/helm/charts/snapshot/templates/role.yaml index 1830f487b346..c8bc68bfd6d0 100644 --- a/deploy/helm/charts/snapshot/templates/role.yaml +++ 
b/deploy/helm/charts/snapshot/templates/role.yaml @@ -56,6 +56,7 @@ rules: {{- if .Values.rbac.create }} {{- if not .Values.rbac.namespaceRestricted }} +--- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: From 2c0647a8cbbe4215c75ed6deb64647c8f5e9f10e Mon Sep 17 00:00:00 2001 From: Ron Kahn Date: Mon, 15 Jun 2026 23:38:37 +0300 Subject: [PATCH 10/14] refactor(operator): drop Snapshot.spec.checkpointID, use pod label Signed-off-by: Ron Kahn --- .../operator/crds/nvidia.com_snapshots.yaml | 24 ++-------- .../operator/api/v1alpha1/snapshot_types.go | 16 ------- .../api/v1alpha1/snapshot_types_test.go | 9 ++-- .../crd/bases/nvidia.com_snapshots.yaml | 24 ++-------- .../controller/checkpoint_snapshot.go | 1 - .../controller/checkpoint_snapshot_test.go | 6 +-- .../dynamocheckpoint_controller_test.go | 5 +- .../controller/snapshot_reconciler.go | 25 +++++++--- .../controller/snapshot_reconciler_test.go | 47 ++++++++++++------- 9 files changed, 67 insertions(+), 90 deletions(-) diff --git a/deploy/helm/charts/platform/components/operator/crds/nvidia.com_snapshots.yaml b/deploy/helm/charts/platform/components/operator/crds/nvidia.com_snapshots.yaml index f6a8567176cd..013efd804f8b 100644 --- a/deploy/helm/charts/platform/components/operator/crds/nvidia.com_snapshots.yaml +++ b/deploy/helm/charts/platform/components/operator/crds/nvidia.com_snapshots.yaml @@ -21,10 +21,6 @@ spec: scope: Namespaced versions: - additionalPrinterColumns: - - description: Artifact identity - jsonPath: .spec.checkpointID - name: CheckpointID - type: string - description: Bound SnapshotContent jsonPath: .status.boundSnapshotContentName name: Content @@ -64,21 +60,12 @@ spec: description: |- SnapshotSpec defines the desired state of Snapshot. - Minimal "trigger" shape: it names what to capture (an existing pod) and the - artifact identity (CheckpointID). 
Capture parameters the node agent needs at - dump time (target container, storage base path) are read from the referenced - pod's existing annotations and mounts, not duplicated here. The spec is - immutable after creation. + Minimal "trigger" shape: it names what to capture (an existing pod). All + capture parameters the node agent needs at dump time (checkpoint ID, target + container, storage base path) are read from the referenced pod's existing + labels/annotations and mounts, not duplicated here. The spec is immutable + after creation. properties: - checkpointID: - description: |- - CheckpointID is the stable artifact identity and the on-PVC artifact - subdirectory name (<basePath>/<checkpointID>/versions/<version>/). It is - the primary key of the storage contract shared with the restore path and - is immutable after creation. - maxLength: 253 - minLength: 1 - type: string source: description: |- Source identifies the captured workload. It is a struct (rather than an @@ -106,7 +93,6 @@ spec: - podRef type: object required: - - checkpointID - source type: object status: diff --git a/deploy/operator/api/v1alpha1/snapshot_types.go b/deploy/operator/api/v1alpha1/snapshot_types.go index 5ecc9d162763..14500742c904 100644 --- a/deploy/operator/api/v1alpha1/snapshot_types.go +++ b/deploy/operator/api/v1alpha1/snapshot_types.go @@ -44,22 +44,7 @@ func IsSnapshotFailed(s *Snapshot) bool { } // SnapshotSpec defines the desired state of Snapshot. -// -// Minimal "trigger" shape: it names what to capture (an existing pod) and the -// artifact identity (CheckpointID). Capture parameters the node agent needs at -// dump time (target container, storage base path) are read from the referenced -// pod's existing annotations and mounts, not duplicated here. The spec is -// immutable after creation. type SnapshotSpec struct { - // CheckpointID is the stable artifact identity and the on-PVC artifact - // subdirectory name (<basePath>/<checkpointID>/versions/<version>/). 
It is - // the primary key of the storage contract shared with the restore path and - // is immutable after creation. - // +kubebuilder:validation:Required - // +kubebuilder:validation:MinLength=1 - // +kubebuilder:validation:MaxLength=253 - CheckpointID string `json:"checkpointID"` - // Source identifies the captured workload. It is a struct (rather than an // inlined reference) so future source variants can be added additively. // +kubebuilder:validation:Required @@ -105,7 +90,6 @@ type SnapshotStatus struct { // +kubebuilder:object:root=true // +kubebuilder:subresource:status // +kubebuilder:resource:scope=Namespaced,shortName=snap -// +kubebuilder:printcolumn:name="CheckpointID",type="string",JSONPath=".spec.checkpointID",description="Artifact identity" // +kubebuilder:printcolumn:name="Content",type="string",JSONPath=".status.boundSnapshotContentName",description="Bound SnapshotContent" // +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status",description="Ready condition" // +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp" diff --git a/deploy/operator/api/v1alpha1/snapshot_types_test.go b/deploy/operator/api/v1alpha1/snapshot_types_test.go index 5ff451ed0a20..57ae50b7d99e 100644 --- a/deploy/operator/api/v1alpha1/snapshot_types_test.go +++ b/deploy/operator/api/v1alpha1/snapshot_types_test.go @@ -46,8 +46,7 @@ func TestSnapshotDeepCopyIsIndependent(t *testing.T) { original := &Snapshot{ ObjectMeta: metav1.ObjectMeta{Name: "snap-a", Namespace: "inference"}, Spec: SnapshotSpec{ - CheckpointID: "abc123", - Source: SnapshotSource{PodRef: PodReference{Name: "worker-0"}}, + Source: SnapshotSource{PodRef: PodReference{Name: "worker-0"}}, }, Status: SnapshotStatus{ Conditions: []metav1.Condition{{Type: "Ready", Status: metav1.ConditionTrue, Reason: "Captured"}}, @@ -59,10 +58,10 @@ func TestSnapshotDeepCopyIsIndependent(t *testing.T) { t.Fatalf("DeepCopy is not equal to 
original") } - clone.Spec.CheckpointID = "mutated" + clone.Spec.Source.PodRef.Name = "mutated" clone.Status.Conditions[0].Reason = "Changed" - if original.Spec.CheckpointID != "abc123" { - t.Errorf("mutating clone spec changed original: got %q", original.Spec.CheckpointID) + if original.Spec.Source.PodRef.Name != "worker-0" { + t.Errorf("mutating clone spec changed original: got %q", original.Spec.Source.PodRef.Name) } if original.Status.Conditions[0].Reason != "Captured" { t.Errorf("mutating clone condition changed original: got %q", original.Status.Conditions[0].Reason) diff --git a/deploy/operator/config/crd/bases/nvidia.com_snapshots.yaml b/deploy/operator/config/crd/bases/nvidia.com_snapshots.yaml index f6a8567176cd..013efd804f8b 100644 --- a/deploy/operator/config/crd/bases/nvidia.com_snapshots.yaml +++ b/deploy/operator/config/crd/bases/nvidia.com_snapshots.yaml @@ -21,10 +21,6 @@ spec: scope: Namespaced versions: - additionalPrinterColumns: - - description: Artifact identity - jsonPath: .spec.checkpointID - name: CheckpointID - type: string - description: Bound SnapshotContent jsonPath: .status.boundSnapshotContentName name: Content @@ -64,21 +60,12 @@ spec: description: |- SnapshotSpec defines the desired state of Snapshot. - Minimal "trigger" shape: it names what to capture (an existing pod) and the - artifact identity (CheckpointID). Capture parameters the node agent needs at - dump time (target container, storage base path) are read from the referenced - pod's existing annotations and mounts, not duplicated here. The spec is - immutable after creation. + Minimal "trigger" shape: it names what to capture (an existing pod). All + capture parameters the node agent needs at dump time (checkpoint ID, target + container, storage base path) are read from the referenced pod's existing + labels/annotations and mounts, not duplicated here. The spec is immutable + after creation. 
properties: - checkpointID: - description: |- - CheckpointID is the stable artifact identity and the on-PVC artifact - subdirectory name (//versions//). It is - the primary key of the storage contract shared with the restore path and - is immutable after creation. - maxLength: 253 - minLength: 1 - type: string source: description: |- Source identifies the captured workload. It is a struct (rather than an @@ -106,7 +93,6 @@ spec: - podRef type: object required: - - checkpointID - source type: object status: diff --git a/deploy/operator/internal/controller/checkpoint_snapshot.go b/deploy/operator/internal/controller/checkpoint_snapshot.go index df75a0cf51f7..13c9993abdee 100644 --- a/deploy/operator/internal/controller/checkpoint_snapshot.go +++ b/deploy/operator/internal/controller/checkpoint_snapshot.go @@ -92,7 +92,6 @@ func buildSnapshot(ckpt *nvidiacomv1alpha1.DynamoCheckpoint, checkpointID, sourc Labels: map[string]string{snapshotprotocol.CheckpointIDLabel: checkpointID}, }, Spec: nvidiacomv1alpha1.SnapshotSpec{ - CheckpointID: checkpointID, Source: nvidiacomv1alpha1.SnapshotSource{ PodRef: nvidiacomv1alpha1.PodReference{Name: sourcePodName}, }, diff --git a/deploy/operator/internal/controller/checkpoint_snapshot_test.go b/deploy/operator/internal/controller/checkpoint_snapshot_test.go index 877d8e4a72a0..e7516c9a0876 100644 --- a/deploy/operator/internal/controller/checkpoint_snapshot_test.go +++ b/deploy/operator/internal/controller/checkpoint_snapshot_test.go @@ -22,6 +22,7 @@ import ( "testing" nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" + snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" batchv1 "k8s.io/api/batch/v1" @@ -107,7 +108,7 @@ func TestEnsureSnapshot_CreatesWhenAbsent(t *testing.T) { snap := &nvidiacomv1alpha1.Snapshot{} require.NoError(t, r.Get(context.Background(), client.ObjectKey{Namespace: testNamespace, Name: 
snapshotName(testHash)}, snap)) - assert.Equal(t, testHash, snap.Spec.CheckpointID) + assert.Equal(t, testHash, snap.Labels[snapshotprotocol.CheckpointIDLabel]) assert.Equal(t, "worker-xyz", snap.Spec.Source.PodRef.Name) assert.True(t, metav1.IsControlledBy(snap, ckpt), "snapshot must be controlled by the checkpoint") } @@ -130,8 +131,7 @@ func TestEnsureSnapshot_ErrorsWhenNotOwned(t *testing.T) { foreign := &nvidiacomv1alpha1.Snapshot{ ObjectMeta: metav1.ObjectMeta{Name: snapshotName(testHash), Namespace: testNamespace}, Spec: nvidiacomv1alpha1.SnapshotSpec{ - CheckpointID: testHash, - Source: nvidiacomv1alpha1.SnapshotSource{PodRef: nvidiacomv1alpha1.PodReference{Name: "someone-else"}}, + Source: nvidiacomv1alpha1.SnapshotSource{PodRef: nvidiacomv1alpha1.PodReference{Name: "someone-else"}}, }, } r := makeCheckpointReconciler(checkpointTestScheme(), ckpt, foreign) diff --git a/deploy/operator/internal/controller/dynamocheckpoint_controller_test.go b/deploy/operator/internal/controller/dynamocheckpoint_controller_test.go index 12e79664afae..62178d7e086b 100644 --- a/deploy/operator/internal/controller/dynamocheckpoint_controller_test.go +++ b/deploy/operator/internal/controller/dynamocheckpoint_controller_test.go @@ -828,7 +828,7 @@ func TestCheckpointReconciler_HandleCreating(t *testing.T) { snap := &nvidiacomv1alpha1.Snapshot{} require.NoError(t, r.Get(ctx, types.NamespacedName{Name: snapshotName(testHash), Namespace: testNamespace}, snap)) - assert.Equal(t, testHash, snap.Spec.CheckpointID) + assert.Equal(t, testHash, snap.Labels[snapshotprotocol.CheckpointIDLabel]) assert.Equal(t, "worker-0", snap.Spec.Source.PodRef.Name) assert.True(t, metav1.IsControlledBy(snap, ckpt)) }) @@ -850,8 +850,7 @@ func TestCheckpointReconciler_HandleCreating(t *testing.T) { }}, }, Spec: nvidiacomv1alpha1.SnapshotSpec{ - CheckpointID: testHash, - Source: nvidiacomv1alpha1.SnapshotSource{PodRef: nvidiacomv1alpha1.PodReference{Name: "worker-0"}}, + Source: 
nvidiacomv1alpha1.SnapshotSource{PodRef: nvidiacomv1alpha1.PodReference{Name: "worker-0"}}, }, Status: nvidiacomv1alpha1.SnapshotStatus{BoundSnapshotContentName: &bound}, } diff --git a/deploy/operator/internal/controller/snapshot_reconciler.go b/deploy/operator/internal/controller/snapshot_reconciler.go index 973fe0b1f64f..11ea87a09672 100644 --- a/deploy/operator/internal/controller/snapshot_reconciler.go +++ b/deploy/operator/internal/controller/snapshot_reconciler.go @@ -29,7 +29,6 @@ import ( "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" - "k8s.io/apimachinery/pkg/util/validation" "k8s.io/client-go/tools/cache" "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" @@ -115,11 +114,14 @@ func (sr *SnapshotReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( return ctrl.Result{RequeueAfter: jitteredBackoff(snapshotPodResolveBackoffBase)}, nil } - contentName := snapshotContentName(snap.Spec.CheckpointID) - if errs := validation.IsDNS1123Subdomain(contentName); len(errs) > 0 || len(contentName) > maxResourceNameLength { - return sr.failSnapshot(ctx, snap, "InvalidContentName", - fmt.Errorf("composed SnapshotContent name %q is invalid: too long or not a DNS subdomain", contentName)) + // The checkpoint ID is carried as a label (set by the DynamoCheckpoint controller), + // not a spec field; the agent independently reads it from the source pod. 
+ id := snap.Labels[snapshotprotocol.CheckpointIDLabel] + if id == "" { + return sr.failSnapshot(ctx, snap, "MissingCheckpointID", + fmt.Errorf("snapshot %q missing %s label", snap.Name, snapshotprotocol.CheckpointIDLabel)) } + contentName := snapshotContentName(id) content, err := sr.ensureSnapshotContent(ctx, snap, contentName, pod) if err != nil { @@ -260,7 +262,18 @@ func (sr *SnapshotReconciler) handleDelete(ctx context.Context, snap *nvidiacomv return ctrl.Result{}, nil } - contentName := snapshotContentName(snap.Spec.CheckpointID) + // Without a checkpoint-id label no SnapshotContent could have been bound; drop the + // finalizer rather than misroute a delete to a wrongly-named object. + id := snap.Labels[snapshotprotocol.CheckpointIDLabel] + if id == "" { + controllerutil.RemoveFinalizer(snap, snapshotFinalizer) + if err := sr.Update(ctx, snap); err != nil { + return ctrl.Result{}, fmt.Errorf("remove snapshot finalizer: %w", err) + } + return ctrl.Result{}, nil + } + + contentName := snapshotContentName(id) content := &nvidiacomv1alpha1.SnapshotContent{ObjectMeta: metav1.ObjectMeta{Name: contentName}} if err := sr.Delete(ctx, content); err != nil && !apierrors.IsNotFound(err) { return ctrl.Result{}, fmt.Errorf("delete SnapshotContent %q: %w", contentName, err) diff --git a/deploy/operator/internal/controller/snapshot_reconciler_test.go b/deploy/operator/internal/controller/snapshot_reconciler_test.go index f4e2f06429c6..1790d4e99c00 100644 --- a/deploy/operator/internal/controller/snapshot_reconciler_test.go +++ b/deploy/operator/internal/controller/snapshot_reconciler_test.go @@ -19,7 +19,6 @@ package controller import ( "context" - "strings" "testing" "github.com/stretchr/testify/assert" @@ -59,15 +58,14 @@ func makeSnapshotReconciler(s *runtime.Scheme, objs ...client.Object) *SnapshotR func makeSnapshotForReconcile(checkpointID, podName string) *nvidiacomv1alpha1.Snapshot { return &nvidiacomv1alpha1.Snapshot{ ObjectMeta: metav1.ObjectMeta{ - Name: 
"snapshot-" + checkpointID, - Namespace: "inference", - UID: types.UID("snap-uid"), - Finalizers: []string{snapshotFinalizer}, - Annotations: map[string]string{snapshotprotocol.CheckpointArtifactVersionAnnotation: "3"}, + Name: "snapshot-" + checkpointID, + Namespace: "inference", + UID: types.UID("snap-uid"), + Finalizers: []string{snapshotFinalizer}, + Labels: map[string]string{snapshotprotocol.CheckpointIDLabel: checkpointID}, }, Spec: nvidiacomv1alpha1.SnapshotSpec{ - CheckpointID: checkpointID, - Source: nvidiacomv1alpha1.SnapshotSource{PodRef: nvidiacomv1alpha1.PodReference{Name: podName}}, + Source: nvidiacomv1alpha1.SnapshotSource{PodRef: nvidiacomv1alpha1.PodReference{Name: podName}}, }, } } @@ -186,16 +184,10 @@ func TestSnapshotReconciler_RescheduleFailsSnapshot(t *testing.T) { assert.Equal(t, "PodRescheduled", cond.Reason) } -func TestSnapshotReconciler_ComposedNameTooLongFails(t *testing.T) { +func TestSnapshotReconciler_MissingCheckpointIDLabelFails(t *testing.T) { s := snapshotReconcilerScheme() - longID := strings.Repeat("a", 250) // "snapshotcontent-" + 250 = 266 > 253 - snap := &nvidiacomv1alpha1.Snapshot{ - ObjectMeta: metav1.ObjectMeta{Name: "snapshot-x", Namespace: "inference", Finalizers: []string{snapshotFinalizer}}, - Spec: nvidiacomv1alpha1.SnapshotSpec{ - CheckpointID: longID, - Source: nvidiacomv1alpha1.SnapshotSource{PodRef: nvidiacomv1alpha1.PodReference{Name: "worker-0"}}, - }, - } + snap := makeSnapshotForReconcile("abc123", "worker-0") + delete(snap.Labels, snapshotprotocol.CheckpointIDLabel) r := makeSnapshotReconciler(s, snap, scheduledPod("worker-0", "node-a")) reconcileSnapshot(t, r, snap.Name) @@ -204,7 +196,26 @@ func TestSnapshotReconciler_ComposedNameTooLongFails(t *testing.T) { require.NoError(t, r.Get(context.Background(), types.NamespacedName{Namespace: "inference", Name: snap.Name}, updated)) cond := meta.FindStatusCondition(updated.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) require.NotNil(t, cond) - 
assert.Equal(t, "InvalidContentName", cond.Reason) + assert.Equal(t, "MissingCheckpointID", cond.Reason) +} + +func TestSnapshotReconciler_DeleteWithoutLabelDropsFinalizer(t *testing.T) { + s := snapshotReconcilerScheme() + now := metav1.Now() + snap := makeSnapshotForReconcile("abc123", "worker-0") + snap.DeletionTimestamp = &now + delete(snap.Labels, snapshotprotocol.CheckpointIDLabel) + r := makeSnapshotReconciler(s, snap) + + reconcileSnapshot(t, r, snap.Name) + + gone := &nvidiacomv1alpha1.Snapshot{} + err := r.Get(context.Background(), types.NamespacedName{Namespace: "inference", Name: snap.Name}, gone) + if err == nil { + assert.False(t, controllerutil.ContainsFinalizer(gone, snapshotFinalizer)) + } else { + assert.True(t, apierrors.IsNotFound(err)) + } } func TestSnapshotReconciler_CascadeDelete(t *testing.T) { From 99ff47a820b21317f0294fa0a5e5f801c84c233f Mon Sep 17 00:00:00 2001 From: Ron Kahn Date: Wed, 17 Jun 2026 11:39:53 +0300 Subject: [PATCH 11/14] refactor(snapshot): drop agent content-name id cross-check The agent read the checkpoint id from the source pod label but also reverse-parsed it out of the SnapshotContent name and failed on mismatch. That re-introduced the name-encoding coupling removed from Snapshot.spec. The pod is the sole source of truth; the content name is now opaque. Naming stays the operator's producer-side concern. 
Signed-off-by: Ron Kahn --- .../internal/controller/snapshotcontent.go | 10 ++------ .../controller/snapshotcontent_test.go | 24 ++++++++++++------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/deploy/snapshot/internal/controller/snapshotcontent.go b/deploy/snapshot/internal/controller/snapshotcontent.go index 91c72f30c567..d40c3e085736 100644 --- a/deploy/snapshot/internal/controller/snapshotcontent.go +++ b/deploy/snapshot/internal/controller/snapshotcontent.go @@ -89,20 +89,14 @@ func (w *NodeController) reconcileSnapshotContent(ctx context.Context, name stri return } - // Capture parameters come from the source pod. The checkpoint ID is the pod label, and it - // must agree with the ID embedded in the work order name (snapshotcontent-). + // Capture parameters come from the source pod, which is the single source of truth. The + // checkpoint ID is the pod label; the work order name is treated as opaque (never parsed). id := strings.TrimSpace(pod.Labels[snapshotprotocol.CheckpointIDLabel]) if id == "" { w.writeFailed(ctx, content, "MissingCheckpointID", fmt.Errorf("source pod %q missing %s label", pod.Name, snapshotprotocol.CheckpointIDLabel)) return } - expected := strings.TrimPrefix(content.Name, "snapshotcontent-") - if id != expected { - w.writeFailed(ctx, content, "CheckpointIDMismatch", - fmt.Errorf("source pod checkpoint id %q does not match work order id %q", id, expected)) - return - } containerName, err := snapshotprotocol.TargetContainersFromAnnotations(pod.Annotations, 1, 1) if err != nil { diff --git a/deploy/snapshot/internal/controller/snapshotcontent_test.go b/deploy/snapshot/internal/controller/snapshotcontent_test.go index 7f30ac8f0512..4e8aba6ff3b8 100644 --- a/deploy/snapshot/internal/controller/snapshotcontent_test.go +++ b/deploy/snapshot/internal/controller/snapshotcontent_test.go @@ -176,17 +176,25 @@ func TestReconcileSnapshotContent_MissingCheckpointIDFails(t *testing.T) { assert.Equal(t, "MissingCheckpointID", 
cond.Reason) } -func TestReconcileSnapshotContent_CheckpointIDMismatchFails(t *testing.T) { - // Work order name embeds "abc" but the source pod label says "xyz". - content := makeWorkOrder("snapshotcontent-abc", "node-a", "abc") - pod := makeSourcePod("xyz") - w := makeNodeController(t, &fakeCheckpointer{}, content, pod) +func TestReconcileSnapshotContent_OpaqueNameUsesPodLabel(t *testing.T) { + // The work order name does not encode the pod's checkpoint id: the name is opaque and the + // pod label is the sole source of truth. Capture must proceed using the pod label ("abc"). + content := makeWorkOrder("snapshotcontent-unrelated-name", "node-a", "abc") + pod := makeSourcePod("abc") + fc := &fakeCheckpointer{} + w := makeNodeController(t, fc, content, pod) + w.runtime = &fakeRuntime{resolveContainerPID: 7} w.reconcileSnapshotContent(context.Background(), content.Name) + require.Eventually(t, fc.wasCalled, time.Second, 5*time.Millisecond) + + // The checkpoint id and destination come from the pod label, not the work order name. 
+ params := fc.lastParams() + assert.Equal(t, "abc", params.CheckpointID) + assert.Equal(t, filepath.Join(w.config.Storage.BasePath, "abc", "versions", "1"), params.HostPath) + got := getContent(t, w, content.Name) - cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) - require.NotNil(t, cond) - assert.Equal(t, "CheckpointIDMismatch", cond.Reason) + require.NotNil(t, meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionReady)) } func TestReconcileSnapshotContent_ResumeWritesReady(t *testing.T) { From 739ca31cbdb84e03bdf262e30ee5bf2bc50960ce Mon Sep 17 00:00:00 2001 From: Ron Kahn Date: Wed, 17 Jun 2026 17:43:57 +0300 Subject: [PATCH 12/14] feat(snapshot): unstick pod on failed checkpoint container Restore the pre-migration active unstick in the agent capture path: when a checkpoint container exits non-zero, SIGKILL the source pod's still-running containers so a quiesced/CUDA-locked workload cannot hang forever, then fail the SnapshotContent. The kill runs while holding the in-flight key, so it never races a live dump. 
Signed-off-by: Ron Kahn --- .../internal/controller/snapshotcontent.go | 61 +++++++++++++++++++ .../controller/snapshotcontent_test.go | 49 +++++++++++++++ 2 files changed, 110 insertions(+) diff --git a/deploy/snapshot/internal/controller/snapshotcontent.go b/deploy/snapshot/internal/controller/snapshotcontent.go index d40c3e085736..874d72ce7860 100644 --- a/deploy/snapshot/internal/controller/snapshotcontent.go +++ b/deploy/snapshot/internal/controller/snapshotcontent.go @@ -5,6 +5,7 @@ package controller import ( "context" + "errors" "fmt" "os" "strings" @@ -89,6 +90,14 @@ func (w *NodeController) reconcileSnapshotContent(ctx context.Context, name stri return } + // Active unstick: if any checkpoint container has already exited non-zero, force-terminate + // the pod's still-running containers so a quiesced/CUDA-locked workload cannot hang forever, + // and fail the work order. This runs while we hold the in-flight key, so it can never race a + // live dump (a dump in flight means tryAcquire above would have returned). + if w.failCheckpointOnContainerExit(ctx, content, pod) { + return + } + // Capture parameters come from the source pod, which is the single source of truth. The // checkpoint ID is the pod label; the work order name is treated as opaque (never parsed). id := strings.TrimSpace(pod.Labels[snapshotprotocol.CheckpointIDLabel]) @@ -226,6 +235,58 @@ func (w *NodeController) resolveSourcePod(ctx context.Context, content *nvidiaco return pod, true } +// failCheckpointOnContainerExit fails the work order and force-terminates the source pod's +// still-running containers when any checkpoint container has terminated non-zero. It returns +// true when a failure was handled and the caller must stop. Init containers +// (pod.Status.InitContainerStatuses) are intentionally out of scope. 
+func (w *NodeController) failCheckpointOnContainerExit(ctx context.Context, content *nvidiacomv1alpha1.SnapshotContent, pod *corev1.Pod) bool { + var failed *corev1.ContainerStatus + for i := range pod.Status.ContainerStatuses { + cs := &pod.Status.ContainerStatuses[i] + if cs.State.Terminated != nil && cs.State.Terminated.ExitCode != 0 { + failed = cs + break + } + } + if failed == nil { + return false + } + + term := failed.State.Terminated + message := fmt.Sprintf("checkpoint container %q terminated with exit code %d", failed.Name, term.ExitCode) + if term.Reason != "" { + message = fmt.Sprintf("%s: %s", message, term.Reason) + } + logger := w.log.WithValues("content", content.Name, "container", failed.Name) + logger.Info("Checkpoint container failed", "exit_code", term.ExitCode, "reason", term.Reason) + emitPodEvent(ctx, w.clientset, logger, pod, "snapshot", corev1.EventTypeWarning, "CheckpointFailed", message) + w.killRunningContainers(ctx, logger, pod, fmt.Sprintf("checkpoint container %s failed", failed.Name)) + w.writeFailed(ctx, content, "CheckpointContainerFailed", errors.New(message)) + return true +} + +// killRunningContainers SIGKILLs every still-running container in the pod, resolving each +// container's host PID through the node runtime. Best-effort: resolution and signal errors are +// logged and skipped so one stuck container does not block terminating the rest. 
+func (w *NodeController) killRunningContainers(ctx context.Context, logger logr.Logger, pod *corev1.Pod, reason string) { + for _, cs := range pod.Status.ContainerStatuses { + if cs.State.Running == nil || cs.ContainerID == "" { + continue + } + containerID := snapshotruntime.StripCRIScheme(cs.ContainerID) + resolveCtx, cancel := context.WithTimeout(ctx, containerResolveAttemptTimeout) + pid, _, err := w.runtime.ResolveContainer(resolveCtx, containerID) + cancel() + if err != nil { + logger.Error(err, "Failed to resolve running checkpoint container", "container", cs.Name) + continue + } + if err := snapshotruntime.SendSignalToPID(logger, pid, syscall.SIGKILL, reason); err != nil { + logger.Error(err, "Failed to signal running checkpoint container", "container", cs.Name) + } + } +} + // writeReady patches status with the Ready condition. func (w *NodeController) writeReady(ctx context.Context, content *nvidiacomv1alpha1.SnapshotContent) { patch := client.MergeFrom(content.DeepCopy()) diff --git a/deploy/snapshot/internal/controller/snapshotcontent_test.go b/deploy/snapshot/internal/controller/snapshotcontent_test.go index 4e8aba6ff3b8..7d0de2bfd6b6 100644 --- a/deploy/snapshot/internal/controller/snapshotcontent_test.go +++ b/deploy/snapshot/internal/controller/snapshotcontent_test.go @@ -176,6 +176,55 @@ func TestReconcileSnapshotContent_MissingCheckpointIDFails(t *testing.T) { assert.Equal(t, "MissingCheckpointID", cond.Reason) } +func TestReconcileSnapshotContent_FailedContainerUnsticksAndFails(t *testing.T) { + content := makeWorkOrder("snapshotcontent-abc", "node-a", "abc") + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "worker-0", + Namespace: "inference", + UID: types.UID("pod-uid"), + Labels: map[string]string{snapshotprotocol.CheckpointIDLabel: "abc"}, + Annotations: map[string]string{snapshotprotocol.TargetContainersAnnotation: "main"}, + }, + Spec: corev1.PodSpec{NodeName: "node-a"}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, 
+ ContainerStatuses: []corev1.ContainerStatus{ + {Name: "main", State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}}, ContainerID: "containerd://main-id"}, + {Name: "helper", State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{ExitCode: 1, Reason: "Error"}}, ContainerID: "containerd://helper-id"}, + }, + }, + } + fc := &fakeCheckpointer{} + rt := &fakeRuntime{} // PID 0 → ResolveContainer errors → SendSignalToPID skipped (no real signal sent) + w := makeNodeController(t, fc, content, pod) + w.runtime = rt + + w.reconcileSnapshotContent(context.Background(), content.Name) + + got := getContent(t, w, content.Name) + cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) + require.NotNil(t, cond) + assert.Equal(t, "CheckpointContainerFailed", cond.Reason) + assert.Contains(t, cond.Message, "helper") + assert.True(t, sawEventReason(w.clientset.(*k8sfake.Clientset), "CheckpointFailed")) + // Only the still-running sibling is resolved for the SIGKILL; the dead container is skipped. + assert.Equal(t, []string{"main-id"}, rt.resolvedContainerIDs) + assert.False(t, fc.wasCalled()) + assert.Empty(t, w.inFlight) +} + +func TestFailCheckpointOnContainerExit_IgnoresCleanExit(t *testing.T) { + w := makeNodeController(t, &fakeCheckpointer{}) + pod := &corev1.Pod{Status: corev1.PodStatus{ContainerStatuses: []corev1.ContainerStatus{ + {Name: "main", State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}}}, + {Name: "helper", State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{ExitCode: 0}}}, + }}} + + handled := w.failCheckpointOnContainerExit(context.Background(), &nvidiacomv1alpha1.SnapshotContent{}, pod) + assert.False(t, handled) +} + func TestReconcileSnapshotContent_OpaqueNameUsesPodLabel(t *testing.T) { // The work order name does not encode the pod's checkpoint id: the name is opaque and the // pod label is the sole source of truth. 
Capture must proceed using the pod label ("abc"). From f342d36b1a229359385db1da650969b81d02b2b6 Mon Sep 17 00:00:00 2001 From: Ron Kahn Date: Wed, 17 Jun 2026 18:30:07 +0300 Subject: [PATCH 13/14] feat(snapshot): drive capture on source-pod events via podRef index The capture path was driven only by the SnapshotContent informer (10s resync), so pod status changes (a checkpoint container crashing, the target becoming ready) were only acted on at resync. Add a source-pod informer plus a podRef index on the content informer: a pod event maps O(1) to its work order and re-drives reconcileSnapshotContent, choosing the oldest non-terminal content. The resync stays as a backstop. Signed-off-by: Ron Kahn --- .../internal/controller/controller.go | 118 +++++++++++++ .../controller/snapshotcontent_test.go | 163 ++++++++++++++++++ 2 files changed, 281 insertions(+) diff --git a/deploy/snapshot/internal/controller/controller.go b/deploy/snapshot/internal/controller/controller.go index 376edcb2ca9a..cef39cf0506a 100644 --- a/deploy/snapshot/internal/controller/controller.go +++ b/deploy/snapshot/internal/controller/controller.go @@ -19,6 +19,7 @@ import ( corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" @@ -56,6 +57,10 @@ type NodeController struct { inFlight map[string]struct{} inFlightMu sync.Mutex + // contentIndexer is the SnapshotContent informer's indexer, indexed by source pod + // (podRefIndex). The source-pod informer uses it to map a pod event back to its work order. 
+ contentIndexer cache.Indexer + stopCh chan struct{} } @@ -190,6 +195,12 @@ func (w *NodeController) Run(ctx context.Context) error { }, ) contentInformer := dynFactory.ForResource(snapshotContentGVR).Informer() + // Index work orders by their source pod so a source-pod event maps back to its + // SnapshotContent in O(1). Must be registered before the informer starts. + if err := contentInformer.AddIndexers(cache.Indexers{podRefIndex: podRefIndexFunc}); err != nil { + return fmt.Errorf("failed to add snapshot-content podRef indexer: %w", err) + } + w.contentIndexer = contentInformer.GetIndexer() if _, err := contentInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { if name, ok := contentNameFromInformerObj(obj); ok { @@ -207,6 +218,37 @@ func (w *NodeController) Run(ctx context.Context) error { go dynFactory.Start(w.stopCh) syncFuncs = append(syncFuncs, contentInformer.HasSynced) + // Source-pod informer: capture-source pods carry CheckpointSourceLabel=true. A pod status + // change (a checkpoint container crashing, or the target becoming ready) does not touch the + // SnapshotContent, so without this trigger it would only be acted on at the content informer's + // resync. It needs its own factory: its selector is disjoint from the restore informer's. + sourceSelector := labels.SelectorFromSet(labels.Set{snapshotprotocol.CheckpointSourceLabel: "true"}).String() + sourceFactoryOpts := append([]informers.SharedInformerOption{ + informers.WithTweakListOptions(func(opts *metav1.ListOptions) { + opts.LabelSelector = sourceSelector + }), + }, nsOptions...) 
+ sourceFactory := informers.NewSharedInformerFactoryWithOptions( + w.clientset, 30*time.Second, sourceFactoryOpts..., + ) + sourceInformer := sourceFactory.Core().V1().Pods().Informer() + if _, err := sourceInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: func(obj interface{}) { + if pod, ok := podFromInformerObj(obj); ok { + w.enqueueContentForSourcePod(ctx, pod) + } + }, + UpdateFunc: func(_, newObj interface{}) { + if pod, ok := podFromInformerObj(newObj); ok { + w.enqueueContentForSourcePod(ctx, pod) + } + }, + }); err != nil { + return fmt.Errorf("failed to add source-pod informer handler: %w", err) + } + go sourceFactory.Start(w.stopCh) + syncFuncs = append(syncFuncs, sourceInformer.HasSynced) + if !cache.WaitForCacheSync(w.stopCh, syncFuncs...) { return fmt.Errorf("failed to sync informer caches") } @@ -571,6 +613,82 @@ func (w *NodeController) release(podKey string) { delete(w.inFlight, podKey) } +// podRefIndex is the SnapshotContent informer index keyed by source pod ("/"). +const podRefIndex = "byPodRef" + +// podRefIndexFunc indexes a SnapshotContent by its source pod ("/"). +// It runs against the dynamic informer's *unstructured.Unstructured objects; an unexpected type or a +// missing field yields no index entry (nil) rather than an error, so it never poisons the index. +func podRefIndexFunc(obj interface{}) ([]string, error) { + u, ok := obj.(*unstructured.Unstructured) + if !ok { + return nil, nil + } + ns, _, _ := unstructured.NestedString(u.Object, "spec", "snapshotRef", "namespace") + name, _, _ := unstructured.NestedString(u.Object, "spec", "source", "podRef", "name") + if ns == "" || name == "" { + return nil, nil + } + return []string{ns + "/" + name}, nil +} + +// contentFromInformerObj converts a dynamic informer object (or its DeletedFinalStateUnknown +// tombstone) to a typed SnapshotContent. It returns false on an unexpected type. 
+func contentFromInformerObj(obj interface{}) (*nvidiacomv1alpha1.SnapshotContent, bool) { + if tombstone, ok := obj.(cache.DeletedFinalStateUnknown); ok { + obj = tombstone.Obj + } + u, ok := obj.(*unstructured.Unstructured) + if !ok { + return nil, false + } + content := &nvidiacomv1alpha1.SnapshotContent{} + if err := runtime.DefaultUnstructuredConverter.FromUnstructured(u.Object, content); err != nil { + return nil, false + } + return content, true +} + +// chooseActiveContent returns the name of the oldest non-terminal SnapshotContent among the indexed +// objects (oldest first by CreationTimestamp, ties broken by Name), or "" when none are active. +// Driving the oldest until it finishes gives deterministic, stable selection across pod events. +func chooseActiveContent(objs []interface{}) string { + var chosen *nvidiacomv1alpha1.SnapshotContent + for _, obj := range objs { + content, ok := contentFromInformerObj(obj) + if !ok || isContentTerminal(content) { + continue + } + if chosen == nil || + content.CreationTimestamp.Before(&chosen.CreationTimestamp) || + (content.CreationTimestamp.Equal(&chosen.CreationTimestamp) && content.Name < chosen.Name) { + chosen = content + } + } + if chosen == nil { + return "" + } + return chosen.Name +} + +// enqueueContentForSourcePod maps a source-pod event back to its SnapshotContent and re-drives the +// capture reconcile, so pod status changes are caught without waiting for the content resync. The +// content informer still independently reconciles every work order, so a non-chosen content is +// never starved (worst case it is driven at the resync cadence). 
+func (w *NodeController) enqueueContentForSourcePod(ctx context.Context, pod *corev1.Pod) { + if pod.Spec.NodeName != w.config.NodeName { + return + } + objs, err := w.contentIndexer.ByIndex(podRefIndex, pod.Namespace+"/"+pod.Name) + if err != nil { + w.log.Error(err, "Failed to look up SnapshotContent by source pod", "pod", fmt.Sprintf("%s/%s", pod.Namespace, pod.Name)) + return + } + if name := chooseActiveContent(objs); name != "" { + w.reconcileSnapshotContent(ctx, name) + } +} + func (w *NodeController) checkpointLocationsFromPod(pod *corev1.Pod, checkpointID string, hostPID int) (checkpointLocations, error) { rawBasePath, hasBasePathAnnotation := pod.Annotations[snapshotprotocol.CheckpointStorageBasePathAnnotation] basePath := strings.TrimSpace(rawBasePath) diff --git a/deploy/snapshot/internal/controller/snapshotcontent_test.go b/deploy/snapshot/internal/controller/snapshotcontent_test.go index 7d0de2bfd6b6..e8e6314f0d3a 100644 --- a/deploy/snapshot/internal/controller/snapshotcontent_test.go +++ b/deploy/snapshot/internal/controller/snapshotcontent_test.go @@ -18,9 +18,11 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" k8sfake "k8s.io/client-go/kubernetes/fake" + "k8s.io/client-go/tools/cache" "sigs.k8s.io/controller-runtime/pkg/client" crfake "sigs.k8s.io/controller-runtime/pkg/client/fake" @@ -403,3 +405,164 @@ func TestRunCheckpoint_WritesFailedOnError(t *testing.T) { require.NotNil(t, cond) assert.Equal(t, "CheckpointFailed", cond.Reason) } + +// mustUnstructured converts a typed object to the *unstructured.Unstructured the dynamic informer +// (and thus the podRef index) stores. 
+func mustUnstructured(t *testing.T, obj runtime.Object) *unstructured.Unstructured { + t.Helper() + m, err := runtime.DefaultUnstructuredConverter.ToUnstructured(obj) + require.NoError(t, err) + return &unstructured.Unstructured{Object: m} +} + +// contentForWorker0 builds a SnapshotContent referencing pod inference/worker-0 with a given +// creation time, optionally carrying a terminal condition (SnapshotConditionReady/Failed). +func contentForWorker0(name string, created metav1.Time, terminal string) *nvidiacomv1alpha1.SnapshotContent { + c := &nvidiacomv1alpha1.SnapshotContent{ + ObjectMeta: metav1.ObjectMeta{Name: name, CreationTimestamp: created}, + Spec: nvidiacomv1alpha1.SnapshotContentSpec{ + SnapshotRef: nvidiacomv1alpha1.SnapshotReference{Namespace: "inference", Name: "snapshot-" + name}, + Source: nvidiacomv1alpha1.SnapshotContentSource{PodRef: nvidiacomv1alpha1.PodReference{Name: "worker-0", UID: types.UID("pod-uid")}, NodeName: "node-a"}, + }, + } + if terminal != "" { + meta.SetStatusCondition(&c.Status.Conditions, metav1.Condition{Type: terminal, Status: metav1.ConditionTrue, Reason: "Done"}) + } + return c +} + +func TestPodRefIndexFunc(t *testing.T) { + keys, err := podRefIndexFunc(mustUnstructured(t, contentForWorker0("snapshotcontent-abc", metav1.Unix(1000, 0), ""))) + require.NoError(t, err) + assert.Equal(t, []string{"inference/worker-0"}, keys) +} + +func TestPodRefIndexFunc_MissingFieldsOrWrongType(t *testing.T) { + keys, err := podRefIndexFunc(&unstructured.Unstructured{Object: map[string]interface{}{"spec": map[string]interface{}{}}}) + require.NoError(t, err) + assert.Nil(t, keys) + + keys, err = podRefIndexFunc("not-unstructured") + require.NoError(t, err) + assert.Nil(t, keys) +} + +func TestContentFromInformerObj(t *testing.T) { + u := mustUnstructured(t, contentForWorker0("snapshotcontent-abc", metav1.Unix(1000, 0), "")) + + c, ok := contentFromInformerObj(u) + require.True(t, ok) + assert.Equal(t, "snapshotcontent-abc", c.Name) + + 
c, ok = contentFromInformerObj(cache.DeletedFinalStateUnknown{Key: "k", Obj: u}) + require.True(t, ok) + assert.Equal(t, "snapshotcontent-abc", c.Name) + + _, ok = contentFromInformerObj(cache.DeletedFinalStateUnknown{Key: "k", Obj: "bad"}) + assert.False(t, ok) + _, ok = contentFromInformerObj("bad") + assert.False(t, ok) +} + +func TestChooseActiveContent_OldestNonTerminalWins(t *testing.T) { + // "snapshotcontent-a" sorts first by name but is newer; oldest-by-CreationTimestamp must win. + newer := mustUnstructured(t, contentForWorker0("snapshotcontent-a", metav1.Unix(2000, 0), "")) + older := mustUnstructured(t, contentForWorker0("snapshotcontent-b", metav1.Unix(1000, 0), "")) + assert.Equal(t, "snapshotcontent-b", chooseActiveContent([]interface{}{newer, older})) +} + +func TestChooseActiveContent_SkipsTerminalAndTieBreaksByName(t *testing.T) { + terminal := mustUnstructured(t, contentForWorker0("snapshotcontent-old", metav1.Unix(1000, 0), nvidiacomv1alpha1.SnapshotConditionReady)) + tieA := mustUnstructured(t, contentForWorker0("snapshotcontent-a", metav1.Unix(2000, 0), "")) + tieB := mustUnstructured(t, contentForWorker0("snapshotcontent-b", metav1.Unix(2000, 0), "")) + assert.Equal(t, "snapshotcontent-a", chooseActiveContent([]interface{}{terminal, tieB, tieA})) +} + +func TestChooseActiveContent_AllTerminalReturnsEmpty(t *testing.T) { + ready := mustUnstructured(t, contentForWorker0("snapshotcontent-a", metav1.Unix(1000, 0), nvidiacomv1alpha1.SnapshotConditionReady)) + failed := mustUnstructured(t, contentForWorker0("snapshotcontent-b", metav1.Unix(2000, 0), nvidiacomv1alpha1.SnapshotConditionFailed)) + assert.Equal(t, "", chooseActiveContent([]interface{}{ready, failed})) +} + +// podWithFailedSibling builds the inference/worker-0 source pod with the target Running and a +// sibling Terminated non-zero, so a reconcile triggers the unstick. 
+func podWithFailedSibling() *corev1.Pod { + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "worker-0", + Namespace: "inference", + UID: types.UID("pod-uid"), + Labels: map[string]string{snapshotprotocol.CheckpointIDLabel: "abc"}, + Annotations: map[string]string{snapshotprotocol.TargetContainersAnnotation: "main"}, + }, + Spec: corev1.PodSpec{NodeName: "node-a"}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + {Name: "main", State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}}, ContainerID: "containerd://main-id"}, + {Name: "helper", State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{ExitCode: 1, Reason: "Error"}}, ContainerID: "containerd://helper-id"}, + }, + }, + } +} + +func seedIndex(t *testing.T, contents ...*nvidiacomv1alpha1.SnapshotContent) cache.Indexer { + t.Helper() + idx := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{podRefIndex: podRefIndexFunc}) + for _, c := range contents { + require.NoError(t, idx.Add(mustUnstructured(t, c))) + } + return idx +} + +func TestEnqueueContentForSourcePod_TriggersUnstick(t *testing.T) { + content := makeWorkOrder("snapshotcontent-abc", "node-a", "abc") + content.CreationTimestamp = metav1.Unix(1000, 0) + pod := podWithFailedSibling() + fc := &fakeCheckpointer{} + rt := &fakeRuntime{} + w := makeNodeController(t, fc, content, pod) + w.runtime = rt + w.contentIndexer = seedIndex(t, content) + + w.enqueueContentForSourcePod(context.Background(), pod) + + got := getContent(t, w, content.Name) + cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) + require.NotNil(t, cond) + assert.Equal(t, "CheckpointContainerFailed", cond.Reason) + assert.Equal(t, []string{"main-id"}, rt.resolvedContainerIDs) + assert.False(t, fc.wasCalled()) +} + +func TestEnqueueContentForSourcePod_PodNotIndexedNoOp(t *testing.T) { + content := makeWorkOrder("snapshotcontent-abc", 
"node-a", "abc") + pod := podWithFailedSibling() + w := makeNodeController(t, &fakeCheckpointer{}, content, pod) + w.contentIndexer = seedIndex(t) // empty index + + w.enqueueContentForSourcePod(context.Background(), pod) + assert.Empty(t, getContent(t, w, content.Name).Status.Conditions) +} + +func TestEnqueueContentForSourcePod_OtherNodeNoOp(t *testing.T) { + content := makeWorkOrder("snapshotcontent-abc", "node-a", "abc") + pod := podWithFailedSibling() + pod.Spec.NodeName = "node-b" + w := makeNodeController(t, &fakeCheckpointer{}, content, pod) + w.contentIndexer = seedIndex(t, content) + + w.enqueueContentForSourcePod(context.Background(), pod) + assert.Empty(t, getContent(t, w, content.Name).Status.Conditions) +} + +func TestEnqueueContentForSourcePod_IndexErrorNoOp(t *testing.T) { + content := makeWorkOrder("snapshotcontent-abc", "node-a", "abc") + pod := podWithFailedSibling() + w := makeNodeController(t, &fakeCheckpointer{}, content, pod) + // Indexer without podRefIndex registered → ByIndex returns an error; enqueue must log and no-op. + w.contentIndexer = cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{}) + + w.enqueueContentForSourcePod(context.Background(), pod) + assert.Empty(t, getContent(t, w, content.Name).Status.Conditions) +} From 2845bad26bbe28748109cf7dafaee6c928d448e1 Mon Sep 17 00:00:00 2001 From: Ron Kahn Date: Mon, 22 Jun 2026 09:50:33 +0300 Subject: [PATCH 14/14] refactor(snapshot): single pod-driven capture path + pre-bind gate Both informers funneled into one content-centric reconcile that mixed pre-bind validation and capture. Split into reconcileSourcePod (the one capture flow, pod-driven, picks the oldest active work order) and a thin reconcileSnapshotContent pre-bind gate that validates the source pod (NotFound/UID/gone -> writeFailed) and triggers the pod reconcile. The unstick runs before the gone-guard so a crashed container is always SIGKILLed. classifySourcePod replaces resolveSourcePod. 
Signed-off-by: Ron Kahn --- .../internal/controller/controller.go | 22 +--- .../internal/controller/snapshotcontent.go | 116 +++++++++++++----- .../controller/snapshotcontent_test.go | 77 +++++++++--- 3 files changed, 142 insertions(+), 73 deletions(-) diff --git a/deploy/snapshot/internal/controller/controller.go b/deploy/snapshot/internal/controller/controller.go index cef39cf0506a..9535dd8be512 100644 --- a/deploy/snapshot/internal/controller/controller.go +++ b/deploy/snapshot/internal/controller/controller.go @@ -235,12 +235,12 @@ func (w *NodeController) Run(ctx context.Context) error { if _, err := sourceInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { if pod, ok := podFromInformerObj(obj); ok { - w.enqueueContentForSourcePod(ctx, pod) + w.reconcileSourcePod(ctx, pod) } }, UpdateFunc: func(_, newObj interface{}) { if pod, ok := podFromInformerObj(newObj); ok { - w.enqueueContentForSourcePod(ctx, pod) + w.reconcileSourcePod(ctx, pod) } }, }); err != nil { @@ -671,24 +671,6 @@ func chooseActiveContent(objs []interface{}) string { return chosen.Name } -// enqueueContentForSourcePod maps a source-pod event back to its SnapshotContent and re-drives the -// capture reconcile, so pod status changes are caught without waiting for the content resync. The -// content informer still independently reconciles every work order, so a non-chosen content is -// never starved (worst case it is driven at the resync cadence). 
-func (w *NodeController) enqueueContentForSourcePod(ctx context.Context, pod *corev1.Pod) { - if pod.Spec.NodeName != w.config.NodeName { - return - } - objs, err := w.contentIndexer.ByIndex(podRefIndex, pod.Namespace+"/"+pod.Name) - if err != nil { - w.log.Error(err, "Failed to look up SnapshotContent by source pod", "pod", fmt.Sprintf("%s/%s", pod.Namespace, pod.Name)) - return - } - if name := chooseActiveContent(objs); name != "" { - w.reconcileSnapshotContent(ctx, name) - } -} - func (w *NodeController) checkpointLocationsFromPod(pod *corev1.Pod, checkpointID string, hostPID int) (checkpointLocations, error) { rawBasePath, hasBasePathAnnotation := pod.Annotations[snapshotprotocol.CheckpointStorageBasePathAnnotation] basePath := strings.TrimSpace(rawBasePath) diff --git a/deploy/snapshot/internal/controller/snapshotcontent.go b/deploy/snapshot/internal/controller/snapshotcontent.go index 874d72ce7860..c2b77cdaa879 100644 --- a/deploy/snapshot/internal/controller/snapshotcontent.go +++ b/deploy/snapshot/internal/controller/snapshotcontent.go @@ -47,11 +47,11 @@ type CheckpointParams struct { StartedAt time.Time } -// reconcileSnapshotContent drives one SnapshotContent work order through provenance -// checks, quiesce, dump, and the terminal status write. Capture parameters come from the -// source pod's labels/annotations, never from SnapshotContent metadata. It never mutates -// spec and writes status via Status().Patch only. There is no requeue mechanism here: a -// not-yet-Ready source pod is re-driven by the 10s SnapshotContent resync and pod events. +// reconcileSnapshotContent is the pre-bind gate for a SnapshotContent work order. It validates the +// source pod (existence and provenance) and, when the pod is valid, hands off to reconcileSourcePod +// — the single capture path. It never runs the capture flow itself. 
Driven by the content informer +// (Add/Update) and its 10s resync; the resync is the backstop that eventually writes a terminal +// failure for a work order whose source pod is gone. func (w *NodeController) reconcileSnapshotContent(ctx context.Context, name string) { logger := w.log.WithValues("content", name) @@ -68,12 +68,68 @@ func (w *NodeController) reconcileSnapshotContent(ctx context.Context, name stri if content.Spec.Source.NodeName != w.config.NodeName { return } - // Idempotency: terminal status means the work is done. if isContentTerminal(content) { return } + pod := &corev1.Pod{} + key := client.ObjectKey{Namespace: content.Spec.SnapshotRef.Namespace, Name: content.Spec.Source.PodRef.Name} + if err := w.client.Get(ctx, key, pod); err != nil { + if apierrors.IsNotFound(err) { + // The operator creates the SnapshotContent only after the source pod exists, and this + // is a linearizable (quorum) Get, so NotFound means the pod was deleted, not a + // creation race: fail the work order terminally. + w.writeFailed(ctx, content, "SourcePodNotFound", fmt.Errorf("source pod %q not found", key.String())) + return + } + logger.Error(err, "Failed to get source pod", "pod", key.String()) + return + } + if reason, msg := classifySourcePod(content, pod); reason != "" { + w.writeFailed(ctx, content, reason, errors.New(msg)) + return + } + + // Pod is valid: hand off to the single capture path. + w.reconcileSourcePod(ctx, pod) +} + +// reconcileSourcePod is the single capture path. It is driven by source-pod events and by +// reconcileSnapshotContent once the pod is validated. It selects the oldest active work order for +// the pod and drives the unstick + dump. Capture parameters come from the source pod, which is the +// single source of truth; it never mutates spec and writes status via Status().Patch only. 
The +// triggering content event (if any) may name a different work order than the one chosen here — the +// event is only a trigger; chooseActiveContent picks the oldest active SnapshotContent for the pod. +func (w *NodeController) reconcileSourcePod(ctx context.Context, pod *corev1.Pod) { + if pod.Spec.NodeName != w.config.NodeName { + return + } + if w.contentIndexer == nil { + return + } + objs, err := w.contentIndexer.ByIndex(podRefIndex, pod.Namespace+"/"+pod.Name) + if err != nil { + w.log.Error(err, "Failed to look up SnapshotContent by source pod", "pod", fmt.Sprintf("%s/%s", pod.Namespace, pod.Name)) + return + } + name := chooseActiveContent(objs) + if name == "" { + return + } + logger := w.log.WithValues("content", name) + + content := &nvidiacomv1alpha1.SnapshotContent{} + if err := w.client.Get(ctx, client.ObjectKey{Name: name}, content); err != nil { + if !apierrors.IsNotFound(err) { + logger.Error(err, "Failed to get SnapshotContent") + } + return + } + if isContentTerminal(content) { + return + } + key := content.Name if !w.tryAcquire(key) { return @@ -85,16 +141,18 @@ func (w *NodeController) reconcileSnapshotContent(ctx context.Context, name stri } }() - pod, ok := w.resolveSourcePod(ctx, content) - if !ok { + // Active unstick first: a non-zero container exit must SIGKILL the pod's still-running + // containers and fail the work order even when the pod is already Phase==Failed (which the + // gone-guard below would otherwise short-circuit). This runs while we hold the in-flight key, + // so it can never race a live dump (a dump in flight means tryAcquire above would have returned). + if w.failCheckpointOnContainerExit(ctx, content, pod) { return } - - // Active unstick: if any checkpoint container has already exited non-zero, force-terminate - // the pod's still-running containers so a quiesced/CUDA-locked workload cannot hang forever, - // and fail the work order. 
This runs while we hold the in-flight key, so it can never race a - // live dump (a dump in flight means tryAcquire above would have returned). - if w.failCheckpointOnContainerExit(ctx, content, pod) { + // Provenance/liveness guard. The terminal writeFailed for these is owned by + // reconcileSnapshotContent (pre-bind); here we only skip capture and let the content resync + // write the failure. + if reason, _ := classifySourcePod(content, pod); reason != "" { + logger.V(1).Info("Skipping capture; source pod not usable", "reason", reason, "pod", pod.Name) return } @@ -209,30 +267,20 @@ func (w *NodeController) runCheckpoint( w.writeReady(ctx, content) } -// resolveSourcePod loads the source pod and enforces UID provenance and pod liveness. -// It returns (nil, false) when the caller should stop (status already written or backoff). -func (w *NodeController) resolveSourcePod(ctx context.Context, content *nvidiacomv1alpha1.SnapshotContent) (*corev1.Pod, bool) { - pod := &corev1.Pod{} - key := client.ObjectKey{Namespace: content.Spec.SnapshotRef.Namespace, Name: content.Spec.Source.PodRef.Name} - if err := w.client.Get(ctx, key, pod); err != nil { - if apierrors.IsNotFound(err) { - // Pod not yet observed; the resync re-drives this work order. - return nil, false - } - w.log.Error(err, "Failed to get source pod", "content", content.Name, "pod", key.String()) - return nil, false - } +// classifySourcePod reports whether the source pod is unusable for capture, returning a terminal +// failure reason and message ("" reason means the pod is valid). It is pure: callers decide whether +// to writeFailed (reconcileSnapshotContent, pre-bind) or merely skip capture (reconcileSourcePod +// guard). Pod existence (NotFound) is handled by the caller, which holds the Get error. 
+func classifySourcePod(content *nvidiacomv1alpha1.SnapshotContent, pod *corev1.Pod) (string, string) { if content.Spec.Source.PodRef.UID != "" && pod.UID != content.Spec.Source.PodRef.UID { - w.writeFailed(ctx, content, "StalePodReference", - fmt.Errorf("source pod %q UID %q does not match work order UID %q", pod.Name, pod.UID, content.Spec.Source.PodRef.UID)) - return nil, false + return "StalePodReference", + fmt.Sprintf("source pod %q UID %q does not match work order UID %q", pod.Name, pod.UID, content.Spec.Source.PodRef.UID) } if pod.DeletionTimestamp != nil || pod.Status.Phase == corev1.PodFailed || pod.Status.Phase == corev1.PodSucceeded { - w.writeFailed(ctx, content, "SourcePodGone", - fmt.Errorf("source pod %q is no longer running (phase %s)", pod.Name, pod.Status.Phase)) - return nil, false + return "SourcePodGone", + fmt.Sprintf("source pod %q is no longer running (phase %s)", pod.Name, pod.Status.Phase) } - return pod, true + return "", "" } // failCheckpointOnContainerExit fails the work order and force-terminates the source pod's diff --git a/deploy/snapshot/internal/controller/snapshotcontent_test.go b/deploy/snapshot/internal/controller/snapshotcontent_test.go index e8e6314f0d3a..ab45818c26f9 100644 --- a/deploy/snapshot/internal/controller/snapshotcontent_test.go +++ b/deploy/snapshot/internal/controller/snapshotcontent_test.go @@ -71,19 +71,29 @@ func contentScheme(t *testing.T) *runtime.Scheme { return s } -// makeNodeController builds a NodeController wired to a fake typed client, runtime, and seam. +// makeNodeController builds a NodeController wired to a fake typed client, runtime, and seam. Any +// SnapshotContent in objs is also added to the podRef index (mirroring the content informer's +// cache) so the pod-driven reconcileSourcePod can resolve it; tests that need a different index +// state override w.contentIndexer after construction. 
func makeNodeController(t *testing.T, fc *fakeCheckpointer, objs ...client.Object) *NodeController { t.Helper() s := contentScheme(t) + idx := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{podRefIndex: podRefIndexFunc}) + for _, o := range objs { + if sc, ok := o.(*nvidiacomv1alpha1.SnapshotContent); ok { + require.NoError(t, idx.Add(mustUnstructured(t, sc))) + } + } w := &NodeController{ config: &snapshottypes.AgentConfig{NodeName: "node-a", Storage: snapshottypes.StorageSpec{Type: "pvc", BasePath: t.TempDir()}}, clientset: k8sfake.NewClientset(), client: crfake.NewClientBuilder().WithScheme(s).WithObjects(objs...). WithStatusSubresource(&nvidiacomv1alpha1.SnapshotContent{}).Build(), - runtime: &fakeRuntime{}, - log: logr.Discard(), - holderID: "snapshot-agent/test", - inFlight: make(map[string]struct{}), + runtime: &fakeRuntime{}, + log: logr.Discard(), + holderID: "snapshot-agent/test", + inFlight: make(map[string]struct{}), + contentIndexer: idx, } w.checkpointFn = fc.fn return w @@ -288,13 +298,44 @@ func TestReconcileSnapshotContent_PodMountResolvesContainerPID(t *testing.T) { assert.Equal(t, "ContainerChanged", cond.Reason) } -func TestReconcileSnapshotContent_PodNotFoundNoOp(t *testing.T) { +func TestReconcileSnapshotContent_PodNotFoundFails(t *testing.T) { content := makeWorkOrder("snapshotcontent-x", "node-a", "x") w := makeNodeController(t, &fakeCheckpointer{}, content) // no pod w.reconcileSnapshotContent(context.Background(), content.Name) got := getContent(t, w, content.Name) - assert.Empty(t, got.Status.Conditions) + cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) + require.NotNil(t, cond) + assert.Equal(t, "SourcePodNotFound", cond.Reason) +} + +func TestClassifySourcePod(t *testing.T) { + content := makeWorkOrder("snapshotcontent-x", "node-a", "x") // PodRef Name worker-0, UID pod-uid + running := func(uid string, phase corev1.PodPhase, deleting bool) *corev1.Pod { + p := &corev1.Pod{ 
+ ObjectMeta: metav1.ObjectMeta{Name: "worker-0", Namespace: "inference", UID: types.UID(uid)}, + Status: corev1.PodStatus{Phase: phase}, + } + if deleting { + now := metav1.Now() + p.DeletionTimestamp = &now + } + return p + } + + reason, _ := classifySourcePod(content, running("pod-uid", corev1.PodRunning, false)) + assert.Equal(t, "", reason) + + reason, _ = classifySourcePod(content, running("other-uid", corev1.PodRunning, false)) + assert.Equal(t, "StalePodReference", reason) + + for _, phase := range []corev1.PodPhase{corev1.PodFailed, corev1.PodSucceeded} { + reason, _ = classifySourcePod(content, running("pod-uid", phase, false)) + assert.Equal(t, "SourcePodGone", reason) + } + + reason, _ = classifySourcePod(content, running("pod-uid", corev1.PodRunning, true)) + assert.Equal(t, "SourcePodGone", reason) } func TestReconcileSnapshotContent_StalePodUIDFails(t *testing.T) { @@ -515,7 +556,7 @@ func seedIndex(t *testing.T, contents ...*nvidiacomv1alpha1.SnapshotContent) cac return idx } -func TestEnqueueContentForSourcePod_TriggersUnstick(t *testing.T) { +func TestReconcileSourcePod_TriggersUnstick(t *testing.T) { content := makeWorkOrder("snapshotcontent-abc", "node-a", "abc") content.CreationTimestamp = metav1.Unix(1000, 0) pod := podWithFailedSibling() @@ -523,9 +564,8 @@ func TestEnqueueContentForSourcePod_TriggersUnstick(t *testing.T) { rt := &fakeRuntime{} w := makeNodeController(t, fc, content, pod) w.runtime = rt - w.contentIndexer = seedIndex(t, content) - w.enqueueContentForSourcePod(context.Background(), pod) + w.reconcileSourcePod(context.Background(), pod) got := getContent(t, w, content.Name) cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) @@ -535,34 +575,33 @@ func TestEnqueueContentForSourcePod_TriggersUnstick(t *testing.T) { assert.False(t, fc.wasCalled()) } -func TestEnqueueContentForSourcePod_PodNotIndexedNoOp(t *testing.T) { +func TestReconcileSourcePod_PodNotIndexedNoOp(t *testing.T) { 
content := makeWorkOrder("snapshotcontent-abc", "node-a", "abc") pod := podWithFailedSibling() w := makeNodeController(t, &fakeCheckpointer{}, content, pod) - w.contentIndexer = seedIndex(t) // empty index + w.contentIndexer = seedIndex(t) // override: empty index - w.enqueueContentForSourcePod(context.Background(), pod) + w.reconcileSourcePod(context.Background(), pod) assert.Empty(t, getContent(t, w, content.Name).Status.Conditions) } -func TestEnqueueContentForSourcePod_OtherNodeNoOp(t *testing.T) { +func TestReconcileSourcePod_OtherNodeNoOp(t *testing.T) { content := makeWorkOrder("snapshotcontent-abc", "node-a", "abc") pod := podWithFailedSibling() pod.Spec.NodeName = "node-b" w := makeNodeController(t, &fakeCheckpointer{}, content, pod) - w.contentIndexer = seedIndex(t, content) - w.enqueueContentForSourcePod(context.Background(), pod) + w.reconcileSourcePod(context.Background(), pod) assert.Empty(t, getContent(t, w, content.Name).Status.Conditions) } -func TestEnqueueContentForSourcePod_IndexErrorNoOp(t *testing.T) { +func TestReconcileSourcePod_IndexErrorNoOp(t *testing.T) { content := makeWorkOrder("snapshotcontent-abc", "node-a", "abc") pod := podWithFailedSibling() w := makeNodeController(t, &fakeCheckpointer{}, content, pod) - // Indexer without podRefIndex registered → ByIndex returns an error; enqueue must log and no-op. + // Indexer without podRefIndex registered → ByIndex returns an error; reconcile must log and no-op. w.contentIndexer = cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{}) - w.enqueueContentForSourcePod(context.Background(), pod) + w.reconcileSourcePod(context.Background(), pod) assert.Empty(t, getContent(t, w, content.Name).Status.Conditions) }