diff --git a/deploy/helm/charts/platform/components/operator/crds/nvidia.com_snapshotcontents.yaml b/deploy/helm/charts/platform/components/operator/crds/nvidia.com_snapshotcontents.yaml index 99dfbc11dee1..000f8955d691 100644 --- a/deploy/helm/charts/platform/components/operator/crds/nvidia.com_snapshotcontents.yaml +++ b/deploy/helm/charts/platform/components/operator/crds/nvidia.com_snapshotcontents.yaml @@ -63,7 +63,8 @@ spec: spec: description: |- SnapshotContentSpec defines the desired state of SnapshotContent. It is - populated by the node agent at creation time and is immutable thereafter. + populated by the SnapshotReconciler (operator) at creation time and is + immutable thereafter. properties: snapshotRef: description: |- @@ -86,18 +87,35 @@ spec: - namespace type: object source: - description: Source locates the physical artifact via a self-contained, opaque handle. + description: 'Source describes what to capture: the source pod and the node it runs on.' properties: - snapshotHandle: + nodeName: description: |- - SnapshotHandle is a self-contained, opaque artifact locator. The v1alpha1 - PVC format is: - pvc://////versions/ - It fully locates the artifact without correlating any other field. + NodeName is the node the source pod runs on, denormalized from the live + pod so it travels with PodRef as one immutable unit and selects the node + agent that performs the dump. minLength: 1 type: string + podRef: + description: |- + PodRef identifies the pod to dump. Its UID guards against dumping a + same-named recreation of the pod. + properties: + name: + description: Name of the source pod. + minLength: 1 + type: string + uid: + description: |- + UID of the source pod, recorded so the node agent dumps that specific + pod and not a same-named recreation. 
+ type: string + required: + - name + type: object required: - - snapshotHandle + - nodeName + - podRef type: object required: - snapshotRef @@ -164,11 +182,6 @@ spec: - type type: object type: array - snapshotHandle: - description: |- - SnapshotHandle mirrors spec.source.snapshotHandle once the node agent has - verified the artifact. - type: string type: object type: object x-kubernetes-validations: diff --git a/deploy/helm/charts/platform/components/operator/crds/nvidia.com_snapshots.yaml b/deploy/helm/charts/platform/components/operator/crds/nvidia.com_snapshots.yaml index a5101b641309..013efd804f8b 100644 --- a/deploy/helm/charts/platform/components/operator/crds/nvidia.com_snapshots.yaml +++ b/deploy/helm/charts/platform/components/operator/crds/nvidia.com_snapshots.yaml @@ -21,10 +21,6 @@ spec: scope: Namespaced versions: - additionalPrinterColumns: - - description: Artifact identity - jsonPath: .spec.checkpointID - name: CheckpointID - type: string - description: Bound SnapshotContent jsonPath: .status.boundSnapshotContentName name: Content @@ -64,21 +60,12 @@ spec: description: |- SnapshotSpec defines the desired state of Snapshot. - Minimal "trigger" shape: it names what to capture (an existing pod) and the - artifact identity (CheckpointID). Capture parameters the node agent needs at - dump time (target container, storage base path) are read from the referenced - pod's existing annotations and mounts, not duplicated here. The spec is - immutable after creation. + Minimal "trigger" shape: it names what to capture (an existing pod). All + capture parameters the node agent needs at dump time (checkpoint ID, target + container, storage base path) are read from the referenced pod's existing + labels/annotations and mounts, not duplicated here. The spec is immutable + after creation. properties: - checkpointID: - description: |- - CheckpointID is the stable artifact identity and the on-PVC artifact - subdirectory name (//versions//). 
It is - the primary key of the storage contract shared with the restore path and - is immutable after creation. - maxLength: 253 - minLength: 1 - type: string source: description: |- Source identifies the captured workload. It is a struct (rather than an @@ -94,6 +81,11 @@ spec: description: Name of the source pod. minLength: 1 type: string + uid: + description: |- + UID of the source pod, recorded so the node agent dumps that specific + pod and not a same-named recreation. + type: string required: - name type: object @@ -101,7 +93,6 @@ spec: - podRef type: object required: - - checkpointID - source type: object status: diff --git a/deploy/helm/charts/platform/components/operator/templates/manager-rbac.yaml b/deploy/helm/charts/platform/components/operator/templates/manager-rbac.yaml index bae96a422a04..0b9c0ef7e899 100644 --- a/deploy/helm/charts/platform/components/operator/templates/manager-rbac.yaml +++ b/deploy/helm/charts/platform/components/operator/templates/manager-rbac.yaml @@ -330,6 +330,7 @@ rules: - dynamographdeploymentscalingadapters - dynamomodels - dynamoworkermetadatas + - snapshotcontents - snapshots verbs: - create @@ -347,6 +348,7 @@ rules: - dynamographdeploymentrequests/finalizers - dynamographdeployments/finalizers - dynamomodels/finalizers + - snapshots/finalizers verbs: - update - apiGroups: @@ -358,6 +360,8 @@ rules: - dynamographdeployments/status - dynamographdeploymentscalingadapters/status - dynamomodels/status + - snapshotcontents/status + - snapshots/status verbs: - get - patch diff --git a/deploy/helm/charts/snapshot/templates/role.yaml b/deploy/helm/charts/snapshot/templates/role.yaml index 282a402b51e1..c8bc68bfd6d0 100644 --- a/deploy/helm/charts/snapshot/templates/role.yaml +++ b/deploy/helm/charts/snapshot/templates/role.yaml @@ -31,7 +31,32 @@ rules: - apiGroups: ["resource.k8s.io"] resources: ["resourceclaims"] verbs: ["get", "list"] -{{- else }} +{{- end }} +{{- end }} + +{{- if .Values.rbac.create }} +--- +# 
SnapshotContent is cluster-scoped, so the agent always needs a ClusterRole for it. +# The agent reads work orders and writes only their status; it never creates, deletes, +# or touches Snapshots (the work order is self-contained). +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "snapshot.fullname" . }}-agent-snapshotcontents + labels: + {{- include "snapshot.labels" . | nindent 4 }} +rules: + - apiGroups: ["nvidia.com"] + resources: ["snapshotcontents"] + verbs: ["get", "list", "watch", "update", "patch"] + - apiGroups: ["nvidia.com"] + resources: ["snapshotcontents/status"] + verbs: ["update", "patch"] +{{- end }} + +{{- if .Values.rbac.create }} +{{- if not .Values.rbac.namespaceRestricted }} +--- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: diff --git a/deploy/helm/charts/snapshot/templates/rolebinding.yaml b/deploy/helm/charts/snapshot/templates/rolebinding.yaml index b65dba17952b..f78af5e7579e 100644 --- a/deploy/helm/charts/snapshot/templates/rolebinding.yaml +++ b/deploy/helm/charts/snapshot/templates/rolebinding.yaml @@ -1,6 +1,25 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +{{- if .Values.rbac.create }} +--- +# Bind agent to the cluster-scoped SnapshotContent ClusterRole (capture work orders). +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "snapshot.fullname" . }}-agent-snapshotcontents + labels: + {{- include "snapshot.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "snapshot.fullname" . }}-agent-snapshotcontents +subjects: + - kind: ServiceAccount + name: {{ include "snapshot.serviceAccountName" . 
}} + namespace: {{ .Release.Namespace }} +{{- end }} + {{- if .Values.rbac.create }} {{- if .Values.rbac.namespaceRestricted }} apiVersion: rbac.authorization.k8s.io/v1 diff --git a/deploy/operator/api/v1alpha1/snapshot_types.go b/deploy/operator/api/v1alpha1/snapshot_types.go index 8efe51ca4526..14500742c904 100644 --- a/deploy/operator/api/v1alpha1/snapshot_types.go +++ b/deploy/operator/api/v1alpha1/snapshot_types.go @@ -18,7 +18,9 @@ package v1alpha1 import ( + "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" ) // Snapshot and SnapshotContent status condition types. Both objects share this @@ -31,23 +33,18 @@ const ( SnapshotConditionFailed = "Failed" ) +// IsSnapshotSucceeded reports whether the Snapshot's Ready condition is True. +func IsSnapshotSucceeded(s *Snapshot) bool { + return meta.IsStatusConditionTrue(s.Status.Conditions, SnapshotConditionReady) +} + +// IsSnapshotFailed reports whether the Snapshot's Failed condition is True. +func IsSnapshotFailed(s *Snapshot) bool { + return meta.IsStatusConditionTrue(s.Status.Conditions, SnapshotConditionFailed) +} + // SnapshotSpec defines the desired state of Snapshot. -// -// Minimal "trigger" shape: it names what to capture (an existing pod) and the -// artifact identity (CheckpointID). Capture parameters the node agent needs at -// dump time (target container, storage base path) are read from the referenced -// pod's existing annotations and mounts, not duplicated here. The spec is -// immutable after creation. type SnapshotSpec struct { - // CheckpointID is the stable artifact identity and the on-PVC artifact - // subdirectory name (//versions//). It is - // the primary key of the storage contract shared with the restore path and - // is immutable after creation. 
- // +kubebuilder:validation:Required - // +kubebuilder:validation:MinLength=1 - // +kubebuilder:validation:MaxLength=253 - CheckpointID string `json:"checkpointID"` - // Source identifies the captured workload. It is a struct (rather than an // inlined reference) so future source variants can be added additively. // +kubebuilder:validation:Required @@ -69,6 +66,11 @@ type PodReference struct { // +kubebuilder:validation:Required // +kubebuilder:validation:MinLength=1 Name string `json:"name"` + + // UID of the source pod, recorded so the node agent dumps that specific + // pod and not a same-named recreation. + // +optional + UID types.UID `json:"uid,omitempty"` } // SnapshotStatus defines the observed state of Snapshot. @@ -88,7 +90,6 @@ type SnapshotStatus struct { // +kubebuilder:object:root=true // +kubebuilder:subresource:status // +kubebuilder:resource:scope=Namespaced,shortName=snap -// +kubebuilder:printcolumn:name="CheckpointID",type="string",JSONPath=".spec.checkpointID",description="Artifact identity" // +kubebuilder:printcolumn:name="Content",type="string",JSONPath=".status.boundSnapshotContentName",description="Bound SnapshotContent" // +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status",description="Ready condition" // +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp" diff --git a/deploy/operator/api/v1alpha1/snapshot_types_test.go b/deploy/operator/api/v1alpha1/snapshot_types_test.go index d562910e5f2c..57ae50b7d99e 100644 --- a/deploy/operator/api/v1alpha1/snapshot_types_test.go +++ b/deploy/operator/api/v1alpha1/snapshot_types_test.go @@ -46,8 +46,7 @@ func TestSnapshotDeepCopyIsIndependent(t *testing.T) { original := &Snapshot{ ObjectMeta: metav1.ObjectMeta{Name: "snap-a", Namespace: "inference"}, Spec: SnapshotSpec{ - CheckpointID: "abc123", - Source: SnapshotSource{PodRef: PodReference{Name: "worker-0"}}, + Source: SnapshotSource{PodRef: 
PodReference{Name: "worker-0"}}, }, Status: SnapshotStatus{ Conditions: []metav1.Condition{{Type: "Ready", Status: metav1.ConditionTrue, Reason: "Captured"}}, @@ -59,10 +58,10 @@ func TestSnapshotDeepCopyIsIndependent(t *testing.T) { t.Fatalf("DeepCopy is not equal to original") } - clone.Spec.CheckpointID = "mutated" + clone.Spec.Source.PodRef.Name = "mutated" clone.Status.Conditions[0].Reason = "Changed" - if original.Spec.CheckpointID != "abc123" { - t.Errorf("mutating clone spec changed original: got %q", original.Spec.CheckpointID) + if original.Spec.Source.PodRef.Name != "worker-0" { + t.Errorf("mutating clone spec changed original: got %q", original.Spec.Source.PodRef.Name) } if original.Status.Conditions[0].Reason != "Captured" { t.Errorf("mutating clone condition changed original: got %q", original.Status.Conditions[0].Reason) @@ -76,7 +75,10 @@ func TestSnapshotContentDeepCopyIsIndependent(t *testing.T) { ObjectMeta: metav1.ObjectMeta{Name: "content-a"}, Spec: SnapshotContentSpec{ SnapshotRef: SnapshotReference{Namespace: "inference", Name: "snap-a", UID: types.UID("uid-1")}, - Source: SnapshotContentSource{SnapshotHandle: "pvc://inference/ckpt-pvc/checkpoints/abc123/versions/1"}, + Source: SnapshotContentSource{ + PodRef: PodReference{Name: "worker-0", UID: types.UID("pod-uid-1")}, + NodeName: "node-a", + }, }, Status: SnapshotContentStatus{ Conditions: []metav1.Condition{{Type: "Ready", Status: metav1.ConditionTrue, Reason: "Bound"}}, @@ -88,10 +90,10 @@ func TestSnapshotContentDeepCopyIsIndependent(t *testing.T) { t.Fatalf("DeepCopy is not equal to original") } - clone.Spec.Source.SnapshotHandle = "mutated" + clone.Spec.Source.PodRef.Name = "mutated" clone.Status.Conditions[0].Reason = "Changed" - if original.Spec.Source.SnapshotHandle != "pvc://inference/ckpt-pvc/checkpoints/abc123/versions/1" { - t.Errorf("mutating clone changed original handle: got %q", original.Spec.Source.SnapshotHandle) + if original.Spec.Source.PodRef.Name != "worker-0" { + 
t.Errorf("mutating clone changed original podRef name: got %q", original.Spec.Source.PodRef.Name) } if original.Status.Conditions[0].Reason != "Bound" { t.Errorf("mutating clone condition changed original: got %q", original.Status.Conditions[0].Reason) diff --git a/deploy/operator/api/v1alpha1/snapshotcontent_types.go b/deploy/operator/api/v1alpha1/snapshotcontent_types.go index 7a970f5b959d..79274590d587 100644 --- a/deploy/operator/api/v1alpha1/snapshotcontent_types.go +++ b/deploy/operator/api/v1alpha1/snapshotcontent_types.go @@ -18,19 +18,31 @@ package v1alpha1 import ( + "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" ) +// IsSnapshotContentSucceeded reports whether the SnapshotContent's Ready condition is True. +func IsSnapshotContentSucceeded(c *SnapshotContent) bool { + return meta.IsStatusConditionTrue(c.Status.Conditions, SnapshotConditionReady) +} + +// IsSnapshotContentFailed reports whether the SnapshotContent's Failed condition is True. +func IsSnapshotContentFailed(c *SnapshotContent) bool { + return meta.IsStatusConditionTrue(c.Status.Conditions, SnapshotConditionFailed) +} + // SnapshotContentSpec defines the desired state of SnapshotContent. It is -// populated by the node agent at creation time and is immutable thereafter. +// populated by the SnapshotReconciler (operator) at creation time and is +// immutable thereafter. type SnapshotContentSpec struct { // SnapshotRef is the back-pointer to the bound Snapshot. It may span // namespaces because SnapshotContent is cluster-scoped. // +kubebuilder:validation:Required SnapshotRef SnapshotReference `json:"snapshotRef"` - // Source locates the physical artifact via a self-contained, opaque handle. + // Source describes what to capture: the source pod and the node it runs on. 
// +kubebuilder:validation:Required Source SnapshotContentSource `json:"source"` } @@ -51,24 +63,24 @@ type SnapshotReference struct { UID types.UID `json:"uid,omitempty"` } -// SnapshotContentSource locates the physical checkpoint artifact. +// SnapshotContentSource is the immutable source descriptor: what to dump +// (PodRef) and where it runs (NodeName). type SnapshotContentSource struct { - // SnapshotHandle is a self-contained, opaque artifact locator. The v1alpha1 - // PVC format is: - // pvc://////versions/ - // It fully locates the artifact without correlating any other field. + // PodRef identifies the pod to dump. Its UID guards against dumping a + // same-named recreation of the pod. + // +kubebuilder:validation:Required + PodRef PodReference `json:"podRef"` + + // NodeName is the node the source pod runs on, denormalized from the live + // pod so it travels with PodRef as one immutable unit and selects the node + // agent that performs the dump. // +kubebuilder:validation:Required // +kubebuilder:validation:MinLength=1 - SnapshotHandle string `json:"snapshotHandle"` + NodeName string `json:"nodeName"` } // SnapshotContentStatus defines the observed state of SnapshotContent. type SnapshotContentStatus struct { - // SnapshotHandle mirrors spec.source.snapshotHandle once the node agent has - // verified the artifact. - // +optional - SnapshotHandle *string `json:"snapshotHandle,omitempty"` - // Conditions reflect the latest observations of the SnapshotContent's state. // Standard types are Ready and Failed. 
// +optional diff --git a/deploy/operator/api/v1alpha1/zz_generated.deepcopy.go b/deploy/operator/api/v1alpha1/zz_generated.deepcopy.go index 119d1b89ce90..50b7e7b75452 100644 --- a/deploy/operator/api/v1alpha1/zz_generated.deepcopy.go +++ b/deploy/operator/api/v1alpha1/zz_generated.deepcopy.go @@ -1943,6 +1943,7 @@ func (in *SnapshotContentList) DeepCopyObject() runtime.Object { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *SnapshotContentSource) DeepCopyInto(out *SnapshotContentSource) { *out = *in + out.PodRef = in.PodRef } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SnapshotContentSource. @@ -1975,11 +1976,6 @@ func (in *SnapshotContentSpec) DeepCopy() *SnapshotContentSpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *SnapshotContentStatus) DeepCopyInto(out *SnapshotContentStatus) { *out = *in - if in.SnapshotHandle != nil { - in, out := &in.SnapshotHandle, &out.SnapshotHandle - *out = new(string) - **out = **in - } if in.Conditions != nil { in, out := &in.Conditions, &out.Conditions *out = make([]metav1.Condition, len(*in)) diff --git a/deploy/operator/cmd/main.go b/deploy/operator/cmd/main.go index de54570bcf2f..320ee1c88e48 100644 --- a/deploy/operator/cmd/main.go +++ b/deploy/operator/cmd/main.go @@ -264,6 +264,11 @@ func main() { mgrOpts.Cache.DefaultNamespaces = map[string]cache.Config{ restrictedNamespace: {}, } + // SnapshotContent is cluster-scoped, so DefaultNamespaces does not cover it. + // Register it cluster-wide explicitly so the SnapshotReconciler can watch it. 
+ mgrOpts.Cache.ByObject = map[client.Object]cache.ByObject{ + &nvidiacomv1alpha1.SnapshotContent{}: {}, + } setupLog.Info("Restricted namespace configured, launching in restricted mode", "namespace", restrictedNamespace) banner := strings.Repeat("=", 80) @@ -710,6 +715,13 @@ func registerControllers( return fmt.Errorf("unable to create DynamoCheckpoint controller: %w", err) } + if err = (&controller.SnapshotReconciler{ + Client: mgr.GetClient(), + Recorder: mgr.GetEventRecorderFor("snapshot"), + }).SetupWithManager(mgr); err != nil { + return fmt.Errorf("unable to create Snapshot controller: %w", err) + } + if runtimeConfig.GroveEnabled { if err = controller.NewFailoverCascadeReconciler( mgr.GetClient(), diff --git a/deploy/operator/config/crd/bases/nvidia.com_snapshotcontents.yaml b/deploy/operator/config/crd/bases/nvidia.com_snapshotcontents.yaml index 99dfbc11dee1..000f8955d691 100644 --- a/deploy/operator/config/crd/bases/nvidia.com_snapshotcontents.yaml +++ b/deploy/operator/config/crd/bases/nvidia.com_snapshotcontents.yaml @@ -63,7 +63,8 @@ spec: spec: description: |- SnapshotContentSpec defines the desired state of SnapshotContent. It is - populated by the node agent at creation time and is immutable thereafter. + populated by the SnapshotReconciler (operator) at creation time and is + immutable thereafter. properties: snapshotRef: description: |- @@ -86,18 +87,35 @@ spec: - namespace type: object source: - description: Source locates the physical artifact via a self-contained, opaque handle. + description: 'Source describes what to capture: the source pod and the node it runs on.' properties: - snapshotHandle: + nodeName: description: |- - SnapshotHandle is a self-contained, opaque artifact locator. The v1alpha1 - PVC format is: - pvc://////versions/ - It fully locates the artifact without correlating any other field. 
+ NodeName is the node the source pod runs on, denormalized from the live + pod so it travels with PodRef as one immutable unit and selects the node + agent that performs the dump. minLength: 1 type: string + podRef: + description: |- + PodRef identifies the pod to dump. Its UID guards against dumping a + same-named recreation of the pod. + properties: + name: + description: Name of the source pod. + minLength: 1 + type: string + uid: + description: |- + UID of the source pod, recorded so the node agent dumps that specific + pod and not a same-named recreation. + type: string + required: + - name + type: object required: - - snapshotHandle + - nodeName + - podRef type: object required: - snapshotRef @@ -164,11 +182,6 @@ spec: - type type: object type: array - snapshotHandle: - description: |- - SnapshotHandle mirrors spec.source.snapshotHandle once the node agent has - verified the artifact. - type: string type: object type: object x-kubernetes-validations: diff --git a/deploy/operator/config/crd/bases/nvidia.com_snapshots.yaml b/deploy/operator/config/crd/bases/nvidia.com_snapshots.yaml index a5101b641309..013efd804f8b 100644 --- a/deploy/operator/config/crd/bases/nvidia.com_snapshots.yaml +++ b/deploy/operator/config/crd/bases/nvidia.com_snapshots.yaml @@ -21,10 +21,6 @@ spec: scope: Namespaced versions: - additionalPrinterColumns: - - description: Artifact identity - jsonPath: .spec.checkpointID - name: CheckpointID - type: string - description: Bound SnapshotContent jsonPath: .status.boundSnapshotContentName name: Content @@ -64,21 +60,12 @@ spec: description: |- SnapshotSpec defines the desired state of Snapshot. - Minimal "trigger" shape: it names what to capture (an existing pod) and the - artifact identity (CheckpointID). Capture parameters the node agent needs at - dump time (target container, storage base path) are read from the referenced - pod's existing annotations and mounts, not duplicated here. The spec is - immutable after creation. 
+ Minimal "trigger" shape: it names what to capture (an existing pod). All + capture parameters the node agent needs at dump time (checkpoint ID, target + container, storage base path) are read from the referenced pod's existing + labels/annotations and mounts, not duplicated here. The spec is immutable + after creation. properties: - checkpointID: - description: |- - CheckpointID is the stable artifact identity and the on-PVC artifact - subdirectory name (//versions//). It is - the primary key of the storage contract shared with the restore path and - is immutable after creation. - maxLength: 253 - minLength: 1 - type: string source: description: |- Source identifies the captured workload. It is a struct (rather than an @@ -94,6 +81,11 @@ spec: description: Name of the source pod. minLength: 1 type: string + uid: + description: |- + UID of the source pod, recorded so the node agent dumps that specific + pod and not a same-named recreation. + type: string required: - name type: object @@ -101,7 +93,6 @@ spec: - podRef type: object required: - - checkpointID - source type: object status: diff --git a/deploy/operator/config/rbac/role.yaml b/deploy/operator/config/rbac/role.yaml index 5ec2ac826ca9..678282e51e9d 100644 --- a/deploy/operator/config/rbac/role.yaml +++ b/deploy/operator/config/rbac/role.yaml @@ -227,6 +227,7 @@ rules: - dynamographdeployments - dynamographdeploymentscalingadapters - dynamomodels + - snapshotcontents - snapshots verbs: - create @@ -244,6 +245,7 @@ rules: - dynamographdeploymentrequests/finalizers - dynamographdeployments/finalizers - dynamomodels/finalizers + - snapshots/finalizers verbs: - update - apiGroups: @@ -255,6 +257,8 @@ rules: - dynamographdeployments/status - dynamographdeploymentscalingadapters/status - dynamomodels/status + - snapshotcontents/status + - snapshots/status verbs: - get - patch diff --git a/deploy/operator/internal/controller/checkpoint_snapshot.go b/deploy/operator/internal/controller/checkpoint_snapshot.go index 
df75a0cf51f7..13c9993abdee 100644 --- a/deploy/operator/internal/controller/checkpoint_snapshot.go +++ b/deploy/operator/internal/controller/checkpoint_snapshot.go @@ -92,7 +92,6 @@ func buildSnapshot(ckpt *nvidiacomv1alpha1.DynamoCheckpoint, checkpointID, sourc Labels: map[string]string{snapshotprotocol.CheckpointIDLabel: checkpointID}, }, Spec: nvidiacomv1alpha1.SnapshotSpec{ - CheckpointID: checkpointID, Source: nvidiacomv1alpha1.SnapshotSource{ PodRef: nvidiacomv1alpha1.PodReference{Name: sourcePodName}, }, diff --git a/deploy/operator/internal/controller/checkpoint_snapshot_test.go b/deploy/operator/internal/controller/checkpoint_snapshot_test.go index 877d8e4a72a0..e7516c9a0876 100644 --- a/deploy/operator/internal/controller/checkpoint_snapshot_test.go +++ b/deploy/operator/internal/controller/checkpoint_snapshot_test.go @@ -22,6 +22,7 @@ import ( "testing" nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" + snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" batchv1 "k8s.io/api/batch/v1" @@ -107,7 +108,7 @@ func TestEnsureSnapshot_CreatesWhenAbsent(t *testing.T) { snap := &nvidiacomv1alpha1.Snapshot{} require.NoError(t, r.Get(context.Background(), client.ObjectKey{Namespace: testNamespace, Name: snapshotName(testHash)}, snap)) - assert.Equal(t, testHash, snap.Spec.CheckpointID) + assert.Equal(t, testHash, snap.Labels[snapshotprotocol.CheckpointIDLabel]) assert.Equal(t, "worker-xyz", snap.Spec.Source.PodRef.Name) assert.True(t, metav1.IsControlledBy(snap, ckpt), "snapshot must be controlled by the checkpoint") } @@ -130,8 +131,7 @@ func TestEnsureSnapshot_ErrorsWhenNotOwned(t *testing.T) { foreign := &nvidiacomv1alpha1.Snapshot{ ObjectMeta: metav1.ObjectMeta{Name: snapshotName(testHash), Namespace: testNamespace}, Spec: nvidiacomv1alpha1.SnapshotSpec{ - CheckpointID: testHash, - Source: nvidiacomv1alpha1.SnapshotSource{PodRef: 
nvidiacomv1alpha1.PodReference{Name: "someone-else"}}, + Source: nvidiacomv1alpha1.SnapshotSource{PodRef: nvidiacomv1alpha1.PodReference{Name: "someone-else"}}, }, } r := makeCheckpointReconciler(checkpointTestScheme(), ckpt, foreign) diff --git a/deploy/operator/internal/controller/dynamocheckpoint_controller.go b/deploy/operator/internal/controller/dynamocheckpoint_controller.go index 4c221792ae5d..154619048ef1 100644 --- a/deploy/operator/internal/controller/dynamocheckpoint_controller.go +++ b/deploy/operator/internal/controller/dynamocheckpoint_controller.go @@ -25,12 +25,12 @@ import ( appsv1 "k8s.io/api/apps/v1" batchv1 "k8s.io/api/batch/v1" - coordinationv1 "k8s.io/api/coordination/v1" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/tools/record" + "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" @@ -301,8 +301,6 @@ func (r *CheckpointReconciler) failPendingCheckpoint( } func (r *CheckpointReconciler) handleCreating(ctx context.Context, ckpt *nvidiacomv1alpha1.DynamoCheckpoint) (ctrl.Result, error) { - logger := log.FromContext(ctx) - if ckpt.Status.JobName == "" { ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhasePending ckpt.Status.Message = "checkpoint job is missing from status" @@ -352,73 +350,99 @@ func (r *CheckpointReconciler) handleCreating(ctx context.Context, ckpt *nvidiac return ctrl.Result{}, err } - var lease *coordinationv1.Lease - leaseKey := client.ObjectKey{Namespace: job.Namespace, Name: job.Name} - lease = &coordinationv1.Lease{} - if err := r.Get(ctx, leaseKey, lease); err != nil { - if !apierrors.IsNotFound(err) { - return ctrl.Result{}, err - } - lease = nil - } - - now := time.Now() - checkpointWorkerActive := false - if lease != nil && lease.Spec.LeaseDurationSeconds != nil { - // The snapshot-agent owns 
and renews this lease while it is still finalizing - // checkpoint state. A Job can complete before the agent writes the terminal - // checkpoint annotation, so we keep requeuing until the lease is no longer active. - lastRenewal := lease.Spec.RenewTime - if lastRenewal == nil { - lastRenewal = lease.Spec.AcquireTime - } - if lastRenewal != nil { - checkpointWorkerActive = !now.After(lastRenewal.Time.Add(time.Duration(*lease.Spec.LeaseDurationSeconds) * time.Second)) + return r.observeSnapshot(ctx, ckpt, job, checkpointID) +} + +// observeSnapshot maps the bound Snapshot's status (and the owned Job's failure / deadline +// hang guards) onto the DynamoCheckpoint phase. Completion cascades up from SnapshotContent +// → Snapshot → DynamoCheckpoint, so this never reads the Job's terminal annotation. +func (r *CheckpointReconciler) observeSnapshot(ctx context.Context, ckpt *nvidiacomv1alpha1.DynamoCheckpoint, job *batchv1.Job, checkpointID string) (ctrl.Result, error) { + snap := &nvidiacomv1alpha1.Snapshot{} + if err := r.Get(ctx, client.ObjectKey{Namespace: ckpt.Namespace, Name: snapshotName(checkpointID)}, snap); err != nil { + if apierrors.IsNotFound(err) { + return ctrl.Result{RequeueAfter: time.Second}, nil } + return ctrl.Result{}, err + } + + // A Snapshot can fail before it is bound (e.g. the SnapshotReconciler rejects the + // source pod), so always observe Failed. Ready is only meaningful once bound. 
+ if nvidiacomv1alpha1.IsSnapshotFailed(snap) { + return r.failCreating(ctx, ckpt, "SnapshotFailed", snapshotConditionMessage(snap, nvidiacomv1alpha1.SnapshotConditionFailed)) + } + if snap.Status.BoundSnapshotContentName != nil && nvidiacomv1alpha1.IsSnapshotSucceeded(snap) { + return r.markCheckpointReady(ctx, ckpt, checkpointID, snapshotConditionMessage(snap, nvidiacomv1alpha1.SnapshotConditionReady)) } - observation := snapshotprotocol.ObserveCheckpointJob(job, checkpointWorkerActive) - switch observation.Phase { - case snapshotprotocol.CheckpointObservationPhaseWaitingForConfirmation: - logger.V(1).Info("Checkpoint job is complete but checkpoint worker is still active; waiting for terminal watcher status", "job", job.Name) - return ctrl.Result{RequeueAfter: time.Second}, nil - case snapshotprotocol.CheckpointObservationPhaseReady: - logger.Info("Checkpoint Job succeeded", "job", job.Name) - r.Recorder.Event(ckpt, corev1.EventTypeNormal, "CheckpointReady", observation.Message) + // Hang guard 1: the owned Job failed while the Snapshot is still non-terminal. + if jobFailed, message := checkpointJobFailed(job); jobFailed { + return r.failCreating(ctx, ckpt, "JobFailed", message) + } - now := metav1.Now() - ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhaseReady - ckpt.Status.CreatedAt = &now - ckpt.Status.Message = "" - meta.SetStatusCondition(&ckpt.Status.Conditions, metav1.Condition{ - Type: string(nvidiacomv1alpha1.DynamoCheckpointConditionJobCompleted), - Status: metav1.ConditionTrue, - Reason: observation.Reason, - Message: observation.Message, - }) - if err := r.Status().Update(ctx, ckpt); err != nil { - return ctrl.Result{}, err + // Hang guard 2: the Job ran past its deadline without a terminal Snapshot. 
+ if job.Spec.ActiveDeadlineSeconds != nil { + deadline := job.CreationTimestamp.Add(time.Duration(*job.Spec.ActiveDeadlineSeconds) * time.Second) + if time.Now().After(deadline) { + return r.failCreating(ctx, ckpt, "CheckpointDeadlineExceeded", + fmt.Sprintf("checkpoint did not complete before the Job deadline (%s)", deadline.Format(time.RFC3339))) } - return ctrl.Result{}, nil - case snapshotprotocol.CheckpointObservationPhaseFailed: - logger.Info("Checkpoint Job failed", "job", job.Name, "message", observation.Message) - r.Recorder.Event(ckpt, corev1.EventTypeWarning, "CheckpointFailed", observation.Message) + } - ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhaseFailed - ckpt.Status.Message = observation.Message - meta.SetStatusCondition(&ckpt.Status.Conditions, metav1.Condition{ - Type: string(nvidiacomv1alpha1.DynamoCheckpointConditionJobCompleted), - Status: metav1.ConditionFalse, - Reason: observation.Reason, - Message: observation.Message, - }) - if err := r.Status().Update(ctx, ckpt); err != nil { - return ctrl.Result{}, err + return ctrl.Result{}, nil +} + +// failCreating marks the DynamoCheckpoint Failed with a completion-condition reason. +func (r *CheckpointReconciler) failCreating(ctx context.Context, ckpt *nvidiacomv1alpha1.DynamoCheckpoint, reason, message string) (ctrl.Result, error) { + log.FromContext(ctx).Info("Checkpoint failed", "reason", reason, "message", message) + r.Recorder.Event(ckpt, corev1.EventTypeWarning, "CheckpointFailed", message) + ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhaseFailed + ckpt.Status.Message = message + meta.SetStatusCondition(&ckpt.Status.Conditions, metav1.Condition{ + Type: string(nvidiacomv1alpha1.DynamoCheckpointConditionJobCompleted), + Status: metav1.ConditionFalse, + Reason: reason, + Message: message, + }) + return ctrl.Result{}, r.Status().Update(ctx, ckpt) +} + +// markCheckpointReady marks the DynamoCheckpoint Ready after its bound Snapshot succeeded. 
+func (r *CheckpointReconciler) markCheckpointReady(ctx context.Context, ckpt *nvidiacomv1alpha1.DynamoCheckpoint, checkpointID, message string) (ctrl.Result, error) { + log.FromContext(ctx).Info("Checkpoint ready", "checkpointID", checkpointID) + r.Recorder.Event(ckpt, corev1.EventTypeNormal, "CheckpointReady", message) + ckpt.Status.Phase = nvidiacomv1alpha1.DynamoCheckpointPhaseReady + ckpt.Status.CheckpointID = checkpointID + ckpt.Status.CreatedAt = ptr.To(metav1.Now()) + ckpt.Status.Message = "" + meta.SetStatusCondition(&ckpt.Status.Conditions, metav1.Condition{ + Type: string(nvidiacomv1alpha1.DynamoCheckpointConditionJobCompleted), + Status: metav1.ConditionTrue, + Reason: "SnapshotReady", + Message: message, + }) + return ctrl.Result{}, r.Status().Update(ctx, ckpt) +} + +// snapshotConditionMessage returns the message of the named Snapshot condition, or "". +func snapshotConditionMessage(snap *nvidiacomv1alpha1.Snapshot, condType string) string { + if cond := meta.FindStatusCondition(snap.Status.Conditions, condType); cond != nil { + return cond.Message + } + return "" +} + +// checkpointJobFailed reports whether the Job has a True JobFailed condition. +func checkpointJobFailed(job *batchv1.Job) (bool, string) { + for _, condition := range job.Status.Conditions { + if condition.Type == batchv1.JobFailed && condition.Status == corev1.ConditionTrue { + message := "checkpoint job failed" + if condition.Message != "" { + message = fmt.Sprintf("%s: %s", message, condition.Message) + } + return true, message } - return ctrl.Result{}, nil - default: - return ctrl.Result{}, nil } + return false, "" } //nolint:gocyclo @@ -504,6 +528,15 @@ func (r *CheckpointReconciler) SetupWithManager(mgr ctrl.Manager) error { UpdateFunc: func(ue event.UpdateEvent) bool { return true }, GenericFunc: func(ge event.GenericEvent) bool { return true }, })). + Owns(&nvidiacomv1alpha1.Snapshot{}, builder.WithPredicates(predicate.Funcs{ + // Ignore create (we just created it). 
Watch update (status mirror) and + // delete (re-enqueue to recreate / unblock). Delete is safe: reconcile + // exits at the deletion-timestamp guard before reaching observeSnapshot. + CreateFunc: func(ce event.CreateEvent) bool { return false }, + DeleteFunc: func(de event.DeleteEvent) bool { return true }, + UpdateFunc: func(ue event.UpdateEvent) bool { return true }, + GenericFunc: func(ge event.GenericEvent) bool { return false }, + })). WithEventFilter(commonController.EphemeralDeploymentEventFilter(r.Config, r.RuntimeConfig)). Complete(r) } diff --git a/deploy/operator/internal/controller/dynamocheckpoint_controller_test.go b/deploy/operator/internal/controller/dynamocheckpoint_controller_test.go index fdb63a8f93f1..62178d7e086b 100644 --- a/deploy/operator/internal/controller/dynamocheckpoint_controller_test.go +++ b/deploy/operator/internal/controller/dynamocheckpoint_controller_test.go @@ -119,19 +119,6 @@ func makeTestCheckpoint(phase nvidiacomv1alpha1.DynamoCheckpointPhase) *nvidiaco } } -func makeCheckpointLease(name string, renewTime time.Time, durationSeconds int32) *coordinationv1.Lease { - renewMicroTime := metav1.NewMicroTime(renewTime) - return &coordinationv1.Lease{ - ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: testNamespace}, - Spec: coordinationv1.LeaseSpec{ - HolderIdentity: ptr.To("snapshot-agent/test"), - LeaseDurationSeconds: &durationSeconds, - AcquireTime: &renewMicroTime, - RenewTime: &renewMicroTime, - }, - } -} - func requireCheckpointContainer(t *testing.T, containers []corev1.Container, name string) *corev1.Container { t.Helper() if container := findCheckpointContainer(containers, name); container != nil { @@ -841,200 +828,130 @@ func TestCheckpointReconciler_HandleCreating(t *testing.T) { snap := &nvidiacomv1alpha1.Snapshot{} require.NoError(t, r.Get(ctx, types.NamespacedName{Name: snapshotName(testHash), Namespace: testNamespace}, snap)) - assert.Equal(t, testHash, snap.Spec.CheckpointID) + assert.Equal(t, testHash, 
snap.Labels[snapshotprotocol.CheckpointIDLabel]) assert.Equal(t, "worker-0", snap.Spec.Source.PodRef.Name) assert.True(t, metav1.IsControlledBy(snap, ckpt)) }) - t.Run("succeeded job transitions to Ready", func(t *testing.T) { - ckpt := makeCreatingCkpt(testHash, defaultCheckpointJobName) - job := &batchv1.Job{ + // ownedSnapshot returns a Snapshot owned by ckpt and bound to a SnapshotContent, + // carrying the given terminal condition (empty type leaves it Pending). + ownedSnapshot := func(ckpt *nvidiacomv1alpha1.DynamoCheckpoint, condType string) *nvidiacomv1alpha1.Snapshot { + bound := "snapshotcontent-" + testHash + snap := &nvidiacomv1alpha1.Snapshot{ ObjectMeta: metav1.ObjectMeta{ - Name: defaultCheckpointJobName, - Namespace: testNamespace, - Annotations: map[string]string{snapshotprotocol.CheckpointStatusAnnotation: snapshotprotocol.CheckpointStatusCompleted}, + Name: snapshotName(testHash), + Namespace: testNamespace, + OwnerReferences: []metav1.OwnerReference{{ + APIVersion: nvidiacomv1alpha1.GroupVersion.String(), + Kind: "DynamoCheckpoint", + Name: ckpt.Name, + UID: ckpt.UID, + Controller: ptr.To(true), + }}, }, - Status: batchv1.JobStatus{ - Succeeded: 1, - Conditions: []batchv1.JobCondition{ - {Type: batchv1.JobComplete, Status: corev1.ConditionTrue, LastTransitionTime: metav1.Now()}, - }, + Spec: nvidiacomv1alpha1.SnapshotSpec{ + Source: nvidiacomv1alpha1.SnapshotSource{PodRef: nvidiacomv1alpha1.PodReference{Name: "worker-0"}}, }, + Status: nvidiacomv1alpha1.SnapshotStatus{BoundSnapshotContentName: &bound}, + } + if condType != "" { + snap.Status.Conditions = []metav1.Condition{{ + Type: condType, + Status: metav1.ConditionTrue, + Reason: "Test", + Message: condType + " from agent", + }} } + return snap + } - r := makeCheckpointReconciler(s, ckpt, job, newOwnedPod(podNameFromJob(job.Name), job)) + t.Run("Snapshot Ready transitions checkpoint to Ready", func(t *testing.T) { + ckpt := makeCreatingCkpt(testHash, defaultCheckpointJobName) + job := 
newCheckpointJob(defaultCheckpointJobName) + snap := ownedSnapshot(ckpt, nvidiacomv1alpha1.SnapshotConditionReady) + + r := makeCheckpointReconciler(s, ckpt, job, snap, newOwnedPod(podNameFromJob(job.Name), job)) _, err := r.handleCreating(ctx, ckpt) require.NoError(t, err) updated := &nvidiacomv1alpha1.DynamoCheckpoint{} require.NoError(t, r.Get(ctx, types.NamespacedName{Name: testHash, Namespace: testNamespace}, updated)) assert.Equal(t, nvidiacomv1alpha1.DynamoCheckpointPhaseReady, updated.Status.Phase) + assert.Equal(t, testHash, updated.Status.CheckpointID) assert.NotNil(t, updated.Status.CreatedAt) }) - t.Run("failed job transitions to Failed", func(t *testing.T) { - ckpt := makeCreatingCkpt(testHash, "job-fail") - job := &batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{Name: "job-fail", Namespace: testNamespace}, - Status: batchv1.JobStatus{ - Conditions: []batchv1.JobCondition{{Type: batchv1.JobFailed, Status: corev1.ConditionTrue}}, - }, - } + t.Run("Snapshot Failed transitions checkpoint to Failed", func(t *testing.T) { + ckpt := makeCreatingCkpt(testHash, defaultCheckpointJobName) + job := newCheckpointJob(defaultCheckpointJobName) + snap := ownedSnapshot(ckpt, nvidiacomv1alpha1.SnapshotConditionFailed) - r := makeCheckpointReconciler(s, ckpt, job, newOwnedPod(podNameFromJob(job.Name), job)) + r := makeCheckpointReconciler(s, ckpt, job, snap, newOwnedPod(podNameFromJob(job.Name), job)) _, err := r.handleCreating(ctx, ckpt) require.NoError(t, err) updated := &nvidiacomv1alpha1.DynamoCheckpoint{} require.NoError(t, r.Get(ctx, types.NamespacedName{Name: testHash, Namespace: testNamespace}, updated)) assert.Equal(t, nvidiacomv1alpha1.DynamoCheckpointPhaseFailed, updated.Status.Phase) + assert.Contains(t, updated.Status.Message, "from agent") }) - t.Run("completed job without completion annotation waits while lease is active", func(t *testing.T) { - ckpt := makeCreatingCkpt(testHash, "job-missing-status-active-lease") - completionTime := 
metav1.NewTime(time.Now()) - job := &batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{Name: "job-missing-status-active-lease", Namespace: testNamespace}, - Status: batchv1.JobStatus{ - Succeeded: 1, - CompletionTime: &completionTime, - Conditions: []batchv1.JobCondition{ - {Type: batchv1.JobComplete, Status: corev1.ConditionTrue, LastTransitionTime: completionTime}, - }, - }, - } - lease := makeCheckpointLease("job-missing-status-active-lease", time.Now(), 30) - - r := makeCheckpointReconciler(s, ckpt, job, lease, newOwnedPod(podNameFromJob(job.Name), job)) - result, err := r.handleCreating(ctx, ckpt) - require.NoError(t, err) - assert.Equal(t, time.Second, result.RequeueAfter) - - updated := &nvidiacomv1alpha1.DynamoCheckpoint{} - require.NoError(t, r.Get(ctx, types.NamespacedName{Name: testHash, Namespace: testNamespace}, updated)) - assert.Equal(t, nvidiacomv1alpha1.DynamoCheckpointPhaseCreating, updated.Status.Phase) - }) + t.Run("unbound Snapshot Failed transitions checkpoint to Failed", func(t *testing.T) { + ckpt := makeCreatingCkpt(testHash, defaultCheckpointJobName) + job := newCheckpointJob(defaultCheckpointJobName) + snap := ownedSnapshot(ckpt, nvidiacomv1alpha1.SnapshotConditionFailed) + snap.Status.BoundSnapshotContentName = nil // failed before binding - t.Run("completed job without completion annotation transitions to Failed once lease expires", func(t *testing.T) { - ckpt := makeCreatingCkpt(testHash, "job-missing-status") - completionTime := metav1.NewTime(time.Now().Add(-time.Minute)) - job := &batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{Name: "job-missing-status", Namespace: testNamespace}, - Status: batchv1.JobStatus{ - Succeeded: 1, - CompletionTime: &completionTime, - Conditions: []batchv1.JobCondition{ - {Type: batchv1.JobComplete, Status: corev1.ConditionTrue, LastTransitionTime: completionTime}, - }, - }, - } - r := makeCheckpointReconciler(s, ckpt, job, newOwnedPod(podNameFromJob(job.Name), job)) + r := makeCheckpointReconciler(s, ckpt, job, 
snap, newOwnedPod(podNameFromJob(job.Name), job)) _, err := r.handleCreating(ctx, ckpt) require.NoError(t, err) updated := &nvidiacomv1alpha1.DynamoCheckpoint{} require.NoError(t, r.Get(ctx, types.NamespacedName{Name: testHash, Namespace: testNamespace}, updated)) assert.Equal(t, nvidiacomv1alpha1.DynamoCheckpointPhaseFailed, updated.Status.Phase) - assert.Contains(t, updated.Status.Message, "without snapshot-agent completion confirmation") }) - t.Run("completed job with failed completion annotation transitions to Failed", func(t *testing.T) { - ckpt := makeCreatingCkpt(testHash, "job-agent-failed") - job := &batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{ - Name: "job-agent-failed", - Namespace: testNamespace, - Annotations: map[string]string{snapshotprotocol.CheckpointStatusAnnotation: snapshotprotocol.CheckpointStatusFailed}, - }, - Status: batchv1.JobStatus{ - Succeeded: 1, - Conditions: []batchv1.JobCondition{ - {Type: batchv1.JobComplete, Status: corev1.ConditionTrue, LastTransitionTime: metav1.Now()}, - }, - }, + t.Run("failed Job while Snapshot non-terminal transitions to Failed", func(t *testing.T) { + ckpt := makeCreatingCkpt(testHash, defaultCheckpointJobName) + job := newCheckpointJob(defaultCheckpointJobName) + job.Status = batchv1.JobStatus{ + Conditions: []batchv1.JobCondition{{Type: batchv1.JobFailed, Status: corev1.ConditionTrue, Message: "deadline"}}, } + snap := ownedSnapshot(ckpt, "") - r := makeCheckpointReconciler(s, ckpt, job, newOwnedPod(podNameFromJob(job.Name), job)) + r := makeCheckpointReconciler(s, ckpt, job, snap, newOwnedPod(podNameFromJob(job.Name), job)) _, err := r.handleCreating(ctx, ckpt) require.NoError(t, err) updated := &nvidiacomv1alpha1.DynamoCheckpoint{} require.NoError(t, r.Get(ctx, types.NamespacedName{Name: testHash, Namespace: testNamespace}, updated)) assert.Equal(t, nvidiacomv1alpha1.DynamoCheckpointPhaseFailed, updated.Status.Phase) - assert.Contains(t, updated.Status.Message, "snapshot-agent reported checkpoint 
failure") }) - t.Run("running job with failed checkpoint annotation transitions to Failed", func(t *testing.T) { - ckpt := makeCreatingCkpt(testHash, "job-running-agent-failed") - job := &batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{ - Name: "job-running-agent-failed", - Namespace: testNamespace, - Annotations: map[string]string{snapshotprotocol.CheckpointStatusAnnotation: snapshotprotocol.CheckpointStatusFailed}, - }, - Status: batchv1.JobStatus{Active: 1}, - } + t.Run("past deadline without terminal Snapshot transitions to Failed", func(t *testing.T) { + ckpt := makeCreatingCkpt(testHash, defaultCheckpointJobName) + job := newCheckpointJob(defaultCheckpointJobName) + job.CreationTimestamp = metav1.NewTime(time.Now().Add(-time.Hour)) + job.Spec.ActiveDeadlineSeconds = ptr.To(int64(60)) + snap := ownedSnapshot(ckpt, "") - r := makeCheckpointReconciler(s, ckpt, job, newOwnedPod(podNameFromJob(job.Name), job)) + r := makeCheckpointReconciler(s, ckpt, job, snap, newOwnedPod(podNameFromJob(job.Name), job)) _, err := r.handleCreating(ctx, ckpt) require.NoError(t, err) updated := &nvidiacomv1alpha1.DynamoCheckpoint{} require.NoError(t, r.Get(ctx, types.NamespacedName{Name: testHash, Namespace: testNamespace}, updated)) assert.Equal(t, nvidiacomv1alpha1.DynamoCheckpointPhaseFailed, updated.Status.Phase) - assert.Equal(t, "Checkpoint job failed", updated.Status.Message) + assert.Contains(t, updated.Status.Message, "deadline") }) - t.Run("running job keeps Creating phase", func(t *testing.T) { - ckpt := makeCreatingCkpt(testHash, "job-run") - job := &batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{Name: "job-run", Namespace: testNamespace}, - Status: batchv1.JobStatus{Active: 1}, - } - - r := makeCheckpointReconciler(s, ckpt, job, newOwnedPod(podNameFromJob(job.Name), job)) - _, err := r.handleCreating(ctx, ckpt) - require.NoError(t, err) - - updated := &nvidiacomv1alpha1.DynamoCheckpoint{} - require.NoError(t, r.Get(ctx, types.NamespacedName{Name: testHash, Namespace: 
testNamespace}, updated)) - assert.Equal(t, nvidiacomv1alpha1.DynamoCheckpointPhaseCreating, updated.Status.Phase) - }) - - t.Run("in-flight version changes do not relabel the running job's artifact", func(t *testing.T) { + t.Run("Snapshot not yet found requeues without changing phase", func(t *testing.T) { ckpt := makeCreatingCkpt(testHash, defaultCheckpointJobName) - ckpt.Annotations = map[string]string{snapshotprotocol.CheckpointArtifactVersionAnnotation: "2"} - job := &batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{ - Name: defaultCheckpointJobName, - Namespace: testNamespace, - Annotations: map[string]string{snapshotprotocol.CheckpointStatusAnnotation: snapshotprotocol.CheckpointStatusCompleted}, - }, - Status: batchv1.JobStatus{ - Succeeded: 1, - Conditions: []batchv1.JobCondition{ - {Type: batchv1.JobComplete, Status: corev1.ConditionTrue, LastTransitionTime: metav1.Now()}, - }, - }, - } - - r := makeCheckpointReconciler(s, ckpt, job, newOwnedPod(podNameFromJob(job.Name), job)) - _, err := r.handleCreating(ctx, ckpt) - require.NoError(t, err) - - updated := &nvidiacomv1alpha1.DynamoCheckpoint{} - require.NoError(t, r.Get(ctx, types.NamespacedName{Name: testHash, Namespace: testNamespace}, updated)) - assert.Equal(t, nvidiacomv1alpha1.DynamoCheckpointPhaseReady, updated.Status.Phase) - }) - - t.Run("succeeded count without complete condition keeps Creating phase", func(t *testing.T) { - ckpt := makeCreatingCkpt(testHash, "job-succeeded-not-complete") - job := &batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{Name: "job-succeeded-not-complete", Namespace: testNamespace}, - Status: batchv1.JobStatus{Succeeded: 1}, - } + job := newCheckpointJob(defaultCheckpointJobName) r := makeCheckpointReconciler(s, ckpt, job, newOwnedPod(podNameFromJob(job.Name), job)) + // ensureSnapshot will create the Snapshot; without a status it stays Pending, + // so the checkpoint remains Creating. 
_, err := r.handleCreating(ctx, ckpt) require.NoError(t, err) diff --git a/deploy/operator/internal/controller/snapshot_reconciler.go b/deploy/operator/internal/controller/snapshot_reconciler.go new file mode 100644 index 000000000000..11ea87a09672 --- /dev/null +++ b/deploy/operator/internal/controller/snapshot_reconciler.go @@ -0,0 +1,341 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package controller
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"math/rand"
+	"time"
+
+	corev1 "k8s.io/api/core/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/api/meta"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/client-go/tools/cache"
+	"k8s.io/client-go/tools/record"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
+	"sigs.k8s.io/controller-runtime/pkg/handler"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/controller-runtime/pkg/reconcile"
+
+	nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
+	snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol"
+)
+
+// Tunables and fixed identifiers used by the Snapshot controller.
+const (
+	// snapshotFinalizer is set on the Snapshot so its bound SnapshotContent is
+	// deleted before the Snapshot is removed. The delete path removes it again once
+	// the SnapshotContent is confirmed gone.
+	snapshotFinalizer = "nvidia.com/snapshot-content-cleanup"
+
+	// snapshotContentFieldManager is the Server-Side Apply field owner for SnapshotContents.
+	snapshotContentFieldManager = "dynamo-snapshot-controller"
+
+	// snapshotPodResolveBackoffBase is the minimum requeue delay while waiting for the
+	// source pod to exist and to be scheduled; jitter is added on top to avoid a
+	// synchronized hot loop.
+	snapshotPodResolveBackoffBase = 2 * time.Second
+
+	// snapshotContentDeleteRequeue is the delay between cascade-delete progress checks.
+	snapshotContentDeleteRequeue = time.Second
+
+	// maxResourceNameLength is the Kubernetes object name limit (RFC 1123 subdomain).
+	// NOTE(review): no use of this constant is visible in this part of the file;
+	// confirm that generated SnapshotContent names are actually checked against it.
+	maxResourceNameLength = 253
+)
+
+// errSnapshotPodUnscheduled signals that the source pod is not yet scheduled and the
+// reconcile should retry with backoff rather than fail.
+var errSnapshotPodUnscheduled = errors.New("source pod is not yet scheduled to a node")
+
+// SnapshotReconciler reconciles a Snapshot: it creates the bound, cluster-scoped
+// SnapshotContent work order for the node agent, mirrors the agent's terminal status
+// back to the Snapshot, and cascades deletion to the SnapshotContent.
+type SnapshotReconciler struct {
+	// Client is the Kubernetes client used to read and write Snapshots,
+	// SnapshotContents and the source pods.
+	client.Client
+	// Recorder emits Kubernetes events attached to the Snapshot being reconciled.
+	Recorder record.EventRecorder
+}
+
+// +kubebuilder:rbac:groups=nvidia.com,resources=snapshots,verbs=get;list;watch;update;patch
+// +kubebuilder:rbac:groups=nvidia.com,resources=snapshots/status,verbs=get;update;patch
+// +kubebuilder:rbac:groups=nvidia.com,resources=snapshots/finalizers,verbs=update
+// +kubebuilder:rbac:groups=nvidia.com,resources=snapshotcontents,verbs=create;get;list;watch;update;patch;delete
+// +kubebuilder:rbac:groups=nvidia.com,resources=snapshotcontents/status,verbs=get;update;patch
+// +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch
+
+// Reconcile drives a Snapshot through binding, status mirroring, and cascade deletion.
+// A Snapshot that is being deleted is handed to the delete path; otherwise the cleanup
+// finalizer is installed first. The reconcile requeues with jittered backoff while the
+// source pod is missing or unscheduled, and marks the Snapshot Failed when the
+// checkpoint-ID label is absent or the pod sits on a different node than the one
+// recorded in the SnapshotContent.
+func (sr *SnapshotReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	logger := log.FromContext(ctx)
+
+	snap := &nvidiacomv1alpha1.Snapshot{}
+	if err := sr.Get(ctx, req.NamespacedName, snap); err != nil {
+		// A Snapshot that no longer exists needs no further work.
+		return ctrl.Result{}, client.IgnoreNotFound(err)
+	}
+
+	if !snap.GetDeletionTimestamp().IsZero() {
+		return sr.handleDelete(ctx, snap)
+	}
+
+	// Install the cleanup finalizer before any SnapshotContent can be created, so a
+	// delete can never strand the cluster-scoped content.
+	if !controllerutil.ContainsFinalizer(snap, snapshotFinalizer) {
+		controllerutil.AddFinalizer(snap, snapshotFinalizer)
+		if err := sr.Update(ctx, snap); err != nil {
+			return ctrl.Result{}, fmt.Errorf("add snapshot finalizer: %w", err)
+		}
+		return ctrl.Result{}, nil
+	}
+
+	// A terminal Snapshot stays terminal. Without this guard a Ready Snapshot whose
+	// source pod is later deleted or rescheduled would poll forever or be flipped to
+	// Failed, even though the artifact was already captured.
+	if nvidiacomv1alpha1.IsSnapshotSucceeded(snap) || nvidiacomv1alpha1.IsSnapshotFailed(snap) {
+		return ctrl.Result{}, nil
+	}
+
+	pod, err := sr.getSourcePod(ctx, snap)
+	if apierrors.IsNotFound(err) {
+		logger.V(1).Info("Source pod not found, falling back to existing SnapshotContent", "snapshot", snap.Name)
+		return sr.mirrorStatusWithoutPod(ctx, snap)
+	}
+	if err != nil {
+		return ctrl.Result{}, err
+	}
+	if err := validateSourcePod(pod); err != nil {
+		logger.V(1).Info("Source pod not ready, backing off", "snapshot", snap.Name, "reason", err.Error())
+		return ctrl.Result{RequeueAfter: jitteredBackoff(snapshotPodResolveBackoffBase)}, nil
+	}
+
+	// The checkpoint ID is carried as a label (set by the DynamoCheckpoint controller),
+	// not a spec field; the agent independently reads it from the source pod.
+	id := snap.Labels[snapshotprotocol.CheckpointIDLabel]
+	if id == "" {
+		return sr.failSnapshot(ctx, snap, "MissingCheckpointID",
+			fmt.Errorf("snapshot %q missing %s label", snap.Name, snapshotprotocol.CheckpointIDLabel))
+	}
+	contentName := snapshotContentName(id)
+
+	content, err := sr.ensureSnapshotContent(ctx, snap, contentName, pod)
+	if err != nil {
+		return ctrl.Result{}, err
+	}
+	// A freshly-created content always matches; only a pre-existing content whose
+	// source pod was rescheduled to another node mismatches (spec is immutable).
+	if content.Spec.Source.NodeName != pod.Spec.NodeName {
+		return sr.failSnapshot(ctx, snap, "PodRescheduled",
+			fmt.Errorf("source pod moved from node %q to %q; CRIU checkpoint cannot survive migration",
+				content.Spec.Source.NodeName, pod.Spec.NodeName))
+	}
+
+	return sr.propagateStatus(ctx, snap, content)
+}
+
+// mirrorStatusWithoutPod handles a reconcile in which the source pod object does not
+// exist. If a SnapshotContent was already created for this checkpoint ID, its status is
+// still mirrored to the Snapshot, so a result written by the node agent is not lost just
+// because the pod has been removed. While no terminal result is available the reconcile
+// keeps requeueing with jittered backoff, exactly as it does when waiting for the pod.
+func (sr *SnapshotReconciler) mirrorStatusWithoutPod(ctx context.Context, snap *nvidiacomv1alpha1.Snapshot) (ctrl.Result, error) {
+	retry := ctrl.Result{RequeueAfter: jitteredBackoff(snapshotPodResolveBackoffBase)}
+
+	id := snap.Labels[snapshotprotocol.CheckpointIDLabel]
+	if id == "" {
+		// Nothing can have been bound without an ID; keep waiting for the pod.
+		return retry, nil
+	}
+
+	content := &nvidiacomv1alpha1.SnapshotContent{}
+	if err := sr.Get(ctx, client.ObjectKey{Name: snapshotContentName(id)}, content); err != nil {
+		if apierrors.IsNotFound(err) {
+			return retry, nil
+		}
+		return ctrl.Result{}, err
+	}
+
+	if _, err := sr.propagateStatus(ctx, snap, content); err != nil {
+		return ctrl.Result{}, err
+	}
+	if nvidiacomv1alpha1.IsSnapshotContentSucceeded(content) || nvidiacomv1alpha1.IsSnapshotContentFailed(content) {
+		// The terminal result has been mirrored; there is nothing left to wait for.
+		return ctrl.Result{}, nil
+	}
+	return retry, nil
+}
+
+// getSourcePod loads the pod named by the Snapshot's spec.source.podRef from the
+// Snapshot's own namespace. The error from the client is returned unwrapped so callers
+// can test it with apierrors.IsNotFound.
+func (sr *SnapshotReconciler) getSourcePod(ctx context.Context, snap *nvidiacomv1alpha1.Snapshot) (*corev1.Pod, error) {
+	pod := &corev1.Pod{}
+	key := client.ObjectKey{Namespace: snap.Namespace, Name: snap.Spec.Source.PodRef.Name}
+	if err := sr.Get(ctx, key, pod); err != nil {
+		return nil, err
+	}
+	return pod, nil
+}
+
+// validateSourcePod requires the pod to be scheduled to a node. It returns
+// errSnapshotPodUnscheduled when spec.nodeName is still empty and nil otherwise.
+func validateSourcePod(pod *corev1.Pod) error {
+	if pod.Spec.NodeName == "" {
+		return errSnapshotPodUnscheduled
+	}
+	return nil
+}
+
+// ensureSnapshotContent returns the existing SnapshotContent or, when absent, creates the
+// trigger via a single Server-Side Apply carrying the source ref and the node mirror label.
+// The returned object is the source of truth for the reschedule guard.
+func (sr *SnapshotReconciler) ensureSnapshotContent(ctx context.Context, snap *nvidiacomv1alpha1.Snapshot, contentName string, pod *corev1.Pod) (*nvidiacomv1alpha1.SnapshotContent, error) {
+	// Fast path: the content already exists, so hand it back untouched.
+	current := &nvidiacomv1alpha1.SnapshotContent{}
+	getErr := sr.Get(ctx, client.ObjectKey{Name: contentName}, current)
+	switch {
+	case getErr == nil:
+		return current, nil
+	case !apierrors.IsNotFound(getErr):
+		return nil, getErr
+	}
+
+	// Slow path: nothing there yet, so apply the desired object under this controller's
+	// field manager.
+	desired := sr.buildSnapshotContent(snap, contentName, pod)
+	applyErr := sr.Patch(ctx, desired, client.Apply,
+		client.FieldOwner(snapshotContentFieldManager), client.ForceOwnership)
+	if applyErr == nil {
+		return desired, nil
+	}
+
+	// Make the failure visible on the Snapshot before reporting it to the caller.
+	sr.Recorder.Event(snap, corev1.EventTypeWarning, "SnapshotContentCreateFailed", applyErr.Error())
+	return nil, fmt.Errorf("apply SnapshotContent %q: %w", contentName, applyErr)
+}
+
+// buildSnapshotContent assembles the cluster-scoped SnapshotContent that represents the
+// work order for one Snapshot: it points back at the Snapshot (namespace, name, UID),
+// names the source pod (name, UID) and records the pod's node both in the spec and in
+// the node label.
+func (sr *SnapshotReconciler) buildSnapshotContent(snap *nvidiacomv1alpha1.Snapshot, contentName string, pod *corev1.Pod) *nvidiacomv1alpha1.SnapshotContent {
+	nodeName := pod.Spec.NodeName
+
+	owner := nvidiacomv1alpha1.SnapshotReference{
+		Namespace: snap.Namespace,
+		Name:      snap.Name,
+		UID:       snap.UID,
+	}
+	source := nvidiacomv1alpha1.SnapshotContentSource{
+		PodRef:   nvidiacomv1alpha1.PodReference{Name: pod.Name, UID: pod.UID},
+		NodeName: nodeName,
+	}
+
+	content := &nvidiacomv1alpha1.SnapshotContent{}
+	content.TypeMeta = metav1.TypeMeta{
+		APIVersion: nvidiacomv1alpha1.GroupVersion.String(),
+		Kind:       "SnapshotContent",
+	}
+	content.ObjectMeta = metav1.ObjectMeta{
+		Name:   contentName,
+		Labels: map[string]string{snapshotprotocol.SnapshotNodeLabel: nodeName},
+	}
+	content.Spec = nvidiacomv1alpha1.SnapshotContentSpec{
+		SnapshotRef: owner,
+		Source:      source,
+	}
+	return content
+}
+
+// propagateStatus records the binding and mirrors the SnapshotContent's terminal status to
+// the Snapshot, defaulting to a Pending condition until the agent writes a result. It
+// receives the content resolved earlier in the reconcile, so it never re-Gets it.
+func (sr *SnapshotReconciler) propagateStatus(ctx context.Context, snap *nvidiacomv1alpha1.Snapshot, content *nvidiacomv1alpha1.SnapshotContent) (ctrl.Result, error) {
+	dirty := false
+
+	// Record (or correct) the name of the content this Snapshot is bound to.
+	if bound := snap.Status.BoundSnapshotContentName; bound == nil || *bound != content.Name {
+		boundName := content.Name
+		snap.Status.BoundSnapshotContentName = &boundName
+		dirty = true
+	}
+
+	// Start from the Pending default and override it with the content's terminal
+	// condition when there is one. Success is checked before failure.
+	kind, status := nvidiacomv1alpha1.SnapshotConditionReady, metav1.ConditionFalse
+	reason, message := "Pending", "Waiting for node agent to capture the checkpoint"
+	if nvidiacomv1alpha1.IsSnapshotContentSucceeded(content) {
+		src := meta.FindStatusCondition(content.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionReady)
+		status, reason, message = metav1.ConditionTrue, src.Reason, src.Message
+	} else if nvidiacomv1alpha1.IsSnapshotContentFailed(content) {
+		src := meta.FindStatusCondition(content.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed)
+		kind, status = nvidiacomv1alpha1.SnapshotConditionFailed, metav1.ConditionTrue
+		reason, message = src.Reason, src.Message
+	}
+	if sr.setCondition(snap, kind, status, reason, message) {
+		dirty = true
+	}
+
+	// Skip the status write entirely when nothing moved.
+	if dirty {
+		if err := sr.Status().Update(ctx, snap); err != nil {
+			return ctrl.Result{}, fmt.Errorf("update snapshot status: %w", err)
+		}
+	}
+	return ctrl.Result{}, nil
+}
+
+// setCondition writes one condition into the Snapshot's in-memory status and returns
+// true when that changed the stored conditions. It does not persist anything.
+func (sr *SnapshotReconciler) setCondition(snap *nvidiacomv1alpha1.Snapshot, condType string, status metav1.ConditionStatus, reason, message string) bool {
+	desired := metav1.Condition{
+		Type:    condType,
+		Status:  status,
+		Reason:  reason,
+		Message: message,
+	}
+	return meta.SetStatusCondition(&snap.Status.Conditions, desired)
+}
+
+// failSnapshot marks the Snapshot Failed terminally and records an event.
+func (sr *SnapshotReconciler) failSnapshot(ctx context.Context, snap *nvidiacomv1alpha1.Snapshot, reason string, cause error) (ctrl.Result, error) {
+	// The same reason and text go to the Warning event and to the Failed condition.
+	sr.Recorder.Event(snap, corev1.EventTypeWarning, reason, cause.Error())
+	sr.setCondition(snap, nvidiacomv1alpha1.SnapshotConditionFailed, metav1.ConditionTrue, reason, cause.Error())
+	if err := sr.Status().Update(ctx, snap); err != nil {
+		return ctrl.Result{}, fmt.Errorf("mark snapshot failed: %w", err)
+	}
+	return ctrl.Result{}, nil
+}
+
+// handleDelete cascades deletion to the bound SnapshotContent and blocks (requeues) until
+// it is gone before dropping the Snapshot finalizer.
+//
+// SnapshotContents are cluster-scoped and named after the checkpoint ID only, so the
+// content found under that name may belong to a Snapshot in another namespace. The
+// content is therefore deleted only when its spec.snapshotRef does not point at a
+// different Snapshot; a foreign content is left in place and the finalizer is released.
+//
+// NOTE(review): the original comment states the SnapshotContent carries no finalizer of
+// its own, so the Delete takes effect immediately; that is not verifiable from this file.
+func (sr *SnapshotReconciler) handleDelete(ctx context.Context, snap *nvidiacomv1alpha1.Snapshot) (ctrl.Result, error) {
+	if !controllerutil.ContainsFinalizer(snap, snapshotFinalizer) {
+		return ctrl.Result{}, nil
+	}
+
+	// Without a checkpoint-id label no SnapshotContent could have been bound; drop the
+	// finalizer rather than misroute a delete to a wrongly-named object.
+	id := snap.Labels[snapshotprotocol.CheckpointIDLabel]
+	if id == "" {
+		return sr.dropSnapshotFinalizer(ctx, snap)
+	}
+
+	contentName := snapshotContentName(id)
+	key := client.ObjectKey{Name: contentName}
+
+	content := &nvidiacomv1alpha1.SnapshotContent{}
+	if err := sr.Get(ctx, key, content); err != nil {
+		if apierrors.IsNotFound(err) {
+			// Already gone (or never created): nothing to cascade.
+			return sr.dropSnapshotFinalizer(ctx, snap)
+		}
+		return ctrl.Result{}, fmt.Errorf("look up SnapshotContent %q: %w", contentName, err)
+	}
+
+	if snapshotContentBoundElsewhere(content, snap) {
+		ref := content.Spec.SnapshotRef
+		log.FromContext(ctx).Info("SnapshotContent is bound to a different Snapshot, skipping cascade delete",
+			"snapshotContent", contentName, "boundTo", ref.Namespace+"/"+ref.Name)
+		return sr.dropSnapshotFinalizer(ctx, snap)
+	}
+
+	if err := sr.Delete(ctx, content); err != nil && !apierrors.IsNotFound(err) {
+		return ctrl.Result{}, fmt.Errorf("delete SnapshotContent %q: %w", contentName, err)
+	}
+
+	// Block until the content is confirmed gone before releasing the Snapshot.
+	if err := sr.Get(ctx, key, &nvidiacomv1alpha1.SnapshotContent{}); err == nil {
+		return ctrl.Result{RequeueAfter: snapshotContentDeleteRequeue}, nil
+	} else if !apierrors.IsNotFound(err) {
+		return ctrl.Result{}, fmt.Errorf("confirm SnapshotContent %q deleted: %w", contentName, err)
+	}
+
+	return sr.dropSnapshotFinalizer(ctx, snap)
+}
+
+// dropSnapshotFinalizer removes the cleanup finalizer from the Snapshot and persists the
+// change, which lets the API server finish deleting the Snapshot.
+func (sr *SnapshotReconciler) dropSnapshotFinalizer(ctx context.Context, snap *nvidiacomv1alpha1.Snapshot) (ctrl.Result, error) {
+	controllerutil.RemoveFinalizer(snap, snapshotFinalizer)
+	if err := sr.Update(ctx, snap); err != nil {
+		return ctrl.Result{}, fmt.Errorf("remove snapshot finalizer: %w", err)
+	}
+	return ctrl.Result{}, nil
+}
+
+// snapshotContentBoundElsewhere reports whether the content's spec.snapshotRef positively
+// identifies a Snapshot other than snap: a different namespace or name, or the same
+// namespace and name with a different UID. A content without a snapshotRef name is
+// treated as not bound elsewhere, so such objects are still cleaned up.
+func snapshotContentBoundElsewhere(content *nvidiacomv1alpha1.SnapshotContent, snap *nvidiacomv1alpha1.Snapshot) bool {
+	ref := content.Spec.SnapshotRef
+	if ref.Name == "" {
+		return false
+	}
+	if ref.Namespace != snap.Namespace || ref.Name != snap.Name {
+		return true
+	}
+	return ref.UID != "" && snap.UID != "" && ref.UID != snap.UID
+}
+
+// SetupWithManager wires the controller: it owns Snapshots and watches SnapshotContents,
+// mapping a SnapshotContent back to its bound Snapshot via spec.snapshotRef.
+func (sr *SnapshotReconciler) SetupWithManager(mgr ctrl.Manager) error {
+	return ctrl.NewControllerManagedBy(mgr).
+		For(&nvidiacomv1alpha1.Snapshot{}).
+		Watches(
+			&nvidiacomv1alpha1.SnapshotContent{},
+			handler.EnqueueRequestsFromMapFunc(snapshotContentToSnapshot),
+		).
+		Complete(sr)
+}
+
+// snapshotContentToSnapshot maps a SnapshotContent (including a delete-event tombstone) back
+// to its bound Snapshot. It MUST unwrap cache.DeletedFinalStateUnknown so that the final
+// SnapshotContent delete still re-enqueues the Snapshot and the cascade can complete.
+// It returns no requests when the object is not a SnapshotContent or names no Snapshot.
+func snapshotContentToSnapshot(_ context.Context, obj client.Object) []reconcile.Request {
+	ref, ok := snapshotRefFromContentObj(obj)
+	if !ok || ref.Name == "" {
+		return nil
+	}
+	return []reconcile.Request{{NamespacedName: types.NamespacedName{Namespace: ref.Namespace, Name: ref.Name}}}
+}
+
+// snapshotRefFromContentObj extracts the bound Snapshot reference from a SnapshotContent,
+// unwrapping a cache.DeletedFinalStateUnknown tombstone first so the final delete event
+// still re-enqueues the Snapshot and the cascade can complete (F-2.2).
+func snapshotRefFromContentObj(obj any) (nvidiacomv1alpha1.SnapshotReference, bool) {
+	if tombstone, isTombstone := obj.(cache.DeletedFinalStateUnknown); isTombstone {
+		obj = tombstone.Obj
+	}
+	content, ok := obj.(*nvidiacomv1alpha1.SnapshotContent)
+	// A typed-nil pointer satisfies the assertion but cannot be dereferenced.
+	if !ok || content == nil {
+		return nvidiacomv1alpha1.SnapshotReference{}, false
+	}
+	return content.Spec.SnapshotRef, true
+}
+
+// snapshotContentName composes the deterministic cluster-scoped SnapshotContent name by
+// prefixing the checkpoint ID with "snapshotcontent-".
+func snapshotContentName(checkpointID string) string {
+	return "snapshotcontent-" + checkpointID
+}
+
+// jitteredBackoff adds up to 50% jitter to a base delay to avoid synchronized requeues.
+// The result lies in [base, base+base/2]. A non-positive base yields 0, because
+// rand.Int63n panics when its argument is not positive.
+func jitteredBackoff(base time.Duration) time.Duration {
+	if base <= 0 {
+		return 0
+	}
+	return base + time.Duration(rand.Int63n(int64(base/2)+1))
+}
diff --git a/deploy/operator/internal/controller/snapshot_reconciler_test.go b/deploy/operator/internal/controller/snapshot_reconciler_test.go
new file mode 100644
index 000000000000..1790d4e99c00
--- /dev/null
+++ b/deploy/operator/internal/controller/snapshot_reconciler_test.go
@@ -0,0 +1,267 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package controller + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/cache" + "k8s.io/client-go/tools/record" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + + nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" + snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol" +) + +func snapshotReconcilerScheme() *runtime.Scheme { + s := runtime.NewScheme() + _ = nvidiacomv1alpha1.AddToScheme(s) + _ = corev1.AddToScheme(s) + return s +} + +func makeSnapshotReconciler(s *runtime.Scheme, objs ...client.Object) *SnapshotReconciler { + return &SnapshotReconciler{ + Client: fake.NewClientBuilder().WithScheme(s).WithObjects(objs...). 
+ WithStatusSubresource(&nvidiacomv1alpha1.Snapshot{}, &nvidiacomv1alpha1.SnapshotContent{}).Build(), + Recorder: record.NewFakeRecorder(10), + } +} + +func makeSnapshotForReconcile(checkpointID, podName string) *nvidiacomv1alpha1.Snapshot { + return &nvidiacomv1alpha1.Snapshot{ + ObjectMeta: metav1.ObjectMeta{ + Name: "snapshot-" + checkpointID, + Namespace: "inference", + UID: types.UID("snap-uid"), + Finalizers: []string{snapshotFinalizer}, + Labels: map[string]string{snapshotprotocol.CheckpointIDLabel: checkpointID}, + }, + Spec: nvidiacomv1alpha1.SnapshotSpec{ + Source: nvidiacomv1alpha1.SnapshotSource{PodRef: nvidiacomv1alpha1.PodReference{Name: podName}}, + }, + } +} + +func scheduledPod(name, node string) *corev1.Pod { + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: "inference", UID: types.UID("pod-uid-9")}, + Spec: corev1.PodSpec{NodeName: node}, + } +} + +func reconcileSnapshot(t *testing.T, r *SnapshotReconciler, name string) ctrl.Result { + t.Helper() + res, err := r.Reconcile(context.Background(), + ctrl.Request{NamespacedName: types.NamespacedName{Namespace: "inference", Name: name}}) + require.NoError(t, err) + return res +} + +func TestSnapshotReconciler_PodUnscheduledBacksOff(t *testing.T) { + s := snapshotReconcilerScheme() + snap := makeSnapshotForReconcile("abc123", "worker-0") + pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "worker-0", Namespace: "inference"}} + r := makeSnapshotReconciler(s, snap, pod) + + res := reconcileSnapshot(t, r, snap.Name) + assert.Positive(t, res.RequeueAfter) + + var contents nvidiacomv1alpha1.SnapshotContentList + require.NoError(t, r.List(context.Background(), &contents)) + assert.Empty(t, contents.Items) +} + +func TestSnapshotReconciler_BuildsWorkOrderAndBinds(t *testing.T) { + s := snapshotReconcilerScheme() + snap := makeSnapshotForReconcile("abc123", "worker-0") + r := makeSnapshotReconciler(s, snap, scheduledPod("worker-0", "node-a")) + + reconcileSnapshot(t, r, 
snap.Name) + + content := &nvidiacomv1alpha1.SnapshotContent{} + require.NoError(t, r.Get(context.Background(), types.NamespacedName{Name: "snapshotcontent-abc123"}, content)) + assert.Equal(t, "worker-0", content.Spec.Source.PodRef.Name) + assert.Equal(t, types.UID("pod-uid-9"), content.Spec.Source.PodRef.UID) + assert.Equal(t, "node-a", content.Spec.Source.NodeName) + assert.Equal(t, "node-a", content.Labels[snapshotprotocol.SnapshotNodeLabel]) + assert.NotContains(t, content.Labels, snapshotprotocol.CheckpointIDLabel) + assert.NotContains(t, content.Annotations, snapshotprotocol.CheckpointArtifactVersionAnnotation) + assert.Empty(t, content.Finalizers) + assert.Equal(t, "inference", content.Spec.SnapshotRef.Namespace) + assert.Equal(t, snap.Name, content.Spec.SnapshotRef.Name) + + updated := &nvidiacomv1alpha1.Snapshot{} + require.NoError(t, r.Get(context.Background(), types.NamespacedName{Namespace: "inference", Name: snap.Name}, updated)) + require.NotNil(t, updated.Status.BoundSnapshotContentName) + assert.Equal(t, "snapshotcontent-abc123", *updated.Status.BoundSnapshotContentName) +} + +func TestSnapshotReconciler_MirrorsReadyAndFailed(t *testing.T) { + for _, tc := range []struct { + name string + condType string + wantReady metav1.ConditionStatus + }{ + {name: "ready", condType: nvidiacomv1alpha1.SnapshotConditionReady}, + {name: "failed", condType: nvidiacomv1alpha1.SnapshotConditionFailed}, + } { + t.Run(tc.name, func(t *testing.T) { + s := snapshotReconcilerScheme() + snap := makeSnapshotForReconcile("abc123", "worker-0") + content := &nvidiacomv1alpha1.SnapshotContent{ + ObjectMeta: metav1.ObjectMeta{Name: "snapshotcontent-abc123", Finalizers: []string{snapshotFinalizer}}, + Spec: nvidiacomv1alpha1.SnapshotContentSpec{ + SnapshotRef: nvidiacomv1alpha1.SnapshotReference{Namespace: "inference", Name: snap.Name}, + Source: nvidiacomv1alpha1.SnapshotContentSource{ + PodRef: nvidiacomv1alpha1.PodReference{Name: "worker-0"}, NodeName: "node-a", + }, + }, + 
Status: nvidiacomv1alpha1.SnapshotContentStatus{ + Conditions: []metav1.Condition{{Type: tc.condType, Status: metav1.ConditionTrue, Reason: "Agent", Message: "done"}}, + }, + } + r := makeSnapshotReconciler(s, snap, content, scheduledPod("worker-0", "node-a")) + + reconcileSnapshot(t, r, snap.Name) + + updated := &nvidiacomv1alpha1.Snapshot{} + require.NoError(t, r.Get(context.Background(), types.NamespacedName{Namespace: "inference", Name: snap.Name}, updated)) + cond := meta.FindStatusCondition(updated.Status.Conditions, tc.condType) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionTrue, cond.Status) + }) + } +} + +func TestSnapshotReconciler_RescheduleFailsSnapshot(t *testing.T) { + s := snapshotReconcilerScheme() + snap := makeSnapshotForReconcile("abc123", "worker-0") + content := &nvidiacomv1alpha1.SnapshotContent{ + ObjectMeta: metav1.ObjectMeta{Name: "snapshotcontent-abc123", Finalizers: []string{snapshotFinalizer}}, + Spec: nvidiacomv1alpha1.SnapshotContentSpec{ + SnapshotRef: nvidiacomv1alpha1.SnapshotReference{Namespace: "inference", Name: snap.Name}, + Source: nvidiacomv1alpha1.SnapshotContentSource{PodRef: nvidiacomv1alpha1.PodReference{Name: "worker-0"}, NodeName: "node-a"}, + }, + } + // Pod now runs on a different node than the bound content. 
+ r := makeSnapshotReconciler(s, snap, content, scheduledPod("worker-0", "node-b")) + + reconcileSnapshot(t, r, snap.Name) + + updated := &nvidiacomv1alpha1.Snapshot{} + require.NoError(t, r.Get(context.Background(), types.NamespacedName{Namespace: "inference", Name: snap.Name}, updated)) + cond := meta.FindStatusCondition(updated.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) + require.NotNil(t, cond) + assert.Equal(t, metav1.ConditionTrue, cond.Status) + assert.Equal(t, "PodRescheduled", cond.Reason) +} + +func TestSnapshotReconciler_MissingCheckpointIDLabelFails(t *testing.T) { + s := snapshotReconcilerScheme() + snap := makeSnapshotForReconcile("abc123", "worker-0") + delete(snap.Labels, snapshotprotocol.CheckpointIDLabel) + r := makeSnapshotReconciler(s, snap, scheduledPod("worker-0", "node-a")) + + reconcileSnapshot(t, r, snap.Name) + + updated := &nvidiacomv1alpha1.Snapshot{} + require.NoError(t, r.Get(context.Background(), types.NamespacedName{Namespace: "inference", Name: snap.Name}, updated)) + cond := meta.FindStatusCondition(updated.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) + require.NotNil(t, cond) + assert.Equal(t, "MissingCheckpointID", cond.Reason) +} + +func TestSnapshotReconciler_DeleteWithoutLabelDropsFinalizer(t *testing.T) { + s := snapshotReconcilerScheme() + now := metav1.Now() + snap := makeSnapshotForReconcile("abc123", "worker-0") + snap.DeletionTimestamp = &now + delete(snap.Labels, snapshotprotocol.CheckpointIDLabel) + r := makeSnapshotReconciler(s, snap) + + reconcileSnapshot(t, r, snap.Name) + + gone := &nvidiacomv1alpha1.Snapshot{} + err := r.Get(context.Background(), types.NamespacedName{Namespace: "inference", Name: snap.Name}, gone) + if err == nil { + assert.False(t, controllerutil.ContainsFinalizer(gone, snapshotFinalizer)) + } else { + assert.True(t, apierrors.IsNotFound(err)) + } +} + +func TestSnapshotReconciler_CascadeDelete(t *testing.T) { + s := snapshotReconcilerScheme() + now := 
metav1.Now() + snap := makeSnapshotForReconcile("abc123", "worker-0") + snap.DeletionTimestamp = &now + content := &nvidiacomv1alpha1.SnapshotContent{ + ObjectMeta: metav1.ObjectMeta{Name: "snapshotcontent-abc123"}, + Spec: nvidiacomv1alpha1.SnapshotContentSpec{ + SnapshotRef: nvidiacomv1alpha1.SnapshotReference{Namespace: "inference", Name: snap.Name}, + Source: nvidiacomv1alpha1.SnapshotContentSource{PodRef: nvidiacomv1alpha1.PodReference{Name: "worker-0"}, NodeName: "node-a"}, + }, + } + r := makeSnapshotReconciler(s, snap, content) + + // The content carries no finalizer, so it is deleted immediately; one pass deletes + // the content and, once confirmed gone, drops the Snapshot finalizer. + reconcileSnapshot(t, r, snap.Name) + err := r.Get(context.Background(), types.NamespacedName{Name: "snapshotcontent-abc123"}, &nvidiacomv1alpha1.SnapshotContent{}) + assert.True(t, apierrors.IsNotFound(err)) + + gone := &nvidiacomv1alpha1.Snapshot{} + err = r.Get(context.Background(), types.NamespacedName{Namespace: "inference", Name: snap.Name}, gone) + if err == nil { + assert.False(t, controllerutil.ContainsFinalizer(gone, snapshotFinalizer)) + } else { + assert.True(t, apierrors.IsNotFound(err)) + } +} + +func TestSnapshotContentToSnapshot_UnwrapsTombstone(t *testing.T) { + content := &nvidiacomv1alpha1.SnapshotContent{ + ObjectMeta: metav1.ObjectMeta{Name: "snapshotcontent-abc123"}, + Spec: nvidiacomv1alpha1.SnapshotContentSpec{ + SnapshotRef: nvidiacomv1alpha1.SnapshotReference{Namespace: "inference", Name: "snapshot-abc123"}, + }, + } + + direct := snapshotContentToSnapshot(context.Background(), content) + require.Len(t, direct, 1) + assert.Equal(t, "snapshot-abc123", direct[0].Name) + + tombstone := cache.DeletedFinalStateUnknown{Key: "snapshotcontent-abc123", Obj: content} + ref, ok := snapshotRefFromContentObj(tombstone) + require.True(t, ok) + assert.Equal(t, "snapshot-abc123", ref.Name) + assert.Equal(t, "inference", ref.Namespace) +} diff --git 
a/deploy/snapshot/cmd/agent/main.go b/deploy/snapshot/cmd/agent/main.go index e72bbf14f0ce..09d286cc840d 100644 --- a/deploy/snapshot/cmd/agent/main.go +++ b/deploy/snapshot/cmd/agent/main.go @@ -47,11 +47,19 @@ func main() { } }() - ctx, cancel := context.WithCancel(context.Background()) + // rootCtx is cancelled on signal. The single node controller drives both the + // restore (pod informer) and capture (SnapshotContent informer) paths and shuts + // down when rootCtx is cancelled. + rootCtx, cancel := context.WithCancel(context.Background()) defer cancel() sigChan := make(chan os.Signal, 1) signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) + go func() { + <-sigChan + agentLog.Info("Shutting down") + cancel() + }() agentLog.Info("Starting snapshot agent", "node", cfg.NodeName, @@ -59,34 +67,13 @@ func main() { "runtime", *runtimeType, ) + // The node controller handles both restore and capture paths. nodeController, err := controller.NewNodeController(cfg, rt, rootLog.WithName("controller")) if err != nil { fatal(agentLog, err, "Failed to create snapshot node controller") } - - // Run the node-local controller in the background. - controllerDone := make(chan error, 1) - go func() { - agentLog.Info("Snapshot node controller started") - controllerDone <- nodeController.Run(ctx) - }() - - // Wait for signal or controller exit. 
- select { - case <-sigChan: - agentLog.Info("Shutting down") - cancel() - select { - case err := <-controllerDone: - if err != nil { - agentLog.Error(err, "Snapshot node controller exited with error during shutdown") - } - default: - } - case err := <-controllerDone: - if err != nil { - fatal(agentLog, err, "Snapshot node controller exited with error") - } + if runErr := nodeController.Run(rootCtx); runErr != nil { + fatal(agentLog, runErr, "Snapshot node controller exited with error") } agentLog.Info("Agent stopped") diff --git a/deploy/snapshot/go.mod b/deploy/snapshot/go.mod index 3965404580e8..2b5a117b265e 100644 --- a/deploy/snapshot/go.mod +++ b/deploy/snapshot/go.mod @@ -3,6 +3,7 @@ module github.com/ai-dynamo/dynamo/deploy/snapshot go 1.26.3 require ( + github.com/ai-dynamo/dynamo/deploy/operator v0.0.0 github.com/checkpoint-restore/go-criu/v8 v8.2.0 github.com/containerd/containerd v1.7.30 github.com/cyphar/filepath-securejoin v0.5.1 @@ -11,9 +12,10 @@ require ( github.com/google/uuid v1.6.0 github.com/moby/sys/mountinfo v0.7.1 github.com/opencontainers/runtime-spec v1.2.0 - github.com/prometheus/procfs v0.16.1 + github.com/prometheus/procfs v0.17.0 + github.com/stretchr/testify v1.11.1 go.uber.org/zap v1.27.1 - golang.org/x/sys v0.40.0 + golang.org/x/sys v0.41.0 google.golang.org/grpc v1.79.3 google.golang.org/protobuf v1.36.11 gopkg.in/yaml.v3 v3.0.1 @@ -23,7 +25,7 @@ require ( k8s.io/cri-api v0.34.3 k8s.io/cri-client v0.34.3 k8s.io/kubelet v0.34.3 - k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 + k8s.io/utils v0.0.0-20260108192941-914a6e750570 sigs.k8s.io/controller-runtime v0.22.4 sigs.k8s.io/yaml v1.6.0 ) @@ -35,7 +37,7 @@ require ( github.com/Microsoft/hcsshim v0.11.7 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect - github.com/cenkalti/backoff/v4 v4.3.0 // indirect + github.com/cenkalti/backoff/v5 v5.0.3 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/containerd/cgroups 
v1.1.0 // indirect github.com/containerd/containerd/api v1.8.0 // indirect @@ -46,26 +48,27 @@ require ( github.com/containerd/platforms v0.2.1 // indirect github.com/containerd/ttrpc v1.2.7 // indirect github.com/containerd/typeurl/v2 v2.1.1 // indirect - github.com/davecgh/go-spew v1.1.1 // indirect + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/distribution/reference v0.6.0 // indirect github.com/docker/go-events v0.0.0-20190806004212-e31b211e4f1c // indirect - github.com/emicklei/go-restful/v3 v3.12.2 // indirect + github.com/emicklei/go-restful/v3 v3.13.0 // indirect github.com/evanphx/json-patch/v5 v5.9.11 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fxamacker/cbor/v2 v2.9.0 // indirect github.com/go-logr/stdr v1.2.2 // indirect - github.com/go-openapi/jsonpointer v0.21.0 // indirect - github.com/go-openapi/jsonreference v0.20.2 // indirect - github.com/go-openapi/swag v0.23.0 // indirect + github.com/go-openapi/jsonpointer v0.21.2 // indirect + github.com/go-openapi/jsonreference v0.21.0 // indirect + github.com/go-openapi/swag v0.23.1 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/google/gnostic-models v0.7.0 // indirect github.com/google/go-cmp v0.7.0 // indirect - github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 // indirect + github.com/imdario/mergo v0.3.16 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/klauspost/compress v1.18.0 // indirect - github.com/mailru/easyjson v0.7.7 // indirect + github.com/mailru/easyjson v0.9.0 // indirect github.com/moby/locker v1.0.1 // indirect github.com/moby/sys/sequential v0.5.0 // indirect github.com/moby/sys/signal v0.7.0 // indirect @@ -78,42 +81,46 @@ require ( github.com/opencontainers/image-spec v1.1.0 // indirect 
github.com/opencontainers/selinux v1.13.1 // indirect github.com/pkg/errors v0.9.1 // indirect - github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_golang v1.23.2 // indirect github.com/prometheus/client_model v0.6.2 // indirect - github.com/prometheus/common v0.66.1 // indirect + github.com/prometheus/common v0.67.5 // indirect github.com/sirupsen/logrus v1.9.3 // indirect - github.com/spf13/pflag v1.0.9 // indirect + github.com/spf13/pflag v1.0.10 // indirect github.com/x448/float16 v0.8.4 // indirect go.opencensus.io v0.24.0 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.60.0 // indirect - go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 // indirect go.opentelemetry.io/otel v1.41.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.34.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.34.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 // indirect go.opentelemetry.io/otel/metric v1.41.0 // indirect go.opentelemetry.io/otel/sdk v1.40.0 // indirect go.opentelemetry.io/otel/trace v1.41.0 // indirect - go.opentelemetry.io/proto/otlp v1.5.0 // indirect + go.opentelemetry.io/proto/otlp v1.7.1 // indirect go.uber.org/multierr v1.11.0 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect - golang.org/x/net v0.48.0 // indirect + golang.org/x/net v0.49.0 // indirect golang.org/x/oauth2 v0.34.0 // indirect golang.org/x/sync v0.19.0 // indirect - golang.org/x/term v0.38.0 // indirect - golang.org/x/text v0.32.0 // indirect - golang.org/x/time v0.12.0 // indirect + golang.org/x/term 
v0.40.0 // indirect + golang.org/x/text v0.34.0 // indirect + golang.org/x/time v0.13.0 // indirect google.golang.org/genproto v0.0.0-20231211222908-989df2bf70f3 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 // indirect gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect + k8s.io/apiextensions-apiserver v0.34.3 // indirect k8s.io/component-base v0.34.3 // indirect k8s.io/klog/v2 v2.130.1 // indirect k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect + sigs.k8s.io/gateway-api-inference-extension v1.2.0 // indirect sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect sigs.k8s.io/randfill v1.0.0 // indirect sigs.k8s.io/structured-merge-diff/v6 v6.3.1 // indirect ) + +replace github.com/ai-dynamo/dynamo/deploy/operator => ../operator diff --git a/deploy/snapshot/go.sum b/deploy/snapshot/go.sum index f87802f8d434..0a89b9b5bdf2 100644 --- a/deploy/snapshot/go.sum +++ b/deploy/snapshot/go.sum @@ -4,6 +4,9 @@ github.com/AdaLogics/go-fuzz-headers v0.0.0-20230811130428-ced1acdcaa24/go.mod h github.com/AdamKorcz/go-118-fuzz-build v0.0.0-20230306123547-8075edf89bb0 h1:59MxjQVfjXsBpLy+dbd2/ELV5ofnUkUZBvWSC85sheA= github.com/AdamKorcz/go-118-fuzz-build v0.0.0-20230306123547-8075edf89bb0/go.mod h1:OahwfttHWG6eJ0clwcfBAHoDI6X/LV/15hx/wlMZSrU= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3QEww= +github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= +github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= 
github.com/Microsoft/hcsshim v0.11.7 h1:vl/nj3Bar/CvJSYo7gIQPyRWc9f3c6IeSNavBTSZNZQ= @@ -12,8 +15,8 @@ github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= -github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= -github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= +github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM= +github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= @@ -41,18 +44,18 @@ github.com/containerd/ttrpc v1.2.7 h1:qIrroQvuOL9HQ1X6KHe2ohc7p+HP/0VE6XPU7elJRq github.com/containerd/ttrpc v1.2.7/go.mod h1:YCXHsb32f+Sq5/72xHubdiJRQY9inL4a4ZQrAbN1q9o= github.com/containerd/typeurl/v2 v2.1.1 h1:3Q4Pt7i8nYwy2KmQWIw2+1hTvwTE/6w9FqcttATPO/4= github.com/containerd/typeurl/v2 v2.1.1/go.mod h1:IDp2JFvbwZ31H8dQbEIY7sDl2L3o3HZj1hsSQlywkQ0= -github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/cyphar/filepath-securejoin v0.5.1 h1:eYgfMq5yryL4fbWfkLpFFy2ukSELzaJOTaUTuh+oF48= github.com/cyphar/filepath-securejoin v0.5.1/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod 
h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk= github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= github.com/docker/go-events v0.0.0-20190806004212-e31b211e4f1c h1:+pKlWGMw7gf6bQ+oDZB4KHQFypsfjYlq/C4rfL7D3g8= github.com/docker/go-events v0.0.0-20190806004212-e31b211e4f1c/go.mod h1:Uw6UezgYA44ePAFQYUehOuCzmy5zmg/+nl2ZfMWGkpA= -github.com/emicklei/go-restful/v3 v3.12.2 h1:DhwDP0vY3k8ZzE0RunuJy8GhNpPL6zqLkDf9B/a0/xU= -github.com/emicklei/go-restful/v3 v3.12.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/emicklei/go-restful/v3 v3.13.0 h1:C4Bl2xDndpU6nJ4bc1jXd+uTmYPVUwkD6bFY/oTyCes= +github.com/emicklei/go-restful/v3 v3.13.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= @@ -70,14 +73,12 @@ github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg= -github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= -github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= -github.com/go-openapi/jsonpointer v0.21.0/go.mod 
h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= -github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= -github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k= -github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= -github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= -github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= +github.com/go-openapi/jsonpointer v0.21.2 h1:AqQaNADVwq/VnkCmQg6ogE+M3FOsKTytwges0JdwVuA= +github.com/go-openapi/jsonpointer v0.21.2/go.mod h1:50I1STOfbY1ycR8jGz8DaMeLCdXiI6aDteEdRNNzpdk= +github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ= +github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4= +github.com/go-openapi/swag v0.23.1 h1:lpsStH0n2ittzTnbaSloVZLuB5+fvSY/+hnagBjSNZU= +github.com/go-openapi/swag v0.23.1/go.mod h1:STZs8TbRvEQQKUA+JZNAm3EWlgaOBGpyFDqQnDHMef0= github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= @@ -111,13 +112,15 @@ github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db h1:097atOisP2aRj7vFgYQBbFN4U4JNXUNYpxael3UzMyo= -github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= +github.com/google/pprof v0.0.0-20250923004556-9e5a51aed1e8 
h1:ZI8gCoCjGzPsum4L21jHdQs8shFBIQih1TM9Rd/c+EQ= +github.com/google/pprof v0.0.0-20250923004556-9e5a51aed1e8/go.mod h1:I6V7YzU0XDpsHqbsyrghnFZLO1gwK6NPTNvmetQIk9U= github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 h1:5ZPtiqj0JL5oKWmcsq4VMaAW5ukBEgSGXEN89zeH1Jo= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3/go.mod h1:ndYquD05frm2vACXE1nsccT4oJzjhw2arTS2cpUD1PI= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 h1:8Tjv8EJ+pM1xP8mK6egEbD1OgnVTyacbefKhmbLhIhU= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2/go.mod h1:pkJQ2tZHJ0aFOVEEot6oZmaVEZcRme73eIFmhiVuRWs= +github.com/imdario/mergo v0.3.16 h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4= +github.com/imdario/mergo v0.3.16/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+hD27ysY= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= @@ -126,17 +129,14 @@ github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= -github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= -github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= -github.com/kr/text v0.1.0/go.mod 
h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= -github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= -github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/mailru/easyjson v0.9.0 h1:PrnmzHw7262yW8sTBwxi1PdJA3Iw/EKBa8psRf7d9a4= +github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUto+XFtTU= github.com/moby/locker v1.0.1 h1:fOXqR41zeveg4fFODix+1Ch4mj/gT0NE1XJbp/epuBg= github.com/moby/locker v1.0.1/go.mod h1:S7SDdo5zpBK84bzzVlKr2V0hz+7x9hWbYC/kq7oQppc= github.com/moby/sys/mountinfo v0.7.1 h1:/tTvQaSJRr2FshkhXiIpux6fQ2Zvc4j7tAhMTStAG2g= @@ -157,10 +157,10 @@ github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFd github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/onsi/ginkgo/v2 v2.22.0 h1:Yed107/8DjTr0lKCNt7Dn8yQ6ybuDRQoMGrNFKzMfHg= -github.com/onsi/ginkgo/v2 v2.22.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C+NkhLQo= -github.com/onsi/gomega v1.36.1 h1:bJDPBO7ibjxcbHMgSCoo4Yj18UWbKDlLwX1x9sybDcw= -github.com/onsi/gomega v1.36.1/go.mod h1:PvZbdDc8J6XJEpDK4HCuRBm8a6Fzp9/DmhC9C7yFlog= +github.com/onsi/ginkgo/v2 v2.27.3 h1:ICsZJ8JoYafeXFFlFAG75a7CxMsJHwgKwtO+82SE9L8= +github.com/onsi/ginkgo/v2 v2.27.3/go.mod h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo= +github.com/onsi/gomega v1.38.3 
h1:eTX+W6dobAYfFeGC2PV6RwXRu/MyT+cQguijutvkpSM= +github.com/onsi/gomega v1.38.3/go.mod h1:ZCU1pkQcXDO5Sl9/VVEGlDyp+zm0m1cmeG5TOzLgdh4= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug= @@ -171,23 +171,24 @@ github.com/opencontainers/selinux v1.13.1 h1:A8nNeceYngH9Ow++M+VVEwJVpdFmrlxsN22 github.com/opencontainers/selinux v1.13.1/go.mod h1:S10WXZ/osk2kWOYKy1x2f/eXF5ZHJoUs8UU/2caNRbg= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= -github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= -github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= -github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= -github.com/prometheus/procfs 
v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= +github.com/prometheus/common v0.67.5 h1:pIgK94WWlQt1WLwAC5j2ynLaBRDiinoAb86HZHTUGI4= +github.com/prometheus/common v0.67.5/go.mod h1:SjE/0MzDEEAyrdr5Gqc6G+sXI67maCxzaT3A2+HqjUw= +github.com/prometheus/procfs v0.17.0 h1:FuLQ+05u4ZI+SS/w9+BWEM2TXiHKsUQ9TADiRH7DuK0= +github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUOVhe0wYB2zw= github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= -github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY= -github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk= +github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= @@ -210,14 +211,14 @@ go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.60.0 h1:x7wzEgXfnzJcHDwStJT+mxOz4etr2EcexjqhBvmoakw= go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.60.0/go.mod h1:rg+RlpR5dKwaS95IyyZqj5Wd4E13lk/msnTS0Xl9lJM= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 h1:F7Jx+6hwnZ41NSFTO5q4LYDtJRXBf2PD0rNBkeB/lus= 
-go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0/go.mod h1:UHB22Z8QsdRDrnAtX4PntOl36ajSxcdUMt1sF7Y6E7Q= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 h1:RbKq8BG0FI8OiXhBfcRtqqHcZcka+gU3cskNuf05R18= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0/go.mod h1:h06DGIukJOevXaj/xrNjhi/2098RZzcLTbc0jDAUbsg= go.opentelemetry.io/otel v1.41.0 h1:YlEwVsGAlCvczDILpUXpIpPSL/VPugt7zHThEMLce1c= go.opentelemetry.io/otel v1.41.0/go.mod h1:Yt4UwgEKeT05QbLwbyHXEwhnjxNO6D8L5PQP51/46dE= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.34.0 h1:OeNbIYk/2C15ckl7glBlOBp5+WlYsOElzTNmiPW/x60= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.34.0/go.mod h1:7Bept48yIeqxP2OZ9/AqIpYS94h2or0aB4FypJTc8ZM= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.34.0 h1:tgJ0uaNS4c98WRNUEx5U3aDlrDOI5Rs+1Vifcw4DJ8U= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.34.0/go.mod h1:U7HYyW0zt/a9x5J1Kjs+r1f/d4ZHnYFclhYY2+YbeoE= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 h1:GqRJVj7UmLjCVyVJ3ZFLdPRmhDUp2zFmQe3RHIOsw24= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0/go.mod h1:ri3aaHSmCTVYu2AWv44YMauwAQc0aqI9gHKIcSbI1pU= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 h1:lwI4Dc5leUqENgGuQImwLo4WnuXFPetmPpkLi2IrX54= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0/go.mod h1:Kz/oCE7z5wuyhPxsXDuaPteSWqjSBD5YaSdbxZYGbGk= go.opentelemetry.io/otel/metric v1.41.0 h1:rFnDcs4gRzBcsO9tS8LCpgR0dxg4aaxWlJxCno7JlTQ= go.opentelemetry.io/otel/metric v1.41.0/go.mod h1:xPvCwd9pU0VN8tPZYzDZV/BMj9CM9vs00GuBjeKhJps= go.opentelemetry.io/otel/sdk v1.40.0 h1:KHW/jUzgo6wsPh9At46+h4upjtccTmuZCFAc9OJ71f8= @@ -226,8 +227,8 @@ go.opentelemetry.io/otel/sdk/metric v1.40.0 h1:mtmdVqgQkeRxHgRv4qhyJduP3fYJRMX4A go.opentelemetry.io/otel/sdk/metric v1.40.0/go.mod h1:4Z2bGMf0KSK3uRjlczMOeMhKU2rhUqdWNoKcYrtcBPg= go.opentelemetry.io/otel/trace v1.41.0 
h1:Vbk2co6bhj8L59ZJ6/xFTskY+tGAbOnCtQGVVa9TIN0= go.opentelemetry.io/otel/trace v1.41.0/go.mod h1:U1NU4ULCoxeDKc09yCWdWe+3QoyweJcISEVa1RBzOis= -go.opentelemetry.io/proto/otlp v1.5.0 h1:xJvq7gMzB31/d406fB8U5CBdyQGw4P399D1aQWU/3i4= -go.opentelemetry.io/proto/otlp v1.5.0/go.mod h1:keN8WnHxOy8PG0rQZjJJ5A2ebUoafqWp0eVQ4yIXvJ4= +go.opentelemetry.io/proto/otlp v1.7.1 h1:gTOMpGDb0WTBOP8JaO72iL3auEZhVmAQg4ipjOVAtj4= +go.opentelemetry.io/proto/otlp v1.7.1/go.mod h1:b2rVh6rfI/s2pHWNlB7ILJcRALpcNDzKhACevjI+ZnE= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= @@ -247,6 +248,8 @@ golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvx golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.32.0 h1:9F4d3PHLljb6x//jOyokMv3eX+YDeepZSEo3mFJy93c= +golang.org/x/mod v0.32.0/go.mod h1:SgipZ/3h2Ci89DlEtEXWUk/HteuRin+HHhN+WbNhguU= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -256,8 +259,8 @@ golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLL golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod 
h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.48.0 h1:zyQRTTrjc33Lhh0fBgT/H3oZq9WuvRR5gPC70xpDiQU= -golang.org/x/net v0.48.0/go.mod h1:+ndRgGjkh8FGtu1w1FGbEC31if4VrNVMuKTgcAAnQRY= +golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o= +golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.34.0 h1:hqK/t4AKgbqWkdkcAeI8XLmbK+4m4G5YeQRrmiotGlw= golang.org/x/oauth2 v0.34.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= @@ -275,16 +278,16 @@ golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20211025201205-69cdffdb9359/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ= -golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= -golang.org/x/term v0.38.0 h1:PQ5pkm/rLO6HnxFR7N2lJHOZX6Kez5Y1gDSJla6jo7Q= -golang.org/x/term v0.38.0/go.mod h1:bSEAKrOT1W+VSu9TSCMtoGEOUcKxOKgl3LE5QEF/xVg= +golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k= +golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/term v0.40.0 h1:36e4zGLqU4yhjlmxEaagx2KuYbJq3EwY8K943ZsHcvg= +golang.org/x/term v0.40.0/go.mod h1:w2P8uVp06p2iyKKuvXIm7N/y0UCRt3UfJTfZ7oOpglM= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.32.0 h1:ZD01bjUt1FQ9WJ0ClOL5vxgxOI/sVCNgX1YtKwcY0mU= -golang.org/x/text v0.32.0/go.mod h1:o/rUWzghvpD5TXrTIBuJU77MTaN0ljMWE47kxGJQ7jY= 
-golang.org/x/time v0.12.0 h1:ScB/8o8olJvc+CQPWrK3fPZNfh7qgwCrY0zJmoEQLSE= -golang.org/x/time v0.12.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= +golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk= +golang.org/x/text v0.34.0/go.mod h1:homfLqTYRFyVYemLBFl5GgL/DWEiH5wcsQ5gSh1yziA= +golang.org/x/time v0.13.0 h1:eUlYslOIt32DgYD6utsuUeHs4d7AsEYLuIAdg7FlYgI= +golang.org/x/time v0.13.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= @@ -293,8 +296,8 @@ golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBn golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.39.0 h1:ik4ho21kwuQln40uelmciQPp9SipgNDdrafrYA4TmQQ= -golang.org/x/tools v0.39.0/go.mod h1:JnefbkDPyD8UU2kI5fuf8ZX4/yUeh9W877ZeBONxUqQ= +golang.org/x/tools v0.41.0 h1:a9b8iMweWG+S0OBnlU36rzLp20z1Rp10w+IY2czHTQc= +golang.org/x/tools v0.41.0/go.mod h1:XSY6eDqxVNiYgezAVqqCeihT4j1U2CCsqvH3WhQpnlg= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -346,8 +349,8 @@ honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWh honnef.co/go/tools 
v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= k8s.io/api v0.34.3 h1:D12sTP257/jSH2vHV2EDYrb16bS7ULlHpdNdNhEw2S4= k8s.io/api v0.34.3/go.mod h1:PyVQBF886Q5RSQZOim7DybQjAbVs8g7gwJNhGtY5MBk= -k8s.io/apiextensions-apiserver v0.34.1 h1:NNPBva8FNAPt1iSVwIE0FsdrVriRXMsaWFMqJbII2CI= -k8s.io/apiextensions-apiserver v0.34.1/go.mod h1:hP9Rld3zF5Ay2Of3BeEpLAToP+l4s5UlxiHfqRaRcMc= +k8s.io/apiextensions-apiserver v0.34.3 h1:p10fGlkDY09eWKOTeUSioxwLukJnm+KuDZdrW71y40g= +k8s.io/apiextensions-apiserver v0.34.3/go.mod h1:aujxvqGFRdb/cmXYfcRTeppN7S2XV/t7WMEc64zB5A0= k8s.io/apimachinery v0.34.3 h1:/TB+SFEiQvN9HPldtlWOTp0hWbJ+fjU+wkxysf/aQnE= k8s.io/apimachinery v0.34.3/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw= k8s.io/client-go v0.34.3 h1:wtYtpzy/OPNYf7WyNBTj3iUA0XaBHVqhv4Iv3tbrF5A= @@ -364,10 +367,12 @@ k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 h1:Y3gxNAuB0OBLImH611+UDZ k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912/go.mod h1:kdmbQkyfwUagLfXIad1y2TdrjPFWp2Q89B3qkRwf/pQ= k8s.io/kubelet v0.34.3 h1:8QRev2FmasZ05yCC774qn6ULche72PYM7AQv0CVt9CM= k8s.io/kubelet v0.34.3/go.mod h1:pMgblr+nVQ02UkyaTcgqzS3AIYVQkjlMFg1Pd5rGC1Q= -k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzkbzn+gDM4X9T4Ck= -k8s.io/utils v0.0.0-20251002143259-bc988d571ff4/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +k8s.io/utils v0.0.0-20260108192941-914a6e750570 h1:JT4W8lsdrGENg9W+YwwdLJxklIuKWdRm+BC+xt33FOY= +k8s.io/utils v0.0.0-20260108192941-914a6e750570/go.mod h1:xDxuJ0whA3d0I4mf/C4ppKHxXynQ+fxnkmQH0vTHnuk= sigs.k8s.io/controller-runtime v0.22.4 h1:GEjV7KV3TY8e+tJ2LCTxUTanW4z/FmNB7l327UfMq9A= sigs.k8s.io/controller-runtime v0.22.4/go.mod h1:+QX1XUpTXN4mLoblf4tqr5CQcyHPAki2HLXqQMY6vh8= +sigs.k8s.io/gateway-api-inference-extension v1.2.0 h1:7H+ijrUImnW2ubcTakNgV723xDIdQx1Umv4vDVB+tTk= +sigs.k8s.io/gateway-api-inference-extension v1.2.0/go.mod h1:/HWeqxuOMjFM56YwJ2Spt3qceK7Spz4hk6ZfXYgE9a8= sigs.k8s.io/json 
v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= diff --git a/deploy/snapshot/internal/controller/controller.go b/deploy/snapshot/internal/controller/controller.go index 9f4c5d27d3f2..9535dd8be512 100644 --- a/deploy/snapshot/internal/controller/controller.go +++ b/deploy/snapshot/internal/controller/controller.go @@ -16,17 +16,23 @@ import ( "github.com/go-logr/logr" "github.com/google/uuid" - batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/client-go/dynamic" + "k8s.io/client-go/dynamic/dynamicinformer" "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" "k8s.io/client-go/tools/cache" - "k8s.io/client-go/util/retry" + "sigs.k8s.io/controller-runtime/pkg/client" + nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/executor" snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime" "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types" @@ -34,17 +40,27 @@ import ( ) // NodeController watches local-node pods with checkpoint metadata and reconciles -// snapshot execution for checkpoint and restore requests. +// snapshot execution for checkpoint and restore requests. 
The restore path is +// driven by a client-go pod informer; the capture path is driven by a dynamic +// informer over SnapshotContent work orders filtered to this node, with typed +// reads/writes via an uncached controller-runtime client. type NodeController struct { - config *types.AgentConfig - clientset kubernetes.Interface - runtime snapshotruntime.Runtime - log logr.Logger - holderID string + config *types.AgentConfig + clientset kubernetes.Interface + client client.Client + dynClient dynamic.Interface + runtime snapshotruntime.Runtime + log logr.Logger + holderID string + checkpointFn func(ctx context.Context, params CheckpointParams) error inFlight map[string]struct{} inFlightMu sync.Mutex + // contentIndexer is the SnapshotContent informer's indexer, indexed by source pod + // (podRefIndex). The source-pod informer uses it to map a pod event back to its work order. + contentIndexer cache.Indexer + stopCh chan struct{} } @@ -57,8 +73,15 @@ const ( containerResolveAttemptTimeout = 1 * time.Second restoreContainerResolveInterval = 50 * time.Millisecond restoreContainerResolveTimeout = 30 * time.Second + + // snapshotContentResyncInterval re-drives every SnapshotContent work order so a + // not-yet-Ready source pod is re-checked for quiesce without a busy loop. + snapshotContentResyncInterval = 10 * time.Second ) +// snapshotContentGVR is the cluster-scoped resource the capture informer watches. +var snapshotContentGVR = nvidiacomv1alpha1.GroupVersion.WithResource("snapshotcontents") + // NewNodeController creates the node-local controller that runs inside snapshot-agent. 
func NewNodeController( cfg *types.AgentConfig, @@ -75,15 +98,33 @@ func NewNodeController( return nil, fmt.Errorf("failed to create kubernetes client: %w", err) } - return &NodeController{ + scheme := runtime.NewScheme() + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(nvidiacomv1alpha1.AddToScheme(scheme)) + + typedClient, err := client.New(restConfig, client.Options{Scheme: scheme}) + if err != nil { + return nil, fmt.Errorf("failed to create typed client: %w", err) + } + + dynClient, err := dynamic.NewForConfig(restConfig) + if err != nil { + return nil, fmt.Errorf("failed to create dynamic client: %w", err) + } + + w := &NodeController{ config: cfg, clientset: clientset, + client: typedClient, + dynClient: dynClient, runtime: rt, log: log, holderID: "snapshot-agent/" + uuid.NewString(), inFlight: make(map[string]struct{}), stopCh: make(chan struct{}), - }, nil + } + w.checkpointFn = w.executorCheckpoint + return w, nil } // Run starts the local pod informers and processes checkpoint/restore events. @@ -104,43 +145,6 @@ func (w *NodeController) Run(ctx context.Context) error { var syncFuncs []cache.InformerSynced - // Checkpoint informer - checkpointSelector := labels.SelectorFromSet(labels.Set{ - snapshotprotocol.CheckpointSourceLabel: "true", - }).String() - - ckptFactoryOpts := append([]informers.SharedInformerOption{ - informers.WithTweakListOptions(func(opts *metav1.ListOptions) { - opts.LabelSelector = checkpointSelector - }), - }, nsOptions...) 
- - ckptFactory := informers.NewSharedInformerFactoryWithOptions( - w.clientset, 30*time.Second, ckptFactoryOpts..., - ) - - ckptInformer := ckptFactory.Core().V1().Pods().Informer() - if _, err := ckptInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ - AddFunc: func(obj interface{}) { - pod, ok := podFromInformerObj(obj) - if !ok { - return - } - w.reconcileCheckpointPod(ctx, pod) - }, - UpdateFunc: func(_, newObj interface{}) { - pod, ok := podFromInformerObj(newObj) - if !ok { - return - } - w.reconcileCheckpointPod(ctx, pod) - }, - }); err != nil { - return fmt.Errorf("failed to add checkpoint informer handler: %w", err) - } - go ckptFactory.Start(w.stopCh) - syncFuncs = append(syncFuncs, ckptInformer.HasSynced) - // Restore pods carry a checkpoint ID but are not checkpoint sources. restoreSel, err := labels.Parse(snapshotprotocol.CheckpointIDLabel + ",!" + snapshotprotocol.CheckpointSourceLabel) if err != nil { @@ -180,119 +184,79 @@ func (w *NodeController) Run(ctx context.Context) error { go restoreFactory.Start(w.stopCh) syncFuncs = append(syncFuncs, restoreInformer.HasSynced) - if !cache.WaitForCacheSync(w.stopCh, syncFuncs...) 
{ - return fmt.Errorf("failed to sync informer caches") - } - - w.log.Info("Snapshot node controller started and caches synced") - <-ctx.Done() - close(w.stopCh) - return nil -} - -func (w *NodeController) reconcileCheckpointPod(ctx context.Context, pod *corev1.Pod) { - if pod.Spec.NodeName != w.config.NodeName { - return - } - - podKey := fmt.Sprintf("%s/%s", pod.Namespace, pod.Name) - - checkpointID, ok := pod.Labels[snapshotprotocol.CheckpointIDLabel] - if !ok || checkpointID == "" { - w.log.Info("Pod has checkpoint label but no checkpoint-id label", "pod", podKey) - return - } - - job, err := getCheckpointJob(ctx, w.clientset, pod) - if err != nil { - w.log.Error(err, "Failed to resolve checkpoint job", "pod", podKey) - return - } - - jobStatus := job.Annotations[snapshotprotocol.CheckpointStatusAnnotation] - if jobStatus == snapshotprotocol.CheckpointStatusFailed { - return - } - - for i := range pod.Status.ContainerStatuses { - failed := &pod.Status.ContainerStatuses[i] - term := failed.State.Terminated - if term == nil || term.ExitCode == 0 { - continue - } - message := fmt.Sprintf("Checkpoint container %q terminated with exit code %d", failed.Name, term.ExitCode) - if term.Reason != "" { - message = fmt.Sprintf("%s: %s", message, term.Reason) - } - opLog := w.log.WithValues("pod", podKey, "checkpoint_id", checkpointID, "container", failed.Name) - opLog.Info("Checkpoint pod container failed", "exit_code", term.ExitCode, "reason", term.Reason) - emitPodEvent(ctx, w.clientset, opLog, pod, "snapshot", corev1.EventTypeWarning, "CheckpointFailed", message) - if err := annotateJob(ctx, w.clientset, opLog, job, map[string]string{ - snapshotprotocol.CheckpointStatusAnnotation: snapshotprotocol.CheckpointStatusFailed, - }); err != nil { - opLog.Error(err, "Failed to mark checkpoint job failed") - } - reason := fmt.Sprintf("checkpoint container %s failed", failed.Name) - for _, status := range pod.Status.ContainerStatuses { - if status.State.Running == nil || 
status.ContainerID == "" { - continue - } - containerID := snapshotruntime.StripCRIScheme(status.ContainerID) - resolveCtx, cancel := context.WithTimeout(ctx, containerResolveAttemptTimeout) - pid, _, err := w.runtime.ResolveContainer(resolveCtx, containerID) - cancel() - if err != nil { - opLog.Error(err, "Failed to resolve running checkpoint container", "container", status.Name) - continue + // Capture path: a dynamic informer over SnapshotContent work orders, filtered at + // the list/watch level to this node's mirror label. The node-label filter is the + // node scoping; reconcileSnapshotContent keeps a defensive nodeName check. + nodeContentSelector := labels.SelectorFromSet(labels.Set{snapshotprotocol.SnapshotNodeLabel: w.config.NodeName}).String() + dynFactory := dynamicinformer.NewFilteredDynamicSharedInformerFactory( + w.dynClient, snapshotContentResyncInterval, metav1.NamespaceAll, + func(opts *metav1.ListOptions) { + opts.LabelSelector = nodeContentSelector + }, + ) + contentInformer := dynFactory.ForResource(snapshotContentGVR).Informer() + // Index work orders by their source pod so a source-pod event maps back to its + // SnapshotContent in O(1). Must be registered before the informer starts. 
+ if err := contentInformer.AddIndexers(cache.Indexers{podRefIndex: podRefIndexFunc}); err != nil { + return fmt.Errorf("failed to add snapshot-content podRef indexer: %w", err) + } + w.contentIndexer = contentInformer.GetIndexer() + if _, err := contentInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: func(obj interface{}) { + if name, ok := contentNameFromInformerObj(obj); ok { + w.reconcileSnapshotContent(ctx, name) } - if err := snapshotruntime.SendSignalToPID(opLog, pid, syscall.SIGKILL, reason); err != nil { - opLog.Error(err, "Failed to signal running checkpoint container", "container", status.Name) + }, + UpdateFunc: func(_, newObj interface{}) { + if name, ok := contentNameFromInformerObj(newObj); ok { + w.reconcileSnapshotContent(ctx, name) } - } - return - } - - if jobStatus == snapshotprotocol.CheckpointStatusCompleted { - return - } - - // Checkpoint contract: exactly one target container per job. - targets, err := snapshotprotocol.TargetContainersFromAnnotations(pod.Annotations, 1, 1) - if err != nil { - w.log.Error(err, "Checkpoint pod missing target-containers annotation", "pod", podKey) - return - } - containerName := targets[0] - if !isContainerReady(pod, containerName) { - return + }, + }); err != nil { + return fmt.Errorf("failed to add snapshot-content informer handler: %w", err) } + go dynFactory.Start(w.stopCh) + syncFuncs = append(syncFuncs, contentInformer.HasSynced) - if !w.tryAcquire(podKey) { - return + // Source-pod informer: capture-source pods carry CheckpointSourceLabel=true. A pod status + // change (a checkpoint container crashing, or the target becoming ready) does not touch the + // SnapshotContent, so without this trigger it would only be acted on at the content informer's + // resync. It needs its own factory: its selector is disjoint from the restore informer's. 
+ sourceSelector := labels.SelectorFromSet(labels.Set{snapshotprotocol.CheckpointSourceLabel: "true"}).String() + sourceFactoryOpts := append([]informers.SharedInformerOption{ + informers.WithTweakListOptions(func(opts *metav1.ListOptions) { + opts.LabelSelector = sourceSelector + }), + }, nsOptions...) + sourceFactory := informers.NewSharedInformerFactoryWithOptions( + w.clientset, 30*time.Second, sourceFactoryOpts..., + ) + sourceInformer := sourceFactory.Core().V1().Pods().Informer() + if _, err := sourceInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: func(obj interface{}) { + if pod, ok := podFromInformerObj(obj); ok { + w.reconcileSourcePod(ctx, pod) + } + }, + UpdateFunc: func(_, newObj interface{}) { + if pod, ok := podFromInformerObj(newObj); ok { + w.reconcileSourcePod(ctx, pod) + } + }, + }); err != nil { + return fmt.Errorf("failed to add source-pod informer handler: %w", err) } + go sourceFactory.Start(w.stopCh) + syncFuncs = append(syncFuncs, sourceInformer.HasSynced) - acquiredLease, err := acquireCheckpointLease(ctx, w.clientset, w.log, job, w.holderID) - if err != nil { - w.release(podKey) - w.log.Error(err, "Failed to acquire checkpoint lease", "pod", podKey, "checkpoint_id", checkpointID) - return - } - if !acquiredLease { - w.release(podKey) - return + if !cache.WaitForCacheSync(w.stopCh, syncFuncs...) 
{ + return fmt.Errorf("failed to sync informer caches") } - startedAt := time.Now() - w.log.Info("Checkpoint target detected, triggering checkpoint", "pod", podKey, "checkpoint_id", checkpointID) - emitPodEvent(ctx, w.clientset, w.log, pod, "snapshot", corev1.EventTypeNormal, "CheckpointRequested", fmt.Sprintf("Checkpoint requested: %s", checkpointID)) - - go func() { - if err := w.runCheckpoint(ctx, pod, job, checkpointID, containerName, podKey, startedAt); err != nil { - opLog := w.log.WithValues("pod", podKey, "checkpoint_id", checkpointID) - opLog.Error(err, "Checkpoint controller worker failed") - emitPodEvent(ctx, w.clientset, opLog, pod, "snapshot", corev1.EventTypeWarning, "CheckpointWorkerFailed", err.Error()) - } - }() + w.log.Info("Snapshot node controller started and caches synced") + <-ctx.Done() + close(w.stopCh) + return nil } func (w *NodeController) reconcileRestorePod(ctx context.Context, pod *corev1.Pod) { @@ -523,200 +487,6 @@ func (w *NodeController) startRestoreForContainer( }() } -// runCheckpoint runs the full checkpoint workflow for a pod: -// 1. Hold and renew the checkpoint lease -// 2. Resolve the container ID and host PID -// 3. Call executor.Checkpoint (inspect → configure → CUDA lock/checkpoint → CRIU dump → rootfs diff) -// 4. Write a snapshot-complete sentinel into the pod's snapshot-control -// volume on success (observed by the workload via polling), or SIGKILL -// on failure (unrecoverable CUDA-locked process) -// 5. 
Mark job as completed or failed -func (w *NodeController) runCheckpoint(ctx context.Context, pod *corev1.Pod, job *batchv1.Job, checkpointID, containerName, podKey string, startedAt time.Time) error { - releasePodOnExit := true - defer func() { - if releasePodOnExit { - w.release(podKey) - } - }() - log := w.log.WithValues("pod", podKey, "checkpoint_id", checkpointID) - leaseCtx, stopLease := context.WithCancelCause(ctx) - defer stopLease(nil) - - releaseLeaseOnExit := true - defer func() { - if !releaseLeaseOnExit { - return - } - releaseCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) - defer cancel() - if err := releaseCheckpointLease(releaseCtx, w.clientset, log, job, w.holderID); err != nil { - log.Error(err, "Failed to release checkpoint lease") - } - }() - - go w.renewCheckpointLease(leaseCtx, log, job, stopLease) - - setCheckpointStatus := func(value string) (bool, error) { - if value != snapshotprotocol.CheckpointStatusCompleted { - if err := annotateJob(ctx, w.clientset, log, job, map[string]string{ - snapshotprotocol.CheckpointStatusAnnotation: value, - }); err != nil { - releasePodOnExit = false - releaseLeaseOnExit = false - return false, fmt.Errorf("failed to persist terminal checkpoint status %q: %w", value, err) - } - return true, nil - } - - updated := false - jobClient := w.clientset.BatchV1().Jobs(job.Namespace) - err := retry.RetryOnConflict(retry.DefaultRetry, func() error { - current, err := jobClient.Get(ctx, job.Name, metav1.GetOptions{}) - if err != nil { - return fmt.Errorf("failed to get current checkpoint job %s/%s: %w", job.Namespace, job.Name, err) - } - if current.Annotations[snapshotprotocol.CheckpointStatusAnnotation] == snapshotprotocol.CheckpointStatusFailed { - updated = false - return nil - } - if current.Annotations == nil { - current.Annotations = map[string]string{} - } - current.Annotations[snapshotprotocol.CheckpointStatusAnnotation] = value - if _, err := jobClient.Update(ctx, current, 
metav1.UpdateOptions{}); err != nil { - return err - } - updated = true - return nil - }) - if err != nil { - releasePodOnExit = false - releaseLeaseOnExit = false - return false, fmt.Errorf("failed to persist terminal checkpoint status %q: %w", value, err) - } - if !updated { - log.Info("Skipping checkpoint completion because checkpoint job is already failed", - "job", fmt.Sprintf("%s/%s", job.Namespace, job.Name), - ) - } - return updated, nil - } - - // Resolve the target container ID from pod status. - var containerID string - for _, cs := range pod.Status.ContainerStatuses { - if cs.Name == containerName { - containerID = snapshotruntime.StripCRIScheme(cs.ContainerID) - break - } - } - if containerID == "" { - emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "CheckpointFailed", fmt.Sprintf("Could not resolve container %q ID", containerName)) - if _, statusErr := setCheckpointStatus(snapshotprotocol.CheckpointStatusFailed); statusErr != nil { - return statusErr - } - return nil - } - - // Resolve the container's host PID (needed for signaling after checkpoint) - containerPID, _, err := w.runtime.ResolveContainer(ctx, containerID) - if err != nil { - log.Error(err, "Failed to resolve container") - emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "CheckpointFailed", fmt.Sprintf("Container resolve failed: %v", err)) - if _, statusErr := setCheckpointStatus(snapshotprotocol.CheckpointStatusFailed); statusErr != nil { - return statusErr - } - return nil - } - - checkpointLocation, err := w.checkpointLocationsFromPod(pod, checkpointID, containerPID) - if err != nil { - log.Error(err, "Checkpoint pod is missing storage metadata") - emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "CheckpointFailed", err.Error()) - if _, statusErr := setCheckpointStatus(snapshotprotocol.CheckpointStatusFailed); statusErr != nil { - return statusErr - } - return nil - } - if err := 
w.validatePodMountContainerPID(ctx, containerID, containerPID); err != nil { - log.Error(err, "Checkpoint container changed before storage access") - emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "CheckpointFailed", err.Error()) - if _, statusErr := setCheckpointStatus(snapshotprotocol.CheckpointStatusFailed); statusErr != nil { - return statusErr - } - return nil - } - - // Step 1: Run the checkpoint orchestrator - req := executor.CheckpointRequest{ - ContainerID: containerID, - ContainerName: containerName, - CheckpointID: checkpointID, - CheckpointLocation: checkpointLocation.HostPath, - StartedAt: startedAt, - NodeName: w.config.NodeName, - PodName: pod.Name, - PodNamespace: pod.Namespace, - Clientset: w.clientset, - } - if err := executor.Checkpoint(leaseCtx, w.runtime, log, req, w.config); err != nil { - if cause := context.Cause(leaseCtx); cause != nil && cause != context.Canceled { - err = fmt.Errorf("checkpoint lease lost: %w", cause) - } - log.Error(err, "Checkpoint failed") - emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "CheckpointFailed", err.Error()) - // SIGKILL on failure: process is unrecoverable (CUDA locked), terminate immediately - if signalErr := snapshotruntime.SendSignalToPID(log, containerPID, syscall.SIGKILL, "checkpoint failed"); signalErr != nil { - log.Error(signalErr, "Failed to signal checkpoint failure to runtime process") - } - if _, statusErr := setCheckpointStatus(snapshotprotocol.CheckpointStatusFailed); statusErr != nil { - return statusErr - } - return nil - } - - info, err := os.Stat(checkpointLocation.HostPath) - if err != nil || !info.IsDir() { - if err == nil { - err = fmt.Errorf("published checkpoint path %s is not a directory", checkpointLocation.HostPath) - } else { - err = fmt.Errorf("published checkpoint path %s is missing: %w", checkpointLocation.HostPath, err) - } - log.Error(err, "Checkpoint failed verification") - emitPodEvent(ctx, w.clientset, log, 
pod, "snapshot", corev1.EventTypeWarning, "CheckpointFailed", err.Error()) - if signalErr := snapshotruntime.SendSignalToPID(log, containerPID, syscall.SIGKILL, "checkpoint verification failed"); signalErr != nil { - log.Error(signalErr, "Failed to signal checkpoint verification failure to runtime process") - } - if _, statusErr := setCheckpointStatus(snapshotprotocol.CheckpointStatusFailed); statusErr != nil { - return statusErr - } - return nil - } - // Step 2: Sentinel on success. Workload observes via polling on the - // snapshot-control volume; containerPID is a PID inside the container's - // mount namespace, which is all the /host/proc//root write path - // requires. The Succeeded event is emitted only after the sentinel has - // been written and the terminal status has been persisted so failures don't - // produce conflicting Succeeded+Failed events for the same operation. - if err := snapshotruntime.WriteControlSentinel(containerPID, snapshotprotocol.SnapshotCompleteFile); err != nil { - log.Error(err, "Failed to write snapshot-complete sentinel") - emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeWarning, "CheckpointFailed", err.Error()) - if _, statusErr := setCheckpointStatus(snapshotprotocol.CheckpointStatusFailed); statusErr != nil { - return statusErr - } - return nil - } - - updated, err := setCheckpointStatus(snapshotprotocol.CheckpointStatusCompleted) - if err != nil { - return err - } - if updated { - emitPodEvent(ctx, w.clientset, log, pod, "snapshot", corev1.EventTypeNormal, "CheckpointSucceeded", fmt.Sprintf("Checkpoint completed: %s", checkpointID)) - } - return nil -} - // runRestore runs the full restore workflow for one target container: // 1. Annotate the pod with restore in_progress // 2. 
Call executor.Restore (inspect placeholder → nsrestore inside namespace) @@ -843,6 +613,64 @@ func (w *NodeController) release(podKey string) { delete(w.inFlight, podKey) } +// podRefIndex is the SnapshotContent informer index keyed by source pod ("<namespace>/<name>"). +const podRefIndex = "byPodRef" + +// podRefIndexFunc indexes a SnapshotContent by its source pod ("<namespace>/<name>"). +// It runs against the dynamic informer's *unstructured.Unstructured objects; an unexpected type or a +// missing field yields no index entry (nil) rather than an error, so it never poisons the index. +func podRefIndexFunc(obj interface{}) ([]string, error) { + u, ok := obj.(*unstructured.Unstructured) + if !ok { + return nil, nil + } + ns, _, _ := unstructured.NestedString(u.Object, "spec", "snapshotRef", "namespace") + name, _, _ := unstructured.NestedString(u.Object, "spec", "source", "podRef", "name") + if ns == "" || name == "" { + return nil, nil + } + return []string{ns + "/" + name}, nil +} + +// contentFromInformerObj converts a dynamic informer object (or its DeletedFinalStateUnknown +// tombstone) to a typed SnapshotContent. It returns false on an unexpected type. +func contentFromInformerObj(obj interface{}) (*nvidiacomv1alpha1.SnapshotContent, bool) { + if tombstone, ok := obj.(cache.DeletedFinalStateUnknown); ok { + obj = tombstone.Obj + } + u, ok := obj.(*unstructured.Unstructured) + if !ok { + return nil, false + } + content := &nvidiacomv1alpha1.SnapshotContent{} + if err := runtime.DefaultUnstructuredConverter.FromUnstructured(u.Object, content); err != nil { + return nil, false + } + return content, true +} + +// chooseActiveContent returns the name of the oldest non-terminal SnapshotContent among the indexed +// objects (oldest first by CreationTimestamp, ties broken by Name), or "" when none are active. +// Driving the oldest until it finishes gives deterministic, stable selection across pod events.
+func chooseActiveContent(objs []interface{}) string { + var chosen *nvidiacomv1alpha1.SnapshotContent + for _, obj := range objs { + content, ok := contentFromInformerObj(obj) + if !ok || isContentTerminal(content) { + continue + } + if chosen == nil || + content.CreationTimestamp.Before(&chosen.CreationTimestamp) || + (content.CreationTimestamp.Equal(&chosen.CreationTimestamp) && content.Name < chosen.Name) { + chosen = content + } + } + if chosen == nil { + return "" + } + return chosen.Name +} + func (w *NodeController) checkpointLocationsFromPod(pod *corev1.Pod, checkpointID string, hostPID int) (checkpointLocations, error) { rawBasePath, hasBasePathAnnotation := pod.Annotations[snapshotprotocol.CheckpointStorageBasePathAnnotation] basePath := strings.TrimSpace(rawBasePath) diff --git a/deploy/snapshot/internal/controller/controller_test.go b/deploy/snapshot/internal/controller/controller_test.go index 19a7632e6c49..0ac4a0e6a0ae 100644 --- a/deploy/snapshot/internal/controller/controller_test.go +++ b/deploy/snapshot/internal/controller/controller_test.go @@ -11,8 +11,6 @@ import ( "github.com/go-logr/logr/testr" specs "github.com/opencontainers/runtime-spec/specs-go" - batchv1 "k8s.io/api/batch/v1" - coordinationv1 "k8s.io/api/coordination/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -32,12 +30,18 @@ const testContainerID = "test-container" type fakeRuntime struct { containerIDByPod string resolvedContainerIDs []string + // resolveContainerPID, when set, is returned by ResolveContainer with no error so the + // capture path can advance past container resolution. 
+ resolveContainerPID int } var _ snapshotruntime.Runtime = (*fakeRuntime)(nil) func (r *fakeRuntime) ResolveContainer(ctx context.Context, id string) (int, *specs.Spec, error) { r.resolvedContainerIDs = append(r.resolvedContainerIDs, id) + if r.resolveContainerPID > 0 { + return r.resolveContainerPID, nil, nil + } return 0, nil, errors.New("not implemented") } func (r *fakeRuntime) ResolveContainerIDByPod(ctx context.Context, pod, ns, ctr string) (string, error) { @@ -52,8 +56,8 @@ func (r *fakeRuntime) ResolveContainerByPod(ctx context.Context, pod, ns, ctr st func (r *fakeRuntime) Close() error { return nil } // makeTestController creates a NodeController with a fake k8s client and nil executors. -// The fake clientset is empty so any goroutine launched by runCheckpoint/runRestore -// will fail on the first annotatePod call and exit cleanly. +// The fake clientset is empty so any goroutine launched by the restore path will fail on +// the first annotatePod call and exit cleanly. func makeTestController(t *testing.T, objs ...runtime.Object) *NodeController { t.Helper() return &NodeController{ @@ -87,23 +91,6 @@ func sawEventReason(clientset *fake.Clientset, reason string) bool { return false } -func makeLease(namespace, name, holder string, renewTime time.Time) *coordinationv1.Lease { - leaseDurationSeconds := int32(checkpointLeaseDuration.Seconds()) - renewMicroTime := metav1.NewMicroTime(renewTime) - return &coordinationv1.Lease{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: namespace, - }, - Spec: coordinationv1.LeaseSpec{ - HolderIdentity: &holder, - LeaseDurationSeconds: &leaseDurationSeconds, - AcquireTime: &renewMicroTime, - RenewTime: &renewMicroTime, - }, - } -} - func makePod(name, namespace, nodeName string, phase corev1.PodPhase, ready bool, labels, annotations map[string]string) *corev1.Pod { var conditions []corev1.PodCondition if ready { @@ -315,230 +302,6 @@ func TestRestoreCheckpointReady(t *testing.T) { }) } -func 
TestReconcileCheckpointPod(t *testing.T) { - tests := []struct { - name string - nodeName string - phase corev1.PodPhase - ready bool - hash string - annotation string - lease *coordinationv1.Lease - preSeed bool // pre-populate inFlight to test deduplication - want bool // true = pod passes filtering and triggers checkpoint - }{ - { - name: "happy path", - nodeName: testNodeName, - phase: corev1.PodRunning, - ready: true, - hash: "abc123", - want: true, - }, - { - name: "wrong node", - nodeName: "other-node", - phase: corev1.PodRunning, - ready: true, - hash: "abc123", - want: false, - }, - { - name: "not running", - nodeName: testNodeName, - phase: corev1.PodPending, - ready: false, - hash: "abc123", - want: false, - }, - { - name: "running but not ready", - nodeName: testNodeName, - phase: corev1.PodRunning, - ready: false, - hash: "abc123", - want: false, - }, - { - name: "missing hash label", - nodeName: testNodeName, - phase: corev1.PodRunning, - ready: true, - hash: "", - want: false, - }, - { - name: "already completed", - nodeName: testNodeName, - phase: corev1.PodRunning, - ready: true, - hash: "abc123", - annotation: "completed", - want: false, - }, - { - name: "already failed", - nodeName: testNodeName, - phase: corev1.PodRunning, - ready: true, - hash: "abc123", - annotation: "failed", - want: false, - }, - { - name: "active lease held elsewhere", - nodeName: testNodeName, - phase: corev1.PodRunning, - ready: true, - hash: "abc123", - lease: makeLease("default", "checkpoint-job", "other-holder", time.Now()), - want: false, - }, - { - name: "expired lease can be reclaimed", - nodeName: testNodeName, - phase: corev1.PodRunning, - ready: true, - hash: "abc123", - lease: makeLease("default", "checkpoint-job", "other-holder", time.Now().Add(-checkpointLeaseDuration-time.Second)), - want: true, - }, - { - name: "duplicate in-flight", - nodeName: testNodeName, - phase: corev1.PodRunning, - ready: true, - hash: "abc123", - preSeed: true, - want: false, - }, - 
} - - for _, tc := range tests { - t.Run(tc.name, func(t *testing.T) { - labels := map[string]string{ - snapshotprotocol.CheckpointSourceLabel: "true", - "batch.kubernetes.io/job-name": "checkpoint-job", - } - if tc.hash != "" { - labels[snapshotprotocol.CheckpointIDLabel] = tc.hash - } - - job := &batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{ - Name: "checkpoint-job", - Namespace: "default", - }, - } - if tc.annotation != "" { - job.Annotations = map[string]string{ - snapshotprotocol.CheckpointStatusAnnotation: tc.annotation, - } - } - - pod := makePod("test-pod", "default", tc.nodeName, tc.phase, tc.ready, labels, nil) - objs := []runtime.Object{job} - if tc.lease != nil { - objs = append(objs, tc.lease) - } - - w := makeTestController(t, objs...) - ctx := context.Background() - - if tc.preSeed { - w.inFlight["default/test-pod"] = struct{}{} - } - - w.reconcileCheckpointPod(ctx, pod) - - triggered := sawEventReason(w.clientset.(*fake.Clientset), "CheckpointRequested") - - if triggered != tc.want { - t.Errorf("triggered = %v, want %v (inFlight=%d, preSeed=%v, actions=%#v)", triggered, tc.want, len(w.inFlight), tc.preSeed, w.clientset.(*fake.Clientset).Actions()) - } - - // Let the background goroutine (if any) finish before the test ends - if tc.want { - time.Sleep(50 * time.Millisecond) - } - }) - } -} - -func TestReconcileCheckpointPodFailsWhenAnyRegularContainerFails(t *testing.T) { - for _, jobStatus := range []string{"", snapshotprotocol.CheckpointStatusCompleted} { - t.Run("job status "+jobStatus, func(t *testing.T) { - labels := map[string]string{ - snapshotprotocol.CheckpointSourceLabel: "true", - snapshotprotocol.CheckpointIDLabel: "abc123", - "batch.kubernetes.io/job-name": "checkpoint-job", - } - job := &batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{ - Name: "checkpoint-job", - Namespace: "default", - Annotations: map[string]string{}, - }, - } - if jobStatus != "" { - job.Annotations[snapshotprotocol.CheckpointStatusAnnotation] = jobStatus - } - pod := 
makePod("test-pod", "default", testNodeName, corev1.PodRunning, false, labels, nil) - pod.Spec.Containers = append(pod.Spec.Containers, corev1.Container{Name: "helper"}) - pod.Status.ContainerStatuses = []corev1.ContainerStatus{ - { - Name: "main", - Ready: true, - State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}}, - ContainerID: "containerd://main-id", - }, - { - Name: "helper", - State: corev1.ContainerState{ - Terminated: &corev1.ContainerStateTerminated{ExitCode: 1, Reason: "Error"}, - }, - ContainerID: "containerd://helper-id", - }, - } - - w := makeTestController(t, job) - rt := &fakeRuntime{} - w.runtime = rt - w.reconcileCheckpointPod(context.Background(), pod) - - updated, err := w.clientset.BatchV1().Jobs("default").Get(context.Background(), "checkpoint-job", metav1.GetOptions{}) - if err != nil { - t.Fatalf("failed to get checkpoint job: %v", err) - } - if got := updated.Annotations[snapshotprotocol.CheckpointStatusAnnotation]; got != snapshotprotocol.CheckpointStatusFailed { - t.Fatalf("checkpoint status annotation = %q, want %q", got, snapshotprotocol.CheckpointStatusFailed) - } - - var sawFailureEvent bool - for _, action := range w.clientset.(*fake.Clientset).Actions() { - create, ok := action.(clientgotesting.CreateAction) - if !ok || create.GetResource().Resource != "events" { - continue - } - event, ok := create.GetObject().(*corev1.Event) - if ok && event.Reason == "CheckpointFailed" && strings.Contains(event.Message, `container "helper"`) { - sawFailureEvent = true - break - } - } - if !sawFailureEvent { - t.Fatalf("expected CheckpointFailed event for failed regular container; actions=%#v", w.clientset.(*fake.Clientset).Actions()) - } - if len(w.inFlight) != 0 { - t.Fatalf("failed checkpoint pod should not start snapshot worker, got inFlight=%v", w.inFlight) - } - if len(rt.resolvedContainerIDs) != 1 || rt.resolvedContainerIDs[0] != "main-id" { - t.Fatalf("expected to resolve remaining running container before failing job, 
got %v", rt.resolvedContainerIDs) - } - }) - } -} - func TestReconcileRestorePod(t *testing.T) { tests := []struct { name string @@ -951,65 +714,3 @@ func TestPollForContainerIDSkipsWhenRestoreAttemptAlreadyHeld(t *testing.T) { } } -func TestRunCheckpointKeepsLeaseAndInFlightOnTerminalStatusPatchFailure(t *testing.T) { - pod := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pod", - Namespace: "default", - Labels: map[string]string{ - "batch.kubernetes.io/job-name": "checkpoint-job", - }, - }, - } - job := &batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{ - Name: "checkpoint-job", - Namespace: "default", - }, - } - lease := makeLease("default", "checkpoint-job", "test-holder", time.Now()) - - clientset := fake.NewClientset(pod.DeepCopy(), job, lease) - patchCalls := 0 - clientset.PrependReactor("patch", "jobs", func(clientgotesting.Action) (bool, runtime.Object, error) { - patchCalls++ - return true, nil, errors.New("terminal patch failed") - }) - - w := &NodeController{ - config: &types.AgentConfig{ - NodeName: testNodeName, - Storage: types.StorageSpec{ - Type: snapshotprotocol.StorageTypePVC, - BasePath: t.TempDir(), - }, - }, - clientset: clientset, - runtime: &fakeRuntime{}, - log: testr.New(t), - holderID: "test-holder", - inFlight: map[string]struct{}{ - "default/test-pod": {}, - }, - stopCh: make(chan struct{}), - } - - err := w.runCheckpoint(context.Background(), pod, job, "abc123", "main", "default/test-pod", time.Now()) - if err == nil { - t.Fatal("expected terminal checkpoint status update to fail") - } - if _, ok := w.inFlight["default/test-pod"]; !ok { - t.Fatal("checkpoint terminal status failure should keep pod in-flight") - } - if patchCalls != 1 { - t.Fatalf("patchCalls = %d, want %d", patchCalls, 1) - } - - remainingLease, err := clientset.CoordinationV1().Leases("default").Get(context.Background(), "checkpoint-job", metav1.GetOptions{}) - if err != nil { - t.Fatalf("expected checkpoint lease to remain after terminal status patch 
failure: %v", err) - } - if remainingLease.Spec.HolderIdentity == nil || *remainingLease.Spec.HolderIdentity != "test-holder" { - t.Fatalf("unexpected remaining lease holder: %#v", remainingLease.Spec.HolderIdentity) - } -} diff --git a/deploy/snapshot/internal/controller/snapshotcontent.go b/deploy/snapshot/internal/controller/snapshotcontent.go new file mode 100644 index 000000000000..c2b77cdaa879 --- /dev/null +++ b/deploy/snapshot/internal/controller/snapshotcontent.go @@ -0,0 +1,453 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +package controller + +import ( + "context" + "errors" + "fmt" + "os" + "strings" + "syscall" + "time" + + "github.com/go-logr/logr" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/tools/cache" + "sigs.k8s.io/controller-runtime/pkg/client" + + nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" + "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/executor" + snapshotruntime "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/runtime" + snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol" +) + +// CheckpointParams carries everything the node driver needs to dump one container. +type CheckpointParams struct { + // Pod is the live source pod (already provenance-verified by the reconciler). + Pod *corev1.Pod + // ContainerName is the single target container to checkpoint. + ContainerName string + // ContainerID is the agent-resolved running container ID (CRI scheme stripped). + ContainerID string + // ContainerPID is the agent-resolved host PID of the running container. + ContainerPID int + // CheckpointID is the stable artifact identity. + CheckpointID string + // HostPath is the agent-resolved destination directory for the dump. 
+ HostPath string + // ContainerPath is the destination as seen inside the workload container's mount + // namespace (equal to HostPath under agentMount storage). + ContainerPath string + // StartedAt marks when the controller observed the work order, for timing. + StartedAt time.Time +} + +// reconcileSnapshotContent is the pre-bind gate for a SnapshotContent work order. It validates the +// source pod (existence and provenance) and, when the pod is valid, hands off to reconcileSourcePod +// — the single capture path. It never runs the capture flow itself. Driven by the content informer +// (Add/Update) and its 10s resync; the resync is the backstop that eventually writes a terminal +// failure for a work order whose source pod is gone. +func (w *NodeController) reconcileSnapshotContent(ctx context.Context, name string) { + logger := w.log.WithValues("content", name) + + content := &nvidiacomv1alpha1.SnapshotContent{} + if err := w.client.Get(ctx, client.ObjectKey{Name: name}, content); err != nil { + if apierrors.IsNotFound(err) { + return + } + logger.Error(err, "Failed to get SnapshotContent") + return + } + + // Defense in depth: the informer label filter already scopes to this node. + if content.Spec.Source.NodeName != w.config.NodeName { + return + } + // Idempotency: terminal status means the work is done. + if isContentTerminal(content) { + return + } + + pod := &corev1.Pod{} + key := client.ObjectKey{Namespace: content.Spec.SnapshotRef.Namespace, Name: content.Spec.Source.PodRef.Name} + if err := w.client.Get(ctx, key, pod); err != nil { + if apierrors.IsNotFound(err) { + // The operator creates the SnapshotContent only after the source pod exists, and this + // is a linearizable (quorum) Get, so NotFound means the pod was deleted, not a + // creation race: fail the work order terminally. 
+ w.writeFailed(ctx, content, "SourcePodNotFound", fmt.Errorf("source pod %q not found", key.String())) + return + } + logger.Error(err, "Failed to get source pod", "pod", key.String()) + return + } + if reason, msg := classifySourcePod(content, pod); reason != "" { + w.writeFailed(ctx, content, reason, errors.New(msg)) + return + } + + // Pod is valid: hand off to the single capture path. + w.reconcileSourcePod(ctx, pod) +} + +// reconcileSourcePod is the single capture path. It is driven by source-pod events and by +// reconcileSnapshotContent once the pod is validated. It selects the oldest active work order for +// the pod and drives the unstick + dump. Capture parameters come from the source pod, which is the +// single source of truth; it never mutates spec and writes status via Status().Patch only. The +// triggering content event (if any) may name a different work order than the one chosen here — the +// event is only a trigger; chooseActiveContent picks the oldest active SnapshotContent for the pod. 
+func (w *NodeController) reconcileSourcePod(ctx context.Context, pod *corev1.Pod) { + if pod.Spec.NodeName != w.config.NodeName { + return + } + if w.contentIndexer == nil { + return + } + objs, err := w.contentIndexer.ByIndex(podRefIndex, pod.Namespace+"/"+pod.Name) + if err != nil { + w.log.Error(err, "Failed to look up SnapshotContent by source pod", "pod", fmt.Sprintf("%s/%s", pod.Namespace, pod.Name)) + return + } + name := chooseActiveContent(objs) + if name == "" { + return + } + logger := w.log.WithValues("content", name) + + content := &nvidiacomv1alpha1.SnapshotContent{} + if err := w.client.Get(ctx, client.ObjectKey{Name: name}, content); err != nil { + if !apierrors.IsNotFound(err) { + logger.Error(err, "Failed to get SnapshotContent") + } + return + } + if isContentTerminal(content) { + return + } + + key := content.Name + if !w.tryAcquire(key) { + return + } + releaseInFlight := true + defer func() { + if releaseInFlight { + w.release(key) + } + }() + + // Active unstick first: a non-zero container exit must SIGKILL the pod's still-running + // containers and fail the work order even when the pod is already Phase==Failed (which the + // gone-guard below would otherwise short-circuit). This runs while we hold the in-flight key, + // so it can never race a live dump (a dump in flight means tryAcquire above would have returned). + if w.failCheckpointOnContainerExit(ctx, content, pod) { + return + } + // Provenance/liveness guard. The terminal writeFailed for these is owned by + // reconcileSnapshotContent (pre-bind); here we only skip capture and let the content resync + // write the failure. + if reason, _ := classifySourcePod(content, pod); reason != "" { + logger.V(1).Info("Skipping capture; source pod not usable", "reason", reason, "pod", pod.Name) + return + } + + // Capture parameters come from the source pod, which is the single source of truth. The + // checkpoint ID is the pod label; the work order name is treated as opaque (never parsed). 
+ id := strings.TrimSpace(pod.Labels[snapshotprotocol.CheckpointIDLabel]) + if id == "" { + w.writeFailed(ctx, content, "MissingCheckpointID", + fmt.Errorf("source pod %q missing %s label", pod.Name, snapshotprotocol.CheckpointIDLabel)) + return + } + + containerName, err := snapshotprotocol.TargetContainersFromAnnotations(pod.Annotations, 1, 1) + if err != nil { + w.writeFailed(ctx, content, "MissingTargetContainer", err) + return + } + if !isContainerReady(pod, containerName[0]) { + logger.V(1).Info("Source container not ready, awaiting quiesce", "pod", pod.Name, "container", containerName[0]) + return + } + + // Resolve the running container ID and host PID, then compute the destination from the + // pod's storage annotations. + containerID := containerIDForName(pod, containerName[0]) + if containerID == "" { + w.writeFailed(ctx, content, "ContainerNotResolved", + fmt.Errorf("could not resolve container %q ID", containerName[0])) + return + } + containerPID, _, err := w.runtime.ResolveContainer(ctx, containerID) + if err != nil { + w.writeFailed(ctx, content, "ContainerNotResolved", fmt.Errorf("resolve container %q: %w", containerName[0], err)) + return + } + loc, err := w.checkpointLocationsFromPod(pod, id, containerPID) + if err != nil { + w.writeFailed(ctx, content, "InvalidDestination", err) + return + } + if err := w.validatePodMountContainerPID(ctx, containerID, containerPID); err != nil { + w.writeFailed(ctx, content, "ContainerChanged", err) + return + } + + // Resume: a present artifact with unwritten status means a prior dump finished but the + // status write did not. The artifact dir exists only after the executor's atomic rename, + // so its presence means a completed dump. 
+ if artifactPresent(loc.HostPath) { + w.writeReady(ctx, content) + return + } + + leaseKey := client.ObjectKey{Namespace: content.Spec.SnapshotRef.Namespace, Name: content.Name} + acquired, err := w.acquireLease(ctx, leaseKey) + if err != nil { + logger.Error(err, "Failed to acquire checkpoint lease", "lease", leaseKey.String()) + return + } + if !acquired { + return + } + + releaseInFlight = false + go w.runCheckpoint(ctx, content, pod, containerName[0], containerID, containerPID, id, loc, leaseKey, key) +} + +// runCheckpoint executes the dump under a renewed lease and writes the terminal status. +// The container ID, host PID, and resolved locations are pre-resolved by the reconciler so +// the dump does not re-resolve them. +func (w *NodeController) runCheckpoint( + ctx context.Context, + content *nvidiacomv1alpha1.SnapshotContent, + pod *corev1.Pod, + containerName, containerID string, + containerPID int, + checkpointID string, + loc checkpointLocations, + leaseKey client.ObjectKey, + inFlightKey string, +) { + logger := w.log.WithValues("content", content.Name) + defer w.release(inFlightKey) + + leaseCtx, stopLease := context.WithCancel(ctx) + defer stopLease() + go w.renewLease(leaseCtx, leaseKey) + defer func() { + releaseCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if err := w.releaseLease(releaseCtx, leaseKey); err != nil { + logger.Error(err, "Failed to release checkpoint lease", "lease", leaseKey.String()) + } + }() + + params := CheckpointParams{ + Pod: pod, + ContainerName: containerName, + ContainerID: containerID, + ContainerPID: containerPID, + CheckpointID: checkpointID, + HostPath: loc.HostPath, + ContainerPath: loc.ContainerPath, + StartedAt: time.Now(), + } + if err := w.checkpointFn(leaseCtx, params); err != nil { + logger.Error(err, "Checkpoint failed") + w.writeFailed(ctx, content, "CheckpointFailed", err) + return + } + + w.writeReady(ctx, content) +} + +// classifySourcePod reports whether the 
source pod is unusable for capture, returning a terminal +// failure reason and message ("" reason means the pod is valid). It is pure: callers decide whether +// to writeFailed (reconcileSnapshotContent, pre-bind) or merely skip capture (reconcileSourcePod +// guard). Pod existence (NotFound) is handled by the caller, which holds the Get error. +func classifySourcePod(content *nvidiacomv1alpha1.SnapshotContent, pod *corev1.Pod) (string, string) { + if content.Spec.Source.PodRef.UID != "" && pod.UID != content.Spec.Source.PodRef.UID { + return "StalePodReference", + fmt.Sprintf("source pod %q UID %q does not match work order UID %q", pod.Name, pod.UID, content.Spec.Source.PodRef.UID) + } + if pod.DeletionTimestamp != nil || pod.Status.Phase == corev1.PodFailed || pod.Status.Phase == corev1.PodSucceeded { + return "SourcePodGone", + fmt.Sprintf("source pod %q is no longer running (phase %s)", pod.Name, pod.Status.Phase) + } + return "", "" +} + +// failCheckpointOnContainerExit fails the work order and force-terminates the source pod's +// still-running containers when any checkpoint container has terminated non-zero. It returns +// true when a failure was handled and the caller must stop. Init containers +// (pod.Status.InitContainerStatuses) are intentionally out of scope. 
+func (w *NodeController) failCheckpointOnContainerExit(ctx context.Context, content *nvidiacomv1alpha1.SnapshotContent, pod *corev1.Pod) bool { + var failed *corev1.ContainerStatus + for i := range pod.Status.ContainerStatuses { + cs := &pod.Status.ContainerStatuses[i] + if cs.State.Terminated != nil && cs.State.Terminated.ExitCode != 0 { + failed = cs + break + } + } + if failed == nil { + return false + } + + term := failed.State.Terminated + message := fmt.Sprintf("checkpoint container %q terminated with exit code %d", failed.Name, term.ExitCode) + if term.Reason != "" { + message = fmt.Sprintf("%s: %s", message, term.Reason) + } + logger := w.log.WithValues("content", content.Name, "container", failed.Name) + logger.Info("Checkpoint container failed", "exit_code", term.ExitCode, "reason", term.Reason) + emitPodEvent(ctx, w.clientset, logger, pod, "snapshot", corev1.EventTypeWarning, "CheckpointFailed", message) + w.killRunningContainers(ctx, logger, pod, fmt.Sprintf("checkpoint container %s failed", failed.Name)) + w.writeFailed(ctx, content, "CheckpointContainerFailed", errors.New(message)) + return true +} + +// killRunningContainers SIGKILLs every still-running container in the pod, resolving each +// container's host PID through the node runtime. Best-effort: resolution and signal errors are +// logged and skipped so one stuck container does not block terminating the rest. 
+func (w *NodeController) killRunningContainers(ctx context.Context, logger logr.Logger, pod *corev1.Pod, reason string) { + for _, cs := range pod.Status.ContainerStatuses { + if cs.State.Running == nil || cs.ContainerID == "" { + continue + } + containerID := snapshotruntime.StripCRIScheme(cs.ContainerID) + resolveCtx, cancel := context.WithTimeout(ctx, containerResolveAttemptTimeout) + pid, _, err := w.runtime.ResolveContainer(resolveCtx, containerID) + cancel() + if err != nil { + logger.Error(err, "Failed to resolve running checkpoint container", "container", cs.Name) + continue + } + if err := snapshotruntime.SendSignalToPID(logger, pid, syscall.SIGKILL, reason); err != nil { + logger.Error(err, "Failed to signal running checkpoint container", "container", cs.Name) + } + } +} + +// writeReady patches status with the Ready condition. +func (w *NodeController) writeReady(ctx context.Context, content *nvidiacomv1alpha1.SnapshotContent) { + patch := client.MergeFrom(content.DeepCopy()) + meta.SetStatusCondition(&content.Status.Conditions, metav1.Condition{ + Type: nvidiacomv1alpha1.SnapshotConditionReady, + Status: metav1.ConditionTrue, + Reason: "Captured", + Message: "Checkpoint captured and verified", + }) + if err := w.client.Status().Patch(ctx, content, patch); err != nil { + w.log.Error(err, "Failed to write SnapshotContent ready status", "content", content.Name) + } +} + +// writeFailed patches status with the Failed condition. 
+func (w *NodeController) writeFailed(ctx context.Context, content *nvidiacomv1alpha1.SnapshotContent, reason string, cause error) { + patch := client.MergeFrom(content.DeepCopy()) + meta.SetStatusCondition(&content.Status.Conditions, metav1.Condition{ + Type: nvidiacomv1alpha1.SnapshotConditionFailed, + Status: metav1.ConditionTrue, + Reason: reason, + Message: cause.Error(), + }) + if err := w.client.Status().Patch(ctx, content, patch); err != nil { + w.log.Error(err, "Failed to write SnapshotContent failed status", "content", content.Name, "reason", reason) + } +} + +// executorCheckpoint is the production checkpointFn. The reconciler has already resolved the +// container ID and host PID. It runs executor.Checkpoint to the destination, verifies the +// artifact directory, and writes the snapshot-complete sentinel. On dump or verification +// failure it SIGKILLs the CUDA-locked process before returning the error. +func (w *NodeController) executorCheckpoint(ctx context.Context, params CheckpointParams) error { + log := logr.FromContextOrDiscard(ctx) + + req := executor.CheckpointRequest{ + ContainerID: params.ContainerID, + ContainerName: params.ContainerName, + CheckpointID: params.CheckpointID, + CheckpointLocation: params.HostPath, + StartedAt: params.StartedAt, + NodeName: w.config.NodeName, + PodName: params.Pod.Name, + PodNamespace: params.Pod.Namespace, + Clientset: w.clientset, + } + if err := executor.Checkpoint(ctx, w.runtime, log, req, w.config); err != nil { + w.killCheckpointProcess(log, params.ContainerPID, "checkpoint failed") + return fmt.Errorf("checkpoint: %w", err) + } + + info, statErr := os.Stat(params.HostPath) + if statErr != nil || !info.IsDir() { + w.killCheckpointProcess(log, params.ContainerPID, "checkpoint verification failed") + if statErr != nil { + return fmt.Errorf("verify checkpoint path %s: %w", params.HostPath, statErr) + } + return fmt.Errorf("verify checkpoint path %s: not a directory", params.HostPath) + } + + if err := 
snapshotruntime.WriteControlSentinel(params.ContainerPID, snapshotprotocol.SnapshotCompleteFile); err != nil { + w.killCheckpointProcess(log, params.ContainerPID, "checkpoint sentinel failed") + return fmt.Errorf("write snapshot-complete sentinel: %w", err) + } + return nil +} + +// killCheckpointProcess signals the CUDA-locked process so it does not hang after a failed dump. +func (w *NodeController) killCheckpointProcess(log logr.Logger, pid int, reason string) { + if err := snapshotruntime.SendSignalToPID(log, pid, syscall.SIGKILL, reason); err != nil { + log.Error(err, "Failed to signal checkpoint process", "reason", reason) + } +} + +// containerIDForName returns the running container's CRI-stripped ID, or "" if absent. +func containerIDForName(pod *corev1.Pod, containerName string) string { + for _, cs := range pod.Status.ContainerStatuses { + if cs.Name == containerName { + return snapshotruntime.StripCRIScheme(cs.ContainerID) + } + } + return "" +} + +// isContentTerminal reports whether the work order already has a terminal condition. +func isContentTerminal(content *nvidiacomv1alpha1.SnapshotContent) bool { + for _, t := range []string{nvidiacomv1alpha1.SnapshotConditionReady, nvidiacomv1alpha1.SnapshotConditionFailed} { + if cond := meta.FindStatusCondition(content.Status.Conditions, t); cond != nil && cond.Status == metav1.ConditionTrue { + return true + } + } + return false +} + +// artifactPresent reports whether a completed checkpoint directory already exists on disk. +func artifactPresent(destination string) bool { + info, err := os.Stat(destination) + return err == nil && info.IsDir() +} + +// contentNameFromInformerObj extracts the object name from a dynamic informer object, +// handling the DeletedFinalStateUnknown tombstone. 
+func contentNameFromInformerObj(obj interface{}) (string, bool) { + if accessor, err := meta.Accessor(obj); err == nil { + return accessor.GetName(), true + } + tombstone, ok := obj.(cache.DeletedFinalStateUnknown) + if !ok { + return "", false + } + accessor, err := meta.Accessor(tombstone.Obj) + if err != nil { + return "", false + } + return accessor.GetName(), true +} diff --git a/deploy/snapshot/internal/controller/snapshotcontent_test.go b/deploy/snapshot/internal/controller/snapshotcontent_test.go new file mode 100644 index 000000000000..ab45818c26f9 --- /dev/null +++ b/deploy/snapshot/internal/controller/snapshotcontent_test.go @@ -0,0 +1,607 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +package controller + +import ( + "context" + "errors" + "os" + "path/filepath" + "sync" + "testing" + "time" + + "github.com/go-logr/logr" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + k8sfake "k8s.io/client-go/kubernetes/fake" + "k8s.io/client-go/tools/cache" + "sigs.k8s.io/controller-runtime/pkg/client" + crfake "sigs.k8s.io/controller-runtime/pkg/client/fake" + + nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" + snapshottypes "github.com/ai-dynamo/dynamo/deploy/snapshot/internal/types" + snapshotprotocol "github.com/ai-dynamo/dynamo/deploy/snapshot/protocol" +) + +// fakeCheckpointer records calls behind the checkpointFn seam and returns a configured error. +type fakeCheckpointer struct { + mu sync.Mutex + called bool + params CheckpointParams + err error +} + +// fn is the checkpointFn seam the NodeController invokes for the dump. 
+func (fc *fakeCheckpointer) fn(_ context.Context, params CheckpointParams) error { + fc.mu.Lock() + defer fc.mu.Unlock() + fc.called = true + fc.params = params + return fc.err +} + +// wasCalled reports whether the seam was invoked. +func (fc *fakeCheckpointer) wasCalled() bool { + fc.mu.Lock() + defer fc.mu.Unlock() + return fc.called +} + +// lastParams returns the params from the most recent seam invocation. +func (fc *fakeCheckpointer) lastParams() CheckpointParams { + fc.mu.Lock() + defer fc.mu.Unlock() + return fc.params +} + +// contentScheme builds a scheme with the SnapshotContent and core types registered. +func contentScheme(t *testing.T) *runtime.Scheme { + t.Helper() + s := runtime.NewScheme() + require.NoError(t, nvidiacomv1alpha1.AddToScheme(s)) + require.NoError(t, corev1.AddToScheme(s)) + return s +} + +// makeNodeController builds a NodeController wired to a fake typed client, runtime, and seam. Any +// SnapshotContent in objs is also added to the podRef index (mirroring the content informer's +// cache) so the pod-driven reconcileSourcePod can resolve it; tests that need a different index +// state override w.contentIndexer after construction. +func makeNodeController(t *testing.T, fc *fakeCheckpointer, objs ...client.Object) *NodeController { + t.Helper() + s := contentScheme(t) + idx := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{podRefIndex: podRefIndexFunc}) + for _, o := range objs { + if sc, ok := o.(*nvidiacomv1alpha1.SnapshotContent); ok { + require.NoError(t, idx.Add(mustUnstructured(t, sc))) + } + } + w := &NodeController{ + config: &snapshottypes.AgentConfig{NodeName: "node-a", Storage: snapshottypes.StorageSpec{Type: "pvc", BasePath: t.TempDir()}}, + clientset: k8sfake.NewClientset(), + client: crfake.NewClientBuilder().WithScheme(s).WithObjects(objs...). 
+ WithStatusSubresource(&nvidiacomv1alpha1.SnapshotContent{}).Build(), + runtime: &fakeRuntime{}, + log: logr.Discard(), + holderID: "snapshot-agent/test", + inFlight: make(map[string]struct{}), + contentIndexer: idx, + } + w.checkpointFn = fc.fn + return w +} + +// makeWorkOrder builds a SnapshotContent work order pinned to a node and checkpoint id. +// Capture parameters now live on the source pod, so the work order carries only the node +// label and spec. +func makeWorkOrder(name, node, checkpointID string) *nvidiacomv1alpha1.SnapshotContent { + return &nvidiacomv1alpha1.SnapshotContent{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: map[string]string{snapshotprotocol.SnapshotNodeLabel: node}, + }, + Spec: nvidiacomv1alpha1.SnapshotContentSpec{ + SnapshotRef: nvidiacomv1alpha1.SnapshotReference{Namespace: "inference", Name: "snapshot-" + checkpointID}, + Source: nvidiacomv1alpha1.SnapshotContentSource{PodRef: nvidiacomv1alpha1.PodReference{Name: "worker-0", UID: types.UID("pod-uid")}, NodeName: node}, + }, + } +} + +// makeSourcePod builds a ready source pod that carries the capture parameters the agent reads: +// the checkpoint-id label, the target-container annotation, and the storage/version annotations +// checkpointLocationsFromPod needs. 
+func makeSourcePod(checkpointID string) *corev1.Pod { + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "worker-0", + Namespace: "inference", + UID: types.UID("pod-uid"), + Labels: map[string]string{snapshotprotocol.CheckpointIDLabel: checkpointID}, + Annotations: map[string]string{ + snapshotprotocol.TargetContainersAnnotation: "main", + snapshotprotocol.CheckpointArtifactVersionAnnotation: "1", + }, + }, + Spec: corev1.PodSpec{NodeName: "node-a"}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + {Name: "main", Ready: true, ContainerID: "containerd://abc123"}, + }, + }, + } +} + +// getContent reads a SnapshotContent back from the fake client. +func getContent(t *testing.T, w *NodeController, name string) *nvidiacomv1alpha1.SnapshotContent { + t.Helper() + c := &nvidiacomv1alpha1.SnapshotContent{} + require.NoError(t, w.client.Get(context.Background(), types.NamespacedName{Name: name}, c)) + return c +} + +func TestReconcileSnapshotContent_IgnoresOtherNode(t *testing.T) { + content := makeWorkOrder("snapshotcontent-x", "node-b", "x") + fc := &fakeCheckpointer{} + w := makeNodeController(t, fc, content) + + w.reconcileSnapshotContent(context.Background(), content.Name) + assert.False(t, fc.wasCalled()) + got := getContent(t, w, content.Name) + assert.Empty(t, got.Status.Conditions) +} + +func TestReconcileSnapshotContent_InFlightGuard(t *testing.T) { + content := makeWorkOrder("snapshotcontent-x", "node-a", "x") + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "worker-0", Namespace: "inference", UID: types.UID("pod-uid")}, + Spec: corev1.PodSpec{NodeName: "node-a"}, + Status: corev1.PodStatus{Phase: corev1.PodRunning}, + } + w := makeNodeController(t, &fakeCheckpointer{}, content, pod) + // Pre-mark the work order in-flight; the reconcile must short-circuit. 
+ w.inFlight["snapshotcontent-x"] = struct{}{} + + w.reconcileSnapshotContent(context.Background(), content.Name) + got := getContent(t, w, content.Name) + assert.Empty(t, got.Status.Conditions) +} + +func TestReconcileSnapshotContent_MissingCheckpointIDFails(t *testing.T) { + content := makeWorkOrder("snapshotcontent-x", "node-a", "x") + pod := makeSourcePod("x") + delete(pod.Labels, snapshotprotocol.CheckpointIDLabel) + w := makeNodeController(t, &fakeCheckpointer{}, content, pod) + + w.reconcileSnapshotContent(context.Background(), content.Name) + got := getContent(t, w, content.Name) + cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) + require.NotNil(t, cond) + assert.Equal(t, "MissingCheckpointID", cond.Reason) +} + +func TestReconcileSnapshotContent_FailedContainerUnsticksAndFails(t *testing.T) { + content := makeWorkOrder("snapshotcontent-abc", "node-a", "abc") + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "worker-0", + Namespace: "inference", + UID: types.UID("pod-uid"), + Labels: map[string]string{snapshotprotocol.CheckpointIDLabel: "abc"}, + Annotations: map[string]string{snapshotprotocol.TargetContainersAnnotation: "main"}, + }, + Spec: corev1.PodSpec{NodeName: "node-a"}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + {Name: "main", State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}}, ContainerID: "containerd://main-id"}, + {Name: "helper", State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{ExitCode: 1, Reason: "Error"}}, ContainerID: "containerd://helper-id"}, + }, + }, + } + fc := &fakeCheckpointer{} + rt := &fakeRuntime{} // PID 0 → ResolveContainer errors → SendSignalToPID skipped (no real signal sent) + w := makeNodeController(t, fc, content, pod) + w.runtime = rt + + w.reconcileSnapshotContent(context.Background(), content.Name) + + got := getContent(t, w, content.Name) + cond := 
meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) + require.NotNil(t, cond) + assert.Equal(t, "CheckpointContainerFailed", cond.Reason) + assert.Contains(t, cond.Message, "helper") + assert.True(t, sawEventReason(w.clientset.(*k8sfake.Clientset), "CheckpointFailed")) + // Only the still-running sibling is resolved for the SIGKILL; the dead container is skipped. + assert.Equal(t, []string{"main-id"}, rt.resolvedContainerIDs) + assert.False(t, fc.wasCalled()) + assert.Empty(t, w.inFlight) +} + +func TestFailCheckpointOnContainerExit_IgnoresCleanExit(t *testing.T) { + w := makeNodeController(t, &fakeCheckpointer{}) + pod := &corev1.Pod{Status: corev1.PodStatus{ContainerStatuses: []corev1.ContainerStatus{ + {Name: "main", State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}}}, + {Name: "helper", State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{ExitCode: 0}}}, + }}} + + handled := w.failCheckpointOnContainerExit(context.Background(), &nvidiacomv1alpha1.SnapshotContent{}, pod) + assert.False(t, handled) +} + +func TestReconcileSnapshotContent_OpaqueNameUsesPodLabel(t *testing.T) { + // The work order name does not encode the pod's checkpoint id: the name is opaque and the + // pod label is the sole source of truth. Capture must proceed using the pod label ("abc"). + content := makeWorkOrder("snapshotcontent-unrelated-name", "node-a", "abc") + pod := makeSourcePod("abc") + fc := &fakeCheckpointer{} + w := makeNodeController(t, fc, content, pod) + w.runtime = &fakeRuntime{resolveContainerPID: 7} + + w.reconcileSnapshotContent(context.Background(), content.Name) + require.Eventually(t, fc.wasCalled, time.Second, 5*time.Millisecond) + + // The checkpoint id and destination come from the pod label, not the work order name. 
+ params := fc.lastParams() + assert.Equal(t, "abc", params.CheckpointID) + assert.Equal(t, filepath.Join(w.config.Storage.BasePath, "abc", "versions", "1"), params.HostPath) + + got := getContent(t, w, content.Name) + require.NotNil(t, meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionReady)) +} + +func TestReconcileSnapshotContent_ResumeWritesReady(t *testing.T) { + content := makeWorkOrder("snapshotcontent-abc", "node-a", "abc") + pod := makeSourcePod("abc") + fc := &fakeCheckpointer{} + w := makeNodeController(t, fc, content, pod) + w.runtime = &fakeRuntime{resolveContainerPID: 4242} + // Pre-create the artifact directory at the resolved destination so the resume check fires. + dest := filepath.Join(w.config.Storage.BasePath, "abc", "versions", "1") + require.NoError(t, os.MkdirAll(dest, 0o755)) + + w.reconcileSnapshotContent(context.Background(), content.Name) + assert.False(t, fc.wasCalled()) + got := getContent(t, w, content.Name) + cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionReady) + require.NotNil(t, cond) +} + +func TestReconcileSnapshotContent_PodMountResolvesContainerPID(t *testing.T) { + content := makeWorkOrder("snapshotcontent-abc", "node-a", "abc") + pod := makeSourcePod("abc") + fc := &fakeCheckpointer{} + w := makeNodeController(t, fc, content, pod) + w.config.Storage.AccessMode = snapshottypes.StorageAccessModePodMount + rt := &fakeRuntime{resolveContainerPID: 4242} + w.runtime = rt + + w.reconcileSnapshotContent(context.Background(), content.Name) + + // podMount mode resolves the container PID and feeds it through checkpointLocationsFromPod + // (a zero PID would fail there with a different reason). The subsequent live-PID validation + // fails in a unit test because /host/proc/ does not exist, which proves the non-zero + // PID flowed through to validatePodMountContainerPID. 
+ assert.Contains(t, rt.resolvedContainerIDs, "abc123") + assert.False(t, fc.wasCalled()) + got := getContent(t, w, content.Name) + cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) + require.NotNil(t, cond) + assert.Equal(t, "ContainerChanged", cond.Reason) +} + +func TestReconcileSnapshotContent_PodNotFoundFails(t *testing.T) { + content := makeWorkOrder("snapshotcontent-x", "node-a", "x") + w := makeNodeController(t, &fakeCheckpointer{}, content) // no pod + + w.reconcileSnapshotContent(context.Background(), content.Name) + got := getContent(t, w, content.Name) + cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) + require.NotNil(t, cond) + assert.Equal(t, "SourcePodNotFound", cond.Reason) +} + +func TestClassifySourcePod(t *testing.T) { + content := makeWorkOrder("snapshotcontent-x", "node-a", "x") // PodRef Name worker-0, UID pod-uid + running := func(uid string, phase corev1.PodPhase, deleting bool) *corev1.Pod { + p := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "worker-0", Namespace: "inference", UID: types.UID(uid)}, + Status: corev1.PodStatus{Phase: phase}, + } + if deleting { + now := metav1.Now() + p.DeletionTimestamp = &now + } + return p + } + + reason, _ := classifySourcePod(content, running("pod-uid", corev1.PodRunning, false)) + assert.Equal(t, "", reason) + + reason, _ = classifySourcePod(content, running("other-uid", corev1.PodRunning, false)) + assert.Equal(t, "StalePodReference", reason) + + for _, phase := range []corev1.PodPhase{corev1.PodFailed, corev1.PodSucceeded} { + reason, _ = classifySourcePod(content, running("pod-uid", phase, false)) + assert.Equal(t, "SourcePodGone", reason) + } + + reason, _ = classifySourcePod(content, running("pod-uid", corev1.PodRunning, true)) + assert.Equal(t, "SourcePodGone", reason) +} + +func TestReconcileSnapshotContent_StalePodUIDFails(t *testing.T) { + content := makeWorkOrder("snapshotcontent-x", 
"node-a", "x") + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "worker-0", Namespace: "inference", UID: types.UID("different-uid")}, + Spec: corev1.PodSpec{NodeName: "node-a"}, + Status: corev1.PodStatus{Phase: corev1.PodRunning}, + } + w := makeNodeController(t, &fakeCheckpointer{}, content, pod) + + w.reconcileSnapshotContent(context.Background(), content.Name) + got := getContent(t, w, content.Name) + cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) + require.NotNil(t, cond) + assert.Equal(t, "StalePodReference", cond.Reason) +} + +func TestReconcileSnapshotContent_PodFailedFails(t *testing.T) { + content := makeWorkOrder("snapshotcontent-x", "node-a", "x") + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "worker-0", Namespace: "inference", UID: types.UID("pod-uid")}, + Spec: corev1.PodSpec{NodeName: "node-a"}, + Status: corev1.PodStatus{Phase: corev1.PodFailed}, + } + w := makeNodeController(t, &fakeCheckpointer{}, content, pod) + + w.reconcileSnapshotContent(context.Background(), content.Name) + got := getContent(t, w, content.Name) + cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) + require.NotNil(t, cond) + assert.Equal(t, "SourcePodGone", cond.Reason) +} + +func TestReconcileSnapshotContent_NotReadyQuiesceNoOp(t *testing.T) { + content := makeWorkOrder("snapshotcontent-x", "node-a", "x") + pod := makeSourcePod("x") + pod.Status.ContainerStatuses[0].Ready = false + fc := &fakeCheckpointer{} + w := makeNodeController(t, fc, content, pod) + + w.reconcileSnapshotContent(context.Background(), content.Name) + assert.False(t, fc.wasCalled()) + got := getContent(t, w, content.Name) + assert.Empty(t, got.Status.Conditions) +} + +func TestReconcileSnapshotContent_CapturesFromPod(t *testing.T) { + content := makeWorkOrder("snapshotcontent-abc", "node-a", "abc") + pod := makeSourcePod("abc") + fc := &fakeCheckpointer{} + w := makeNodeController(t, 
fc, content, pod) + w.runtime = &fakeRuntime{resolveContainerPID: 7} + + w.reconcileSnapshotContent(context.Background(), content.Name) + require.Eventually(t, fc.wasCalled, time.Second, 5*time.Millisecond) + + // Capture parameters are read from the source pod, not from SnapshotContent metadata. + params := fc.lastParams() + assert.Equal(t, "abc", params.CheckpointID) + assert.Equal(t, "main", params.ContainerName) + assert.Equal(t, "abc123", params.ContainerID) + assert.Equal(t, 7, params.ContainerPID) + // agentMount: HostPath == ContainerPath == resolved destination. + dest := filepath.Join(w.config.Storage.BasePath, "abc", "versions", "1") + assert.Equal(t, dest, params.HostPath) + assert.Equal(t, dest, params.ContainerPath) + + got := getContent(t, w, content.Name) + cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionReady) + require.NotNil(t, cond) +} + +func TestRunCheckpoint_WritesReadyOnSuccess(t *testing.T) { + content := makeWorkOrder("snapshotcontent-abc", "node-a", "abc") + fc := &fakeCheckpointer{} + w := makeNodeController(t, fc, content) + pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "worker-0", Namespace: "inference", UID: types.UID("pod-uid")}} + leaseKey := client.ObjectKey{Namespace: "inference", Name: content.Name} + loc := checkpointLocations{ + HostPath: filepath.Join(w.config.Storage.BasePath, "abc", "versions", "1"), + ContainerPath: filepath.Join(w.config.Storage.BasePath, "abc", "versions", "1"), + } + + w.runCheckpoint(context.Background(), content, pod, "main", "abc123", 7, "abc", loc, leaseKey, "snapshotcontent-abc") + + assert.True(t, fc.wasCalled()) + got := getContent(t, w, content.Name) + cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionReady) + require.NotNil(t, cond) +} + +func TestRunCheckpoint_WritesFailedOnError(t *testing.T) { + content := makeWorkOrder("snapshotcontent-abc", "node-a", "abc") + fc := &fakeCheckpointer{err: 
errors.New("criu boom")} + w := makeNodeController(t, fc, content) + pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "worker-0", Namespace: "inference", UID: types.UID("pod-uid")}} + leaseKey := client.ObjectKey{Namespace: "inference", Name: content.Name} + loc := checkpointLocations{ + HostPath: filepath.Join(w.config.Storage.BasePath, "abc", "versions", "1"), + ContainerPath: filepath.Join(w.config.Storage.BasePath, "abc", "versions", "1"), + } + + w.runCheckpoint(context.Background(), content, pod, "main", "abc123", 7, "abc", loc, leaseKey, "snapshotcontent-abc") + + got := getContent(t, w, content.Name) + cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) + require.NotNil(t, cond) + assert.Equal(t, "CheckpointFailed", cond.Reason) +} + +// mustUnstructured converts a typed object to the *unstructured.Unstructured the dynamic informer +// (and thus the podRef index) stores. +func mustUnstructured(t *testing.T, obj runtime.Object) *unstructured.Unstructured { + t.Helper() + m, err := runtime.DefaultUnstructuredConverter.ToUnstructured(obj) + require.NoError(t, err) + return &unstructured.Unstructured{Object: m} +} + +// contentForWorker0 builds a SnapshotContent referencing pod inference/worker-0 with a given +// creation time, optionally carrying a terminal condition (SnapshotConditionReady/Failed). 
+func contentForWorker0(name string, created metav1.Time, terminal string) *nvidiacomv1alpha1.SnapshotContent { + c := &nvidiacomv1alpha1.SnapshotContent{ + ObjectMeta: metav1.ObjectMeta{Name: name, CreationTimestamp: created}, + Spec: nvidiacomv1alpha1.SnapshotContentSpec{ + SnapshotRef: nvidiacomv1alpha1.SnapshotReference{Namespace: "inference", Name: "snapshot-" + name}, + Source: nvidiacomv1alpha1.SnapshotContentSource{PodRef: nvidiacomv1alpha1.PodReference{Name: "worker-0", UID: types.UID("pod-uid")}, NodeName: "node-a"}, + }, + } + if terminal != "" { + meta.SetStatusCondition(&c.Status.Conditions, metav1.Condition{Type: terminal, Status: metav1.ConditionTrue, Reason: "Done"}) + } + return c +} + +func TestPodRefIndexFunc(t *testing.T) { + keys, err := podRefIndexFunc(mustUnstructured(t, contentForWorker0("snapshotcontent-abc", metav1.Unix(1000, 0), ""))) + require.NoError(t, err) + assert.Equal(t, []string{"inference/worker-0"}, keys) +} + +func TestPodRefIndexFunc_MissingFieldsOrWrongType(t *testing.T) { + keys, err := podRefIndexFunc(&unstructured.Unstructured{Object: map[string]interface{}{"spec": map[string]interface{}{}}}) + require.NoError(t, err) + assert.Nil(t, keys) + + keys, err = podRefIndexFunc("not-unstructured") + require.NoError(t, err) + assert.Nil(t, keys) +} + +func TestContentFromInformerObj(t *testing.T) { + u := mustUnstructured(t, contentForWorker0("snapshotcontent-abc", metav1.Unix(1000, 0), "")) + + c, ok := contentFromInformerObj(u) + require.True(t, ok) + assert.Equal(t, "snapshotcontent-abc", c.Name) + + c, ok = contentFromInformerObj(cache.DeletedFinalStateUnknown{Key: "k", Obj: u}) + require.True(t, ok) + assert.Equal(t, "snapshotcontent-abc", c.Name) + + _, ok = contentFromInformerObj(cache.DeletedFinalStateUnknown{Key: "k", Obj: "bad"}) + assert.False(t, ok) + _, ok = contentFromInformerObj("bad") + assert.False(t, ok) +} + +func TestChooseActiveContent_OldestNonTerminalWins(t *testing.T) { + // "snapshotcontent-a" sorts first 
by name but is newer; oldest-by-CreationTimestamp must win. + newer := mustUnstructured(t, contentForWorker0("snapshotcontent-a", metav1.Unix(2000, 0), "")) + older := mustUnstructured(t, contentForWorker0("snapshotcontent-b", metav1.Unix(1000, 0), "")) + assert.Equal(t, "snapshotcontent-b", chooseActiveContent([]interface{}{newer, older})) +} + +func TestChooseActiveContent_SkipsTerminalAndTieBreaksByName(t *testing.T) { + terminal := mustUnstructured(t, contentForWorker0("snapshotcontent-old", metav1.Unix(1000, 0), nvidiacomv1alpha1.SnapshotConditionReady)) + tieA := mustUnstructured(t, contentForWorker0("snapshotcontent-a", metav1.Unix(2000, 0), "")) + tieB := mustUnstructured(t, contentForWorker0("snapshotcontent-b", metav1.Unix(2000, 0), "")) + assert.Equal(t, "snapshotcontent-a", chooseActiveContent([]interface{}{terminal, tieB, tieA})) +} + +func TestChooseActiveContent_AllTerminalReturnsEmpty(t *testing.T) { + ready := mustUnstructured(t, contentForWorker0("snapshotcontent-a", metav1.Unix(1000, 0), nvidiacomv1alpha1.SnapshotConditionReady)) + failed := mustUnstructured(t, contentForWorker0("snapshotcontent-b", metav1.Unix(2000, 0), nvidiacomv1alpha1.SnapshotConditionFailed)) + assert.Equal(t, "", chooseActiveContent([]interface{}{ready, failed})) +} + +// podWithFailedSibling builds the inference/worker-0 source pod with the target Running and a +// sibling Terminated non-zero, so a reconcile triggers the unstick. 
+func podWithFailedSibling() *corev1.Pod { + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "worker-0", + Namespace: "inference", + UID: types.UID("pod-uid"), + Labels: map[string]string{snapshotprotocol.CheckpointIDLabel: "abc"}, + Annotations: map[string]string{snapshotprotocol.TargetContainersAnnotation: "main"}, + }, + Spec: corev1.PodSpec{NodeName: "node-a"}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + {Name: "main", State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}}, ContainerID: "containerd://main-id"}, + {Name: "helper", State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{ExitCode: 1, Reason: "Error"}}, ContainerID: "containerd://helper-id"}, + }, + }, + } +} + +func seedIndex(t *testing.T, contents ...*nvidiacomv1alpha1.SnapshotContent) cache.Indexer { + t.Helper() + idx := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{podRefIndex: podRefIndexFunc}) + for _, c := range contents { + require.NoError(t, idx.Add(mustUnstructured(t, c))) + } + return idx +} + +func TestReconcileSourcePod_TriggersUnstick(t *testing.T) { + content := makeWorkOrder("snapshotcontent-abc", "node-a", "abc") + content.CreationTimestamp = metav1.Unix(1000, 0) + pod := podWithFailedSibling() + fc := &fakeCheckpointer{} + rt := &fakeRuntime{} + w := makeNodeController(t, fc, content, pod) + w.runtime = rt + + w.reconcileSourcePod(context.Background(), pod) + + got := getContent(t, w, content.Name) + cond := meta.FindStatusCondition(got.Status.Conditions, nvidiacomv1alpha1.SnapshotConditionFailed) + require.NotNil(t, cond) + assert.Equal(t, "CheckpointContainerFailed", cond.Reason) + assert.Equal(t, []string{"main-id"}, rt.resolvedContainerIDs) + assert.False(t, fc.wasCalled()) +} + +func TestReconcileSourcePod_PodNotIndexedNoOp(t *testing.T) { + content := makeWorkOrder("snapshotcontent-abc", "node-a", "abc") + pod := podWithFailedSibling() + w := 
makeNodeController(t, &fakeCheckpointer{}, content, pod) + w.contentIndexer = seedIndex(t) // override: empty index + + w.reconcileSourcePod(context.Background(), pod) + assert.Empty(t, getContent(t, w, content.Name).Status.Conditions) +} + +func TestReconcileSourcePod_OtherNodeNoOp(t *testing.T) { + content := makeWorkOrder("snapshotcontent-abc", "node-a", "abc") + pod := podWithFailedSibling() + pod.Spec.NodeName = "node-b" + w := makeNodeController(t, &fakeCheckpointer{}, content, pod) + + w.reconcileSourcePod(context.Background(), pod) + assert.Empty(t, getContent(t, w, content.Name).Status.Conditions) +} + +func TestReconcileSourcePod_IndexErrorNoOp(t *testing.T) { + content := makeWorkOrder("snapshotcontent-abc", "node-a", "abc") + pod := podWithFailedSibling() + w := makeNodeController(t, &fakeCheckpointer{}, content, pod) + // Indexer without podRefIndex registered → ByIndex returns an error; reconcile must log and no-op. + w.contentIndexer = cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{}) + + w.reconcileSourcePod(context.Background(), pod) + assert.Empty(t, getContent(t, w, content.Name).Status.Conditions) +} diff --git a/deploy/snapshot/internal/controller/util.go b/deploy/snapshot/internal/controller/util.go index 58dca5ca4631..778ce9f161e4 100644 --- a/deploy/snapshot/internal/controller/util.go +++ b/deploy/snapshot/internal/controller/util.go @@ -7,7 +7,6 @@ import ( "time" "github.com/go-logr/logr" - batchv1 "k8s.io/api/batch/v1" coordinationv1 "k8s.io/api/coordination/v1" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -15,6 +14,8 @@ import ( ktypes "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/cache" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" ) const ( @@ -82,164 +83,111 @@ func annotatePod(ctx context.Context, clientset kubernetes.Interface, log logr.L return err } -func getCheckpointJob(ctx context.Context, clientset 
kubernetes.Interface, pod *corev1.Pod) (*batchv1.Job, error) { - jobName := pod.Labels["batch.kubernetes.io/job-name"] - if jobName == "" { - return nil, fmt.Errorf("pod %s/%s has no batch.kubernetes.io/job-name label", pod.Namespace, pod.Name) - } - - job, err := clientset.BatchV1().Jobs(pod.Namespace).Get(ctx, jobName, metav1.GetOptions{}) - if err != nil { - return nil, fmt.Errorf("failed to get checkpoint job %s/%s: %w", pod.Namespace, jobName, err) - } - return job, nil -} - -func acquireCheckpointLease(ctx context.Context, clientset kubernetes.Interface, log logr.Logger, job *batchv1.Job, holderIdentity string) (bool, error) { - leaseName := job.Name +// acquireLease acquires or renews a checkpoint lease at an arbitrary namespace/name key, +// returning false when another live holder owns it. +func (w *NodeController) acquireLease(ctx context.Context, key client.ObjectKey) (bool, error) { now := metav1.NewMicroTime(time.Now()) leaseDurationSeconds := int32(checkpointLeaseDuration.Seconds()) - leaseClient := clientset.CoordinationV1().Leases(job.Namespace) - existingLease, err := leaseClient.Get(ctx, leaseName, metav1.GetOptions{}) + leaseClient := w.clientset.CoordinationV1().Leases(key.Namespace) + existing, err := leaseClient.Get(ctx, key.Name, metav1.GetOptions{}) if err != nil { if !apierrors.IsNotFound(err) { - return false, fmt.Errorf("failed to get checkpoint lease %s/%s: %w", job.Namespace, leaseName, err) + return false, fmt.Errorf("get checkpoint lease %s: %w", key.String(), err) } - lease := &coordinationv1.Lease{ - ObjectMeta: metav1.ObjectMeta{ - Name: leaseName, - Namespace: job.Namespace, - }, + ObjectMeta: metav1.ObjectMeta{Name: key.Name, Namespace: key.Namespace}, Spec: coordinationv1.LeaseSpec{ - HolderIdentity: &holderIdentity, + HolderIdentity: &w.holderID, LeaseDurationSeconds: &leaseDurationSeconds, AcquireTime: &now, RenewTime: &now, }, } - if _, err := leaseClient.Create(ctx, lease, metav1.CreateOptions{}); err != nil { if 
apierrors.IsAlreadyExists(err) { return false, nil } - return false, fmt.Errorf("failed to create checkpoint lease %s/%s: %w", job.Namespace, leaseName, err) + return false, fmt.Errorf("create checkpoint lease %s: %w", key.String(), err) } return true, nil } - if !checkpointLeaseExpired(existingLease, now.Time) && - existingLease.Spec.HolderIdentity != nil && - *existingLease.Spec.HolderIdentity != holderIdentity { + if !checkpointLeaseExpired(existing, now.Time) && + existing.Spec.HolderIdentity != nil && + *existing.Spec.HolderIdentity != w.holderID { return false, nil } - - existingLease.Spec.HolderIdentity = &holderIdentity - existingLease.Spec.LeaseDurationSeconds = &leaseDurationSeconds - if existingLease.Spec.AcquireTime == nil || checkpointLeaseExpired(existingLease, now.Time) { - existingLease.Spec.AcquireTime = &now + existing.Spec.HolderIdentity = &w.holderID + existing.Spec.LeaseDurationSeconds = &leaseDurationSeconds + if existing.Spec.AcquireTime == nil || checkpointLeaseExpired(existing, now.Time) { + existing.Spec.AcquireTime = &now } - existingLease.Spec.RenewTime = &now - - if _, err := leaseClient.Update(ctx, existingLease, metav1.UpdateOptions{}); err != nil { + existing.Spec.RenewTime = &now + if _, err := leaseClient.Update(ctx, existing, metav1.UpdateOptions{}); err != nil { if apierrors.IsConflict(err) { - log.V(1).Info("Checkpoint lease update conflicted", "lease", fmt.Sprintf("%s/%s", job.Namespace, leaseName)) return false, nil } - return false, fmt.Errorf("failed to update checkpoint lease %s/%s: %w", job.Namespace, leaseName, err) + return false, fmt.Errorf("update checkpoint lease %s: %w", key.String(), err) } - return true, nil } -func renewCheckpointLease(ctx context.Context, clientset kubernetes.Interface, job *batchv1.Job, holderIdentity string) error { - leaseName := job.Name - leaseClient := clientset.CoordinationV1().Leases(job.Namespace) - lease, err := leaseClient.Get(ctx, leaseName, metav1.GetOptions{}) +// renewLease 
periodically renews the lease until ctx is cancelled. +func (w *NodeController) renewLease(ctx context.Context, key client.ObjectKey) { + ticker := time.NewTicker(checkpointLeaseRenewInterval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + if err := w.renewLeaseOnce(ctx, key); err != nil { + log.FromContext(ctx).Error(err, "Failed to renew checkpoint lease", "lease", key.String()) + return + } + } + } +} + +// renewLeaseOnce bumps the lease renew time, failing if this holder no longer owns it. +func (w *NodeController) renewLeaseOnce(ctx context.Context, key client.ObjectKey) error { + leaseClient := w.clientset.CoordinationV1().Leases(key.Namespace) + lease, err := leaseClient.Get(ctx, key.Name, metav1.GetOptions{}) if err != nil { - return fmt.Errorf("failed to get checkpoint lease %s/%s for renewal: %w", job.Namespace, leaseName, err) + return fmt.Errorf("get checkpoint lease %s for renewal: %w", key.String(), err) } - if lease.Spec.HolderIdentity == nil || *lease.Spec.HolderIdentity != holderIdentity { - return fmt.Errorf("checkpoint lease %s/%s is no longer held by %q", job.Namespace, leaseName, holderIdentity) + if lease.Spec.HolderIdentity == nil || *lease.Spec.HolderIdentity != w.holderID { + return fmt.Errorf("checkpoint lease %s is no longer held by %q", key.String(), w.holderID) } - now := metav1.NewMicroTime(time.Now()) leaseDurationSeconds := int32(checkpointLeaseDuration.Seconds()) lease.Spec.LeaseDurationSeconds = &leaseDurationSeconds lease.Spec.RenewTime = &now - if _, err := leaseClient.Update(ctx, lease, metav1.UpdateOptions{}); err != nil { - return fmt.Errorf("failed to renew checkpoint lease %s/%s: %w", job.Namespace, leaseName, err) + return fmt.Errorf("renew checkpoint lease %s: %w", key.String(), err) } return nil } -func releaseCheckpointLease(ctx context.Context, clientset kubernetes.Interface, log logr.Logger, job *batchv1.Job, holderIdentity string) error { - leaseName := job.Name - 
leaseClient := clientset.CoordinationV1().Leases(job.Namespace) - lease, err := leaseClient.Get(ctx, leaseName, metav1.GetOptions{}) +// releaseLease deletes the lease if this holder owns it. +func (w *NodeController) releaseLease(ctx context.Context, key client.ObjectKey) error { + leaseClient := w.clientset.CoordinationV1().Leases(key.Namespace) + lease, err := leaseClient.Get(ctx, key.Name, metav1.GetOptions{}) if err != nil { if apierrors.IsNotFound(err) { return nil } - return fmt.Errorf("failed to get checkpoint lease %s/%s for release: %w", job.Namespace, leaseName, err) + return fmt.Errorf("get checkpoint lease %s for release: %w", key.String(), err) } - - if lease.Spec.HolderIdentity == nil || *lease.Spec.HolderIdentity != holderIdentity { - log.V(1).Info("Skipping checkpoint lease release because another holder owns it", - "lease", fmt.Sprintf("%s/%s", job.Namespace, leaseName), - "holder", holderIdentity, - ) + if lease.Spec.HolderIdentity == nil || *lease.Spec.HolderIdentity != w.holderID { return nil } - - if err := leaseClient.Delete(ctx, leaseName, metav1.DeleteOptions{}); err != nil && !apierrors.IsNotFound(err) { - return fmt.Errorf("failed to delete checkpoint lease %s/%s: %w", job.Namespace, leaseName, err) + if err := leaseClient.Delete(ctx, key.Name, metav1.DeleteOptions{}); err != nil && !apierrors.IsNotFound(err) { + return fmt.Errorf("delete checkpoint lease %s: %w", key.String(), err) } return nil } - -func (w *NodeController) renewCheckpointLease(ctx context.Context, log logr.Logger, job *batchv1.Job, stopLease context.CancelCauseFunc) { - ticker := time.NewTicker(checkpointLeaseRenewInterval) - defer ticker.Stop() - - for { - select { - case <-ctx.Done(): - return - case <-ticker.C: - if err := renewCheckpointLease(ctx, w.clientset, job, w.holderID); err != nil { - log.Error(err, "Failed to renew checkpoint lease") - stopLease(fmt.Errorf("checkpoint lease renewal failed: %w", err)) - return - } - } - } -} - -func annotateJob(ctx 
context.Context, clientset kubernetes.Interface, log logr.Logger, job *batchv1.Job, annotations map[string]string) error { - patchBytes, err := json.Marshal(map[string]any{ - "metadata": map[string]any{ - "annotations": annotations, - }, - }) - if err != nil { - return fmt.Errorf("failed to build job annotation patch payload: %w", err) - } - - _, err = clientset.BatchV1().Jobs(job.Namespace).Patch( - ctx, job.Name, ktypes.MergePatchType, patchBytes, metav1.PatchOptions{}, - ) - if err != nil { - log.Error(err, "Failed to annotate checkpoint job", - "job", fmt.Sprintf("%s/%s", job.Namespace, job.Name), - "annotations", annotations, - ) - } - return err -} - func emitPodEvent(ctx context.Context, clientset kubernetes.Interface, log logr.Logger, pod *corev1.Pod, component, eventType, reason, message string) { event := &corev1.Event{ ObjectMeta: metav1.ObjectMeta{ diff --git a/deploy/snapshot/protocol/checkpoint.go b/deploy/snapshot/protocol/checkpoint.go index 14221be301bd..80f37da74286 100644 --- a/deploy/snapshot/protocol/checkpoint.go +++ b/deploy/snapshot/protocol/checkpoint.go @@ -24,21 +24,6 @@ type CheckpointJobOptions struct { WrapLaunchJob bool } -type CheckpointObservationPhase string - -const ( - CheckpointObservationPhaseRunning CheckpointObservationPhase = "running" - CheckpointObservationPhaseWaitingForConfirmation CheckpointObservationPhase = "waiting_for_confirmation" - CheckpointObservationPhaseReady CheckpointObservationPhase = "ready" - CheckpointObservationPhaseFailed CheckpointObservationPhase = "failed" -) - -type CheckpointObservation struct { - Phase CheckpointObservationPhase - Reason string - Message string -} - func GetCheckpointJobName(checkpointID string, artifactVersion string) string { return "checkpoint-job-" + checkpointID + "-" + ArtifactVersion(artifactVersion) } @@ -126,65 +111,6 @@ func NewCheckpointJob(podTemplate *corev1.PodTemplateSpec, opts CheckpointJobOpt }, nil } -func ObserveCheckpointJob(job *batchv1.Job, 
checkpointWorkerActive bool) CheckpointObservation { - jobComplete := false - jobFailed := false - for _, condition := range job.Status.Conditions { - if condition.Status != corev1.ConditionTrue { - continue - } - if condition.Type == batchv1.JobComplete { - jobComplete = true - continue - } - if condition.Type == batchv1.JobFailed { - jobFailed = true - } - } - - status := job.Annotations[CheckpointStatusAnnotation] - if status == CheckpointStatusFailed { - observation := CheckpointObservation{ - Phase: CheckpointObservationPhaseFailed, - Reason: "JobFailed", - Message: "Checkpoint job failed", - } - if jobComplete { - observation.Reason = "CheckpointVerificationFailed" - observation.Message = "Checkpoint job completed but snapshot-agent reported checkpoint failure" - } - return observation - } - - if jobComplete { - if status == CheckpointStatusCompleted { - return CheckpointObservation{ - Phase: CheckpointObservationPhaseReady, - Reason: "JobSucceeded", - Message: "Checkpoint job completed successfully", - } - } - if checkpointWorkerActive { - return CheckpointObservation{Phase: CheckpointObservationPhaseWaitingForConfirmation} - } - return CheckpointObservation{ - Phase: CheckpointObservationPhaseFailed, - Reason: "CheckpointVerificationFailed", - Message: "Checkpoint job completed without snapshot-agent completion confirmation", - } - } - - if jobFailed { - return CheckpointObservation{ - Phase: CheckpointObservationPhaseFailed, - Reason: "JobFailed", - Message: "Checkpoint job failed", - } - } - - return CheckpointObservation{Phase: CheckpointObservationPhaseRunning} -} - // EnsureLocalhostSeccompProfile sets the pod-level localhost seccomp profile // to the given path, allocating PodSecurityContext if needed. 
An empty profile // is a no-op so callers can disable injection entirely without conditional diff --git a/deploy/snapshot/protocol/checkpoint_observation_test.go b/deploy/snapshot/protocol/checkpoint_observation_test.go deleted file mode 100644 index 2b0f40f3c0fc..000000000000 --- a/deploy/snapshot/protocol/checkpoint_observation_test.go +++ /dev/null @@ -1,99 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -package protocol - -import ( - "testing" - - batchv1 "k8s.io/api/batch/v1" - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -func TestObserveCheckpointJob(t *testing.T) { - makeJob := func(annotation string, conditions ...batchv1.JobCondition) *batchv1.Job { - job := &batchv1.Job{ - ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{}, - }, - Status: batchv1.JobStatus{ - Conditions: conditions, - }, - } - if annotation != "" { - job.Annotations[CheckpointStatusAnnotation] = annotation - } - return job - } - - tests := []struct { - name string - job *batchv1.Job - checkpointWorkerActive bool - wantPhase CheckpointObservationPhase - wantReason string - wantMessage string - }{ - { - name: "running job stays running", - job: makeJob(""), - wantPhase: CheckpointObservationPhaseRunning, - }, - { - name: "completed job with completion annotation is ready", - job: makeJob( - CheckpointStatusCompleted, - batchv1.JobCondition{Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, - ), - wantPhase: CheckpointObservationPhaseReady, - wantReason: "JobSucceeded", - wantMessage: "Checkpoint job completed successfully", - }, - { - name: "completed job waits for terminal confirmation while worker is active", - job: makeJob( - "", - batchv1.JobCondition{Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, - ), - checkpointWorkerActive: true, - wantPhase: CheckpointObservationPhaseWaitingForConfirmation, - }, - { - 
name: "completed job fails without confirmation once worker is inactive", - job: makeJob( - "", - batchv1.JobCondition{Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, - ), - wantPhase: CheckpointObservationPhaseFailed, - wantReason: "CheckpointVerificationFailed", - wantMessage: "Checkpoint job completed without snapshot-agent completion confirmation", - }, - { - name: "failed checkpoint annotation wins over completed job", - job: makeJob( - CheckpointStatusFailed, - batchv1.JobCondition{Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, - ), - checkpointWorkerActive: true, - wantPhase: CheckpointObservationPhaseFailed, - wantReason: "CheckpointVerificationFailed", - wantMessage: "Checkpoint job completed but snapshot-agent reported checkpoint failure", - }, - } - - for _, tc := range tests { - t.Run(tc.name, func(t *testing.T) { - observation := ObserveCheckpointJob(tc.job, tc.checkpointWorkerActive) - if observation.Phase != tc.wantPhase { - t.Fatalf("phase = %q, want %q", observation.Phase, tc.wantPhase) - } - if observation.Reason != tc.wantReason { - t.Fatalf("reason = %q, want %q", observation.Reason, tc.wantReason) - } - if observation.Message != tc.wantMessage { - t.Fatalf("message = %q, want %q", observation.Message, tc.wantMessage) - } - }) - } -} diff --git a/deploy/snapshot/protocol/common.go b/deploy/snapshot/protocol/common.go index 43416eb9c601..9248887ecdca 100644 --- a/deploy/snapshot/protocol/common.go +++ b/deploy/snapshot/protocol/common.go @@ -19,6 +19,10 @@ const ( CheckpointArtifactVersionAnnotation = "nvidia.com/snapshot-artifact-version" + // SnapshotNodeLabel mirrors SnapshotContent.spec.source.nodeName onto the + // object so the per-node agent's cache can label-select work for its node. + SnapshotNodeLabel = "nvidia.com/snapshot-node" + // Required comma-separated checkpoint/restore target container list. TargetContainersAnnotation = "nvidia.com/snapshot-target-containers"