Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@ spec:
spec:
description: |-
SnapshotContentSpec defines the desired state of SnapshotContent. It is
populated by the node agent at creation time and is immutable thereafter.
populated by the SnapshotReconciler (operator) at creation time and is
immutable thereafter.
properties:
snapshotRef:
description: |-
Expand All @@ -86,18 +87,35 @@ spec:
- namespace
type: object
source:
description: Source locates the physical artifact via a self-contained, opaque handle.
description: 'Source describes what to capture: the source pod and the node it runs on.'
properties:
snapshotHandle:
nodeName:
description: |-
SnapshotHandle is a self-contained, opaque artifact locator. The v1alpha1
PVC format is:
pvc://<namespace>/<claimName>/<basePath>/<checkpointID>/versions/<version>
It fully locates the artifact without correlating any other field.
NodeName is the node the source pod runs on, denormalized from the live
pod so it travels with PodRef as one immutable unit and selects the node
agent that performs the dump.
minLength: 1
type: string
podRef:
description: |-
PodRef identifies the pod to dump. Its UID guards against dumping a
same-named recreation of the pod.
properties:
name:
description: Name of the source pod.
minLength: 1
type: string
uid:
description: |-
UID of the source pod, recorded so the node agent dumps that specific
pod and not a same-named recreation.
type: string
required:
- name
type: object
required:
- snapshotHandle
- nodeName
- podRef
type: object
required:
- snapshotRef
Expand Down Expand Up @@ -164,11 +182,6 @@ spec:
- type
type: object
type: array
snapshotHandle:
description: |-
SnapshotHandle mirrors spec.source.snapshotHandle once the node agent has
verified the artifact.
type: string
type: object
type: object
x-kubernetes-validations:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,6 @@ spec:
scope: Namespaced
versions:
- additionalPrinterColumns:
- description: Artifact identity
jsonPath: .spec.checkpointID
name: CheckpointID
type: string
- description: Bound SnapshotContent
jsonPath: .status.boundSnapshotContentName
name: Content
Expand Down Expand Up @@ -64,21 +60,12 @@ spec:
description: |-
SnapshotSpec defines the desired state of Snapshot.

Minimal "trigger" shape: it names what to capture (an existing pod) and the
artifact identity (CheckpointID). Capture parameters the node agent needs at
dump time (target container, storage base path) are read from the referenced
pod's existing annotations and mounts, not duplicated here. The spec is
immutable after creation.
Minimal "trigger" shape: it names what to capture (an existing pod). All
capture parameters the node agent needs at dump time (checkpoint ID, target
container, storage base path) are read from the referenced pod's existing
labels/annotations and mounts, not duplicated here. The spec is immutable
after creation.
properties:
checkpointID:
description: |-
CheckpointID is the stable artifact identity and the on-PVC artifact
subdirectory name (<basePath>/<checkpointID>/versions/<version>/). It is
the primary key of the storage contract shared with the restore path and
is immutable after creation.
maxLength: 253
minLength: 1
type: string
source:
description: |-
Source identifies the captured workload. It is a struct (rather than an
Expand All @@ -94,14 +81,18 @@ spec:
description: Name of the source pod.
minLength: 1
type: string
uid:
description: |-
UID of the source pod, recorded so the node agent dumps that specific
pod and not a same-named recreation.
type: string
required:
- name
type: object
required:
- podRef
type: object
required:
- checkpointID
- source
type: object
status:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,7 @@ rules:
- dynamographdeploymentscalingadapters
- dynamomodels
- dynamoworkermetadatas
- snapshotcontents
- snapshots
verbs:
- create
Expand All @@ -347,6 +348,7 @@ rules:
- dynamographdeploymentrequests/finalizers
- dynamographdeployments/finalizers
- dynamomodels/finalizers
- snapshots/finalizers
verbs:
- update
- apiGroups:
Expand All @@ -358,6 +360,8 @@ rules:
- dynamographdeployments/status
- dynamographdeploymentscalingadapters/status
- dynamomodels/status
- snapshotcontents/status
- snapshots/status
verbs:
- get
- patch
Expand Down
27 changes: 26 additions & 1 deletion deploy/helm/charts/snapshot/templates/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,32 @@ rules:
- apiGroups: ["resource.k8s.io"]
resources: ["resourceclaims"]
verbs: ["get", "list"]
{{- else }}
{{- end }}
{{- end }}

{{- if .Values.rbac.create }}
---
# SnapshotContent is cluster-scoped, so the agent always needs a ClusterRole for it.
# The agent reads work orders and writes only their status; it never creates, deletes,
# or touches Snapshots (the work order is self-contained).
# NOTE(review): the first rule below also grants update/patch on the main
# snapshotcontents resource, not only on the status subresource. If the agent
# really writes status only, those two verbs can be dropped from the first rule;
# if it also patches metadata (for example finalizers or annotations), the
# sentence above should say so. Confirm against the agent code.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: {{ include "snapshot.fullname" . }}-agent-snapshotcontents
labels:
{{- include "snapshot.labels" . | nindent 4 }}
rules:
# Read access plus update/patch on the SnapshotContent objects themselves.
- apiGroups: ["nvidia.com"]
resources: ["snapshotcontents"]
verbs: ["get", "list", "watch", "update", "patch"]
# Write access to the status subresource, which is authorized separately from
# the main resource.
- apiGroups: ["nvidia.com"]
resources: ["snapshotcontents/status"]
verbs: ["update", "patch"]
{{- end }}

{{- if .Values.rbac.create }}
{{- if not .Values.rbac.namespaceRestricted }}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
Expand Down
19 changes: 19 additions & 0 deletions deploy/helm/charts/snapshot/templates/rolebinding.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,25 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.rbac.create }}
---
# Bind agent to the cluster-scoped SnapshotContent ClusterRole (capture work orders).
# Rendered whenever rbac.create is set. The role name below must stay identical
# to the ClusterRole named "<fullname>-agent-snapshotcontents".
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: {{ include "snapshot.fullname" . }}-agent-snapshotcontents
labels:
{{- include "snapshot.labels" . | nindent 4 }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: {{ include "snapshot.fullname" . }}-agent-snapshotcontents
subjects:
# The chart's service account in the release namespace receives the grant.
- kind: ServiceAccount
name: {{ include "snapshot.serviceAccountName" . }}
namespace: {{ .Release.Namespace }}
{{- end }}

{{- if .Values.rbac.create }}
{{- if .Values.rbac.namespaceRestricted }}
apiVersion: rbac.authorization.k8s.io/v1
Expand Down
33 changes: 17 additions & 16 deletions deploy/operator/api/v1alpha1/snapshot_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@
package v1alpha1

import (
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
)

// Snapshot and SnapshotContent status condition types. Both objects share this
Expand All @@ -31,23 +33,18 @@ const (
SnapshotConditionFailed = "Failed"
)

// IsSnapshotSucceeded tells the caller whether s carries a status condition of
// type SnapshotConditionReady whose status is True.
func IsSnapshotSucceeded(s *Snapshot) bool {
	conds := s.Status.Conditions
	return meta.IsStatusConditionTrue(conds, SnapshotConditionReady)
}

// IsSnapshotFailed tells the caller whether s carries a status condition of
// type SnapshotConditionFailed whose status is True.
func IsSnapshotFailed(s *Snapshot) bool {
	conds := s.Status.Conditions
	return meta.IsStatusConditionTrue(conds, SnapshotConditionFailed)
}

// SnapshotSpec defines the desired state of Snapshot.
//
// Minimal "trigger" shape: it names what to capture (an existing pod) and the
// artifact identity (CheckpointID). Capture parameters the node agent needs at
// dump time (target container, storage base path) are read from the referenced
// pod's existing annotations and mounts, not duplicated here. The spec is
// immutable after creation.
type SnapshotSpec struct {
// CheckpointID is the stable artifact identity and the on-PVC artifact
// subdirectory name (<basePath>/<checkpointID>/versions/<version>/). It is
// the primary key of the storage contract shared with the restore path and
// is immutable after creation.
// +kubebuilder:validation:Required
// +kubebuilder:validation:MinLength=1
// +kubebuilder:validation:MaxLength=253
CheckpointID string `json:"checkpointID"`

// Source identifies the captured workload. It is a struct (rather than an
// inlined reference) so future source variants can be added additively.
// +kubebuilder:validation:Required
Expand All @@ -69,6 +66,11 @@ type PodReference struct {
// +kubebuilder:validation:Required
// +kubebuilder:validation:MinLength=1
Name string `json:"name"`

// UID of the source pod, recorded so the node agent dumps that specific
// pod and not a same-named recreation.
// +optional
UID types.UID `json:"uid,omitempty"`
}

// SnapshotStatus defines the observed state of Snapshot.
Expand All @@ -88,7 +90,6 @@ type SnapshotStatus struct {
// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
// +kubebuilder:resource:scope=Namespaced,shortName=snap
// +kubebuilder:printcolumn:name="CheckpointID",type="string",JSONPath=".spec.checkpointID",description="Artifact identity"
// +kubebuilder:printcolumn:name="Content",type="string",JSONPath=".status.boundSnapshotContentName",description="Bound SnapshotContent"
// +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status",description="Ready condition"
// +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp"
Expand Down
20 changes: 11 additions & 9 deletions deploy/operator/api/v1alpha1/snapshot_types_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,7 @@ func TestSnapshotDeepCopyIsIndependent(t *testing.T) {
original := &Snapshot{
ObjectMeta: metav1.ObjectMeta{Name: "snap-a", Namespace: "inference"},
Spec: SnapshotSpec{
CheckpointID: "abc123",
Source: SnapshotSource{PodRef: PodReference{Name: "worker-0"}},
Source: SnapshotSource{PodRef: PodReference{Name: "worker-0"}},
},
Status: SnapshotStatus{
Conditions: []metav1.Condition{{Type: "Ready", Status: metav1.ConditionTrue, Reason: "Captured"}},
Expand All @@ -59,10 +58,10 @@ func TestSnapshotDeepCopyIsIndependent(t *testing.T) {
t.Fatalf("DeepCopy is not equal to original")
}

clone.Spec.CheckpointID = "mutated"
clone.Spec.Source.PodRef.Name = "mutated"
clone.Status.Conditions[0].Reason = "Changed"
if original.Spec.CheckpointID != "abc123" {
t.Errorf("mutating clone spec changed original: got %q", original.Spec.CheckpointID)
if original.Spec.Source.PodRef.Name != "worker-0" {
t.Errorf("mutating clone spec changed original: got %q", original.Spec.Source.PodRef.Name)
}
if original.Status.Conditions[0].Reason != "Captured" {
t.Errorf("mutating clone condition changed original: got %q", original.Status.Conditions[0].Reason)
Expand All @@ -76,7 +75,10 @@ func TestSnapshotContentDeepCopyIsIndependent(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{Name: "content-a"},
Spec: SnapshotContentSpec{
SnapshotRef: SnapshotReference{Namespace: "inference", Name: "snap-a", UID: types.UID("uid-1")},
Source: SnapshotContentSource{SnapshotHandle: "pvc://inference/ckpt-pvc/checkpoints/abc123/versions/1"},
Source: SnapshotContentSource{
PodRef: PodReference{Name: "worker-0", UID: types.UID("pod-uid-1")},
NodeName: "node-a",
},
},
Status: SnapshotContentStatus{
Conditions: []metav1.Condition{{Type: "Ready", Status: metav1.ConditionTrue, Reason: "Bound"}},
Expand All @@ -88,10 +90,10 @@ func TestSnapshotContentDeepCopyIsIndependent(t *testing.T) {
t.Fatalf("DeepCopy is not equal to original")
}

clone.Spec.Source.SnapshotHandle = "mutated"
clone.Spec.Source.PodRef.Name = "mutated"
clone.Status.Conditions[0].Reason = "Changed"
if original.Spec.Source.SnapshotHandle != "pvc://inference/ckpt-pvc/checkpoints/abc123/versions/1" {
t.Errorf("mutating clone changed original handle: got %q", original.Spec.Source.SnapshotHandle)
if original.Spec.Source.PodRef.Name != "worker-0" {
t.Errorf("mutating clone changed original podRef name: got %q", original.Spec.Source.PodRef.Name)
}
if original.Status.Conditions[0].Reason != "Bound" {
t.Errorf("mutating clone condition changed original: got %q", original.Status.Conditions[0].Reason)
Expand Down
38 changes: 25 additions & 13 deletions deploy/operator/api/v1alpha1/snapshotcontent_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,31 @@
package v1alpha1

import (
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
)

// IsSnapshotContentSucceeded tells the caller whether c carries a status
// condition of type SnapshotConditionReady whose status is True.
func IsSnapshotContentSucceeded(c *SnapshotContent) bool {
	conds := c.Status.Conditions
	return meta.IsStatusConditionTrue(conds, SnapshotConditionReady)
}

// IsSnapshotContentFailed tells the caller whether c carries a status
// condition of type SnapshotConditionFailed whose status is True.
func IsSnapshotContentFailed(c *SnapshotContent) bool {
	conds := c.Status.Conditions
	return meta.IsStatusConditionTrue(conds, SnapshotConditionFailed)
}

// SnapshotContentSpec defines the desired state of SnapshotContent. It is
// populated by the node agent at creation time and is immutable thereafter.
// populated by the SnapshotReconciler (operator) at creation time and is
// immutable thereafter.
type SnapshotContentSpec struct {
// SnapshotRef is the back-pointer to the bound Snapshot. It may span
// namespaces because SnapshotContent is cluster-scoped.
// +kubebuilder:validation:Required
SnapshotRef SnapshotReference `json:"snapshotRef"`

// Source locates the physical artifact via a self-contained, opaque handle.
// Source describes what to capture: the source pod and the node it runs on.
// +kubebuilder:validation:Required
Source SnapshotContentSource `json:"source"`
}
Expand All @@ -51,24 +63,24 @@ type SnapshotReference struct {
UID types.UID `json:"uid,omitempty"`
}

// SnapshotContentSource locates the physical checkpoint artifact.
// SnapshotContentSource is the immutable source descriptor: what to dump
// (PodRef) and where it runs (NodeName).
type SnapshotContentSource struct {
// SnapshotHandle is a self-contained, opaque artifact locator. The v1alpha1
// PVC format is:
// pvc://<namespace>/<claimName>/<basePath>/<checkpointID>/versions/<version>
// It fully locates the artifact without correlating any other field.
// PodRef identifies the pod to dump. Its UID guards against dumping a
// same-named recreation of the pod.
// +kubebuilder:validation:Required
PodRef PodReference `json:"podRef"`

// NodeName is the node the source pod runs on, denormalized from the live
// pod so it travels with PodRef as one immutable unit and selects the node
// agent that performs the dump.
// +kubebuilder:validation:Required
// +kubebuilder:validation:MinLength=1
SnapshotHandle string `json:"snapshotHandle"`
NodeName string `json:"nodeName"`
}

// SnapshotContentStatus defines the observed state of SnapshotContent.
type SnapshotContentStatus struct {
// SnapshotHandle mirrors spec.source.snapshotHandle once the node agent has
// verified the artifact.
// +optional
SnapshotHandle *string `json:"snapshotHandle,omitempty"`

// Conditions reflect the latest observations of the SnapshotContent's state.
// Standard types are Ready and Failed.
// +optional
Expand Down
Loading
Loading