diff --git a/deploy/helm/charts/platform/components/operator/crds/nvidia.com_snapshotcontents.yaml b/deploy/helm/charts/platform/components/operator/crds/nvidia.com_snapshotcontents.yaml new file mode 100644 index 000000000000..99dfbc11dee1 --- /dev/null +++ b/deploy/helm/charts/platform/components/operator/crds/nvidia.com_snapshotcontents.yaml @@ -0,0 +1,180 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.4 + helm.sh/resource-policy: keep + name: snapshotcontents.nvidia.com +spec: + group: nvidia.com + names: + kind: SnapshotContent + listKind: SnapshotContentList + plural: snapshotcontents + shortNames: + - snapcontent + singular: snapshotcontent + scope: Cluster + versions: + - additionalPrinterColumns: + - description: Bound Snapshot + jsonPath: .spec.snapshotRef.name + name: Snapshot + type: string + - description: Snapshot namespace + jsonPath: .spec.snapshotRef.namespace + name: Namespace + type: string + - description: Ready condition + jsonPath: .status.conditions[?(@.type=='Ready')].status + name: Ready + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + SnapshotContent is the Schema for the snapshotcontents API. It is the + cluster-scoped artifact-of-record for a captured container checkpoint. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: |- + SnapshotContentSpec defines the desired state of SnapshotContent. It is + populated by the node agent at creation time and is immutable thereafter. + properties: + snapshotRef: + description: |- + SnapshotRef is the back-pointer to the bound Snapshot. It may span + namespaces because SnapshotContent is cluster-scoped. + properties: + name: + description: Name of the referenced Snapshot. + type: string + namespace: + description: Namespace of the referenced Snapshot. + type: string + uid: + description: |- + UID of the referenced Snapshot, recorded at binding time to detect a + stale reference after a delete and recreate. + type: string + required: + - name + - namespace + type: object + source: + description: Source locates the physical artifact via a self-contained, opaque handle. + properties: + snapshotHandle: + description: |- + SnapshotHandle is a self-contained, opaque artifact locator. The v1alpha1 + PVC format is: + pvc://<namespace>/<pvcName>/<basePath>/<checkpointID>/versions/<version> + It fully locates the artifact without correlating any other field. + minLength: 1 + type: string + required: + - snapshotHandle + type: object + required: + - snapshotRef + - source + type: object + status: + description: SnapshotContentStatus defines the observed state of SnapshotContent. + properties: + conditions: + description: |- + Conditions reflect the latest observations of the SnapshotContent's state. + Standard types are Ready and Failed. 
+ items: + description: Condition contains details for one aspect of the current state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. 
+ maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + snapshotHandle: + description: |- + SnapshotHandle mirrors spec.source.snapshotHandle once the node agent has + verified the artifact. + type: string + type: object + type: object + x-kubernetes-validations: + - message: spec is immutable + rule: '!has(oldSelf.spec) || self.spec == oldSelf.spec' + served: true + storage: true + subresources: + status: {} diff --git a/deploy/helm/charts/platform/components/operator/crds/nvidia.com_snapshots.yaml b/deploy/helm/charts/platform/components/operator/crds/nvidia.com_snapshots.yaml new file mode 100644 index 000000000000..a5101b641309 --- /dev/null +++ b/deploy/helm/charts/platform/components/operator/crds/nvidia.com_snapshots.yaml @@ -0,0 +1,182 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.4 + helm.sh/resource-policy: keep + name: snapshots.nvidia.com +spec: + group: nvidia.com + names: + kind: Snapshot + listKind: SnapshotList + plural: snapshots + shortNames: + - snap + singular: snapshot + scope: Namespaced + versions: + - additionalPrinterColumns: + - description: Artifact identity + jsonPath: .spec.checkpointID + name: CheckpointID + type: string + - description: Bound SnapshotContent + jsonPath: .status.boundSnapshotContentName + name: Content + type: string + - description: Ready condition + jsonPath: .status.conditions[?(@.type=='Ready')].status + name: Ready + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + Snapshot is the Schema for the snapshots API. It is the namespaced binding + for a captured container checkpoint and is consumed by restore paths. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: |- + SnapshotSpec defines the desired state of Snapshot. + + Minimal "trigger" shape: it names what to capture (an existing pod) and the + artifact identity (CheckpointID). 
Capture parameters the node agent needs at + dump time (target container, storage base path) are read from the referenced + pod's existing annotations and mounts, not duplicated here. The spec is + immutable after creation. + properties: + checkpointID: + description: |- + CheckpointID is the stable artifact identity and the on-PVC artifact + subdirectory name (<basePath>/<checkpointID>/versions/<version>/). It is + the primary key of the storage contract shared with the restore path and + is immutable after creation. + maxLength: 253 + minLength: 1 + type: string + source: + description: |- + Source identifies the captured workload. It is a struct (rather than an + inlined reference) so future source variants can be added additively. + properties: + podRef: + description: |- + PodRef references the pod, in the Snapshot's namespace, that is captured. + The operator prepares the pod (control volume, target-container annotation, + checkpoint storage mount) before creating the Snapshot. + properties: + name: + description: Name of the source pod. + minLength: 1 + type: string + required: + - name + type: object + required: + - podRef + type: object + required: + - checkpointID + - source + type: object + status: + description: SnapshotStatus defines the observed state of Snapshot. + properties: + boundSnapshotContentName: + description: |- + BoundSnapshotContentName is the name of the cluster-scoped SnapshotContent + this Snapshot is bound to. It is nil until the agent has created the + content and recorded the binding. + type: string + conditions: + description: |- + Conditions reflect the latest observations of the Snapshot's state. + Standard types are Ready and Failed. + items: + description: Condition contains details for one aspect of the current state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. 
If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. 
+ maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + type: object + type: object + x-kubernetes-validations: + - message: spec is immutable + rule: '!has(oldSelf.spec) || self.spec == oldSelf.spec' + served: true + storage: true + subresources: + status: {} diff --git a/deploy/operator/api/v1alpha1/snapshot_types.go b/deploy/operator/api/v1alpha1/snapshot_types.go new file mode 100644 index 000000000000..8efe51ca4526 --- /dev/null +++ b/deploy/operator/api/v1alpha1/snapshot_types.go @@ -0,0 +1,118 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// Snapshot and SnapshotContent status condition types. Both objects share this +// vocabulary; the operator and node agent set them via meta.SetStatusCondition. +const ( + // SnapshotConditionReady is True when capture and binding completed and the + // artifact is usable for restore. + SnapshotConditionReady = "Ready" + // SnapshotConditionFailed is True when capture or binding failed terminally. 
+ SnapshotConditionFailed = "Failed" +) + +// SnapshotSpec defines the desired state of Snapshot. +// +// Minimal "trigger" shape: it names what to capture (an existing pod) and the +// artifact identity (CheckpointID). Capture parameters the node agent needs at +// dump time (target container, storage base path) are read from the referenced +// pod's existing annotations and mounts, not duplicated here. The spec is +// immutable after creation. +type SnapshotSpec struct { + // CheckpointID is the stable artifact identity and the on-PVC artifact + // subdirectory name (<basePath>/<checkpointID>/versions/<version>/). It is + // the primary key of the storage contract shared with the restore path and + // is immutable after creation. + // +kubebuilder:validation:Required + // +kubebuilder:validation:MinLength=1 + // +kubebuilder:validation:MaxLength=253 + CheckpointID string `json:"checkpointID"` + + // Source identifies the captured workload. It is a struct (rather than an + // inlined reference) so future source variants can be added additively. + // +kubebuilder:validation:Required + Source SnapshotSource `json:"source"` +} + +// SnapshotSource identifies the workload captured by a Snapshot. +type SnapshotSource struct { + // PodRef references the pod, in the Snapshot's namespace, that is captured. + // The operator prepares the pod (control volume, target-container annotation, + // checkpoint storage mount) before creating the Snapshot. + // +kubebuilder:validation:Required + PodRef PodReference `json:"podRef"` +} + +// PodReference names a pod in the same namespace as the referencing Snapshot. +type PodReference struct { + // Name of the source pod. + // +kubebuilder:validation:Required + // +kubebuilder:validation:MinLength=1 + Name string `json:"name"` +} + +// SnapshotStatus defines the observed state of Snapshot. +type SnapshotStatus struct { + // BoundSnapshotContentName is the name of the cluster-scoped SnapshotContent + // this Snapshot is bound to. 
It is nil until the agent has created the + // content and recorded the binding. + // +optional + BoundSnapshotContentName *string `json:"boundSnapshotContentName,omitempty"` + + // Conditions reflect the latest observations of the Snapshot's state. + // Standard types are Ready and Failed. + // +optional + Conditions []metav1.Condition `json:"conditions,omitempty"` +} + +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:resource:scope=Namespaced,shortName=snap +// +kubebuilder:printcolumn:name="CheckpointID",type="string",JSONPath=".spec.checkpointID",description="Artifact identity" +// +kubebuilder:printcolumn:name="Content",type="string",JSONPath=".status.boundSnapshotContentName",description="Bound SnapshotContent" +// +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status",description="Ready condition" +// +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp" +// +kubebuilder:validation:XValidation:rule="!has(oldSelf.spec) || self.spec == oldSelf.spec",message="spec is immutable" + +// Snapshot is the Schema for the snapshots API. It is the namespaced binding +// for a captured container checkpoint and is consumed by restore paths. +type Snapshot struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec SnapshotSpec `json:"spec,omitempty"` + Status SnapshotStatus `json:"status,omitempty"` +} + +// +kubebuilder:object:root=true + +// SnapshotList contains a list of Snapshot. 
+type SnapshotList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []Snapshot `json:"items"` +} + +func init() { + SchemeBuilder.Register(&Snapshot{}, &SnapshotList{}) +} diff --git a/deploy/operator/api/v1alpha1/snapshot_types_test.go b/deploy/operator/api/v1alpha1/snapshot_types_test.go new file mode 100644 index 000000000000..d562910e5f2c --- /dev/null +++ b/deploy/operator/api/v1alpha1/snapshot_types_test.go @@ -0,0 +1,99 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package v1alpha1 + +import ( + "reflect" + "testing" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" +) + +// TestSchemeRegistersSnapshotKinds verifies the init() registrations expose all +// four new kinds through the package AddToScheme. 
+func TestSchemeRegistersSnapshotKinds(t *testing.T) { + scheme := runtime.NewScheme() + if err := AddToScheme(scheme); err != nil { + t.Fatalf("AddToScheme failed: %v", err) + } + for _, kind := range []string{"Snapshot", "SnapshotList", "SnapshotContent", "SnapshotContentList"} { + if !scheme.Recognizes(GroupVersion.WithKind(kind)) { + t.Errorf("scheme does not recognize kind %q in %s", kind, GroupVersion.String()) + } + } +} + +// TestSnapshotDeepCopyIsIndependent verifies the generated deepcopy produces an +// equal but independent Snapshot (mutating the clone must not touch the source). +func TestSnapshotDeepCopyIsIndependent(t *testing.T) { + original := &Snapshot{ + ObjectMeta: metav1.ObjectMeta{Name: "snap-a", Namespace: "inference"}, + Spec: SnapshotSpec{ + CheckpointID: "abc123", + Source: SnapshotSource{PodRef: PodReference{Name: "worker-0"}}, + }, + Status: SnapshotStatus{ + Conditions: []metav1.Condition{{Type: "Ready", Status: metav1.ConditionTrue, Reason: "Captured"}}, + }, + } + + clone := original.DeepCopy() + if !reflect.DeepEqual(original, clone) { + t.Fatalf("DeepCopy is not equal to original") + } + + clone.Spec.CheckpointID = "mutated" + clone.Status.Conditions[0].Reason = "Changed" + if original.Spec.CheckpointID != "abc123" { + t.Errorf("mutating clone spec changed original: got %q", original.Spec.CheckpointID) + } + if original.Status.Conditions[0].Reason != "Captured" { + t.Errorf("mutating clone condition changed original: got %q", original.Status.Conditions[0].Reason) + } +} + +// TestSnapshotContentDeepCopyIsIndependent verifies the generated deepcopy for +// the cluster-scoped SnapshotContent is equal but independent. 
+func TestSnapshotContentDeepCopyIsIndependent(t *testing.T) { + original := &SnapshotContent{ + ObjectMeta: metav1.ObjectMeta{Name: "content-a"}, + Spec: SnapshotContentSpec{ + SnapshotRef: SnapshotReference{Namespace: "inference", Name: "snap-a", UID: types.UID("uid-1")}, + Source: SnapshotContentSource{SnapshotHandle: "pvc://inference/ckpt-pvc/checkpoints/abc123/versions/1"}, + }, + Status: SnapshotContentStatus{ + Conditions: []metav1.Condition{{Type: "Ready", Status: metav1.ConditionTrue, Reason: "Bound"}}, + }, + } + + clone := original.DeepCopy() + if !reflect.DeepEqual(original, clone) { + t.Fatalf("DeepCopy is not equal to original") + } + + clone.Spec.Source.SnapshotHandle = "mutated" + clone.Status.Conditions[0].Reason = "Changed" + if original.Spec.Source.SnapshotHandle != "pvc://inference/ckpt-pvc/checkpoints/abc123/versions/1" { + t.Errorf("mutating clone changed original handle: got %q", original.Spec.Source.SnapshotHandle) + } + if original.Status.Conditions[0].Reason != "Bound" { + t.Errorf("mutating clone condition changed original: got %q", original.Status.Conditions[0].Reason) + } +} diff --git a/deploy/operator/api/v1alpha1/snapshotcontent_types.go b/deploy/operator/api/v1alpha1/snapshotcontent_types.go new file mode 100644 index 000000000000..7a970f5b959d --- /dev/null +++ b/deploy/operator/api/v1alpha1/snapshotcontent_types.go @@ -0,0 +1,108 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" +) + +// SnapshotContentSpec defines the desired state of SnapshotContent. It is +// populated by the node agent at creation time and is immutable thereafter. +type SnapshotContentSpec struct { + // SnapshotRef is the back-pointer to the bound Snapshot. It may span + // namespaces because SnapshotContent is cluster-scoped. + // +kubebuilder:validation:Required + SnapshotRef SnapshotReference `json:"snapshotRef"` + + // Source locates the physical artifact via a self-contained, opaque handle. + // +kubebuilder:validation:Required + Source SnapshotContentSource `json:"source"` +} + +// SnapshotReference is a cross-namespace reference to a Snapshot. +type SnapshotReference struct { + // Namespace of the referenced Snapshot. + // +kubebuilder:validation:Required + Namespace string `json:"namespace"` + + // Name of the referenced Snapshot. + // +kubebuilder:validation:Required + Name string `json:"name"` + + // UID of the referenced Snapshot, recorded at binding time to detect a + // stale reference after a delete and recreate. + // +optional + UID types.UID `json:"uid,omitempty"` +} + +// SnapshotContentSource locates the physical checkpoint artifact. +type SnapshotContentSource struct { + // SnapshotHandle is a self-contained, opaque artifact locator. The v1alpha1 + // PVC format is: + // pvc://<namespace>/<pvcName>/<basePath>/<checkpointID>/versions/<version> + // It fully locates the artifact without correlating any other field. + // +kubebuilder:validation:Required + // +kubebuilder:validation:MinLength=1 + SnapshotHandle string `json:"snapshotHandle"` +} + +// SnapshotContentStatus defines the observed state of SnapshotContent. +type SnapshotContentStatus struct { + // SnapshotHandle mirrors spec.source.snapshotHandle once the node agent has + // verified the artifact. 
+ // +optional + SnapshotHandle *string `json:"snapshotHandle,omitempty"` + + // Conditions reflect the latest observations of the SnapshotContent's state. + // Standard types are Ready and Failed. + // +optional + Conditions []metav1.Condition `json:"conditions,omitempty"` +} + +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:resource:scope=Cluster,shortName=snapcontent +// +kubebuilder:printcolumn:name="Snapshot",type="string",JSONPath=".spec.snapshotRef.name",description="Bound Snapshot" +// +kubebuilder:printcolumn:name="Namespace",type="string",JSONPath=".spec.snapshotRef.namespace",description="Snapshot namespace" +// +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status",description="Ready condition" +// +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp" +// +kubebuilder:validation:XValidation:rule="!has(oldSelf.spec) || self.spec == oldSelf.spec",message="spec is immutable" + +// SnapshotContent is the Schema for the snapshotcontents API. It is the +// cluster-scoped artifact-of-record for a captured container checkpoint. +type SnapshotContent struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec SnapshotContentSpec `json:"spec,omitempty"` + Status SnapshotContentStatus `json:"status,omitempty"` +} + +// +kubebuilder:object:root=true + +// SnapshotContentList contains a list of SnapshotContent. 
+type SnapshotContentList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []SnapshotContent `json:"items"` +} + +func init() { + SchemeBuilder.Register(&SnapshotContent{}, &SnapshotContentList{}) +} diff --git a/deploy/operator/api/v1alpha1/zz_generated.deepcopy.go b/deploy/operator/api/v1alpha1/zz_generated.deepcopy.go index c163acf27da3..119d1b89ce90 100644 --- a/deploy/operator/api/v1alpha1/zz_generated.deepcopy.go +++ b/deploy/operator/api/v1alpha1/zz_generated.deepcopy.go @@ -1524,6 +1524,21 @@ func (in *PVC) DeepCopy() *PVC { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PodReference) DeepCopyInto(out *PodReference) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodReference. +func (in *PodReference) DeepCopy() *PodReference { + if in == nil { + return nil + } + out := new(PodReference) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ProfilingConfigSpec) DeepCopyInto(out *ProfilingConfigSpec) { *out = *in @@ -1839,6 +1854,257 @@ func (in *SharedMemorySpec) DeepCopy() *SharedMemorySpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *Snapshot) DeepCopyInto(out *Snapshot) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + out.Spec = in.Spec + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Snapshot. 
+func (in *Snapshot) DeepCopy() *Snapshot { + if in == nil { + return nil + } + out := new(Snapshot) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *Snapshot) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *SnapshotContent) DeepCopyInto(out *SnapshotContent) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + out.Spec = in.Spec + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SnapshotContent. +func (in *SnapshotContent) DeepCopy() *SnapshotContent { + if in == nil { + return nil + } + out := new(SnapshotContent) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *SnapshotContent) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *SnapshotContentList) DeepCopyInto(out *SnapshotContentList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]SnapshotContent, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SnapshotContentList. 
+func (in *SnapshotContentList) DeepCopy() *SnapshotContentList { + if in == nil { + return nil + } + out := new(SnapshotContentList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *SnapshotContentList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *SnapshotContentSource) DeepCopyInto(out *SnapshotContentSource) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SnapshotContentSource. +func (in *SnapshotContentSource) DeepCopy() *SnapshotContentSource { + if in == nil { + return nil + } + out := new(SnapshotContentSource) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *SnapshotContentSpec) DeepCopyInto(out *SnapshotContentSpec) { + *out = *in + out.SnapshotRef = in.SnapshotRef + out.Source = in.Source +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SnapshotContentSpec. +func (in *SnapshotContentSpec) DeepCopy() *SnapshotContentSpec { + if in == nil { + return nil + } + out := new(SnapshotContentSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *SnapshotContentStatus) DeepCopyInto(out *SnapshotContentStatus) { + *out = *in + if in.SnapshotHandle != nil { + in, out := &in.SnapshotHandle, &out.SnapshotHandle + *out = new(string) + **out = **in + } + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]metav1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SnapshotContentStatus. +func (in *SnapshotContentStatus) DeepCopy() *SnapshotContentStatus { + if in == nil { + return nil + } + out := new(SnapshotContentStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *SnapshotList) DeepCopyInto(out *SnapshotList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]Snapshot, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SnapshotList. +func (in *SnapshotList) DeepCopy() *SnapshotList { + if in == nil { + return nil + } + out := new(SnapshotList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *SnapshotList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *SnapshotReference) DeepCopyInto(out *SnapshotReference) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SnapshotReference. 
+func (in *SnapshotReference) DeepCopy() *SnapshotReference {
+	if in == nil {
+		return nil
+	}
+	clone := &SnapshotReference{}
+	in.DeepCopyInto(clone)
+	return clone
+}
+
+// DeepCopyInto copies the receiver into out by value, then re-assigns PodRef. in must be non-nil.
+func (in *SnapshotSource) DeepCopyInto(out *SnapshotSource) {
+	*out = *in
+	out.PodRef = in.PodRef
+}
+
+// DeepCopy returns a newly allocated copy of the SnapshotSource, or nil when the receiver is nil.
+func (in *SnapshotSource) DeepCopy() *SnapshotSource {
+	if in == nil {
+		return nil
+	}
+	clone := &SnapshotSource{}
+	in.DeepCopyInto(clone)
+	return clone
+}
+
+// DeepCopyInto copies the receiver into out by value, then re-assigns Source. in must be non-nil.
+func (in *SnapshotSpec) DeepCopyInto(out *SnapshotSpec) {
+	*out = *in
+	out.Source = in.Source
+}
+
+// DeepCopy returns a newly allocated copy of the SnapshotSpec, or nil when the receiver is nil.
+func (in *SnapshotSpec) DeepCopy() *SnapshotSpec {
+	if in == nil {
+		return nil
+	}
+	clone := &SnapshotSpec{}
+	in.DeepCopyInto(clone)
+	return clone
+}
+
+// DeepCopyInto copies the receiver into out, allocating a fresh BoundSnapshotContentName string and Conditions slice. in must be non-nil.
+func (in *SnapshotStatus) DeepCopyInto(out *SnapshotStatus) {
+	*out = *in
+	if in.BoundSnapshotContentName != nil {
+		src, dst := &in.BoundSnapshotContentName, &out.BoundSnapshotContentName
+		*dst = new(string)
+		**dst = **src
+	}
+	if in.Conditions != nil {
+		src, dst := &in.Conditions, &out.Conditions
+		*dst = make([]metav1.Condition, len(*src))
+		for idx := range *src {
+			(*src)[idx].DeepCopyInto(&(*dst)[idx])
+		}
+	}
+}
+
+// DeepCopy returns a newly allocated copy of the SnapshotStatus, or nil when the receiver is nil.
+func (in *SnapshotStatus) DeepCopy() *SnapshotStatus {
+	if in == nil {
+		return nil
+	}
+	clone := &SnapshotStatus{}
+	in.DeepCopyInto(clone)
+	return clone
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *SpecTopologyConstraint) DeepCopyInto(out *SpecTopologyConstraint) {
 	*out = *in
diff --git a/deploy/operator/config/crd/bases/nvidia.com_snapshotcontents.yaml b/deploy/operator/config/crd/bases/nvidia.com_snapshotcontents.yaml
new file mode 100644
index 000000000000..99dfbc11dee1
--- /dev/null
+++ b/deploy/operator/config/crd/bases/nvidia.com_snapshotcontents.yaml
@@ -0,0 +1,180 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+---
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  annotations:
+    controller-gen.kubebuilder.io/version: v0.16.4
+    helm.sh/resource-policy: keep
+  name: snapshotcontents.nvidia.com
+spec:
+  group: nvidia.com
+  names:
+    kind: SnapshotContent
+    listKind: SnapshotContentList
+    plural: snapshotcontents
+    shortNames:
+    - snapcontent
+    singular: snapshotcontent
+  scope: Cluster
+  versions:
+  - additionalPrinterColumns:
+    - description: Bound Snapshot
+      jsonPath: .spec.snapshotRef.name
+      name: Snapshot
+      type: string
+    - description: Snapshot namespace
+      jsonPath: .spec.snapshotRef.namespace
+      name: Namespace
+      type: string
+    - description: Ready condition
+      jsonPath: .status.conditions[?(@.type=='Ready')].status
+      name: Ready
+      type: string
+    - jsonPath: .metadata.creationTimestamp
+      name: Age
+      type: date
+    name: v1alpha1
+    schema:
+      openAPIV3Schema:
+        description: |-
+          SnapshotContent is the Schema for the snapshotcontents API. It is the
+          cluster-scoped artifact-of-record for a captured container checkpoint.
+        properties:
+          apiVersion:
+            description: |-
+              APIVersion defines the versioned schema of this representation of an object.
+ Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: |- + SnapshotContentSpec defines the desired state of SnapshotContent. It is + populated by the node agent at creation time and is immutable thereafter. + properties: + snapshotRef: + description: |- + SnapshotRef is the back-pointer to the bound Snapshot. It may span + namespaces because SnapshotContent is cluster-scoped. + properties: + name: + description: Name of the referenced Snapshot. + type: string + namespace: + description: Namespace of the referenced Snapshot. + type: string + uid: + description: |- + UID of the referenced Snapshot, recorded at binding time to detect a + stale reference after a delete and recreate. + type: string + required: + - name + - namespace + type: object + source: + description: Source locates the physical artifact via a self-contained, opaque handle. + properties: + snapshotHandle: + description: |- + SnapshotHandle is a self-contained, opaque artifact locator. The v1alpha1 + PVC format is: + pvc://////versions/ + It fully locates the artifact without correlating any other field. + minLength: 1 + type: string + required: + - snapshotHandle + type: object + required: + - snapshotRef + - source + type: object + status: + description: SnapshotContentStatus defines the observed state of SnapshotContent. 
+ properties: + conditions: + description: |- + Conditions reflect the latest observations of the SnapshotContent's state. + Standard types are Ready and Failed. + items: + description: Condition contains details for one aspect of the current state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. 
+ maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + snapshotHandle: + description: |- + SnapshotHandle mirrors spec.source.snapshotHandle once the node agent has + verified the artifact. + type: string + type: object + type: object + x-kubernetes-validations: + - message: spec is immutable + rule: '!has(oldSelf.spec) || self.spec == oldSelf.spec' + served: true + storage: true + subresources: + status: {} diff --git a/deploy/operator/config/crd/bases/nvidia.com_snapshots.yaml b/deploy/operator/config/crd/bases/nvidia.com_snapshots.yaml new file mode 100644 index 000000000000..a5101b641309 --- /dev/null +++ b/deploy/operator/config/crd/bases/nvidia.com_snapshots.yaml @@ -0,0 +1,182 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.4 + helm.sh/resource-policy: keep + name: snapshots.nvidia.com +spec: + group: nvidia.com + names: + kind: Snapshot + listKind: SnapshotList + plural: snapshots + shortNames: + - snap + singular: snapshot + scope: Namespaced + versions: + - additionalPrinterColumns: + - description: Artifact identity + jsonPath: .spec.checkpointID + name: CheckpointID + type: string + - description: Bound SnapshotContent + jsonPath: .status.boundSnapshotContentName + name: Content + type: string + - description: Ready condition + jsonPath: .status.conditions[?(@.type=='Ready')].status + name: Ready + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + Snapshot is the Schema for the snapshots API. 
It is the namespaced binding + for a captured container checkpoint and is consumed by restore paths. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: |- + SnapshotSpec defines the desired state of Snapshot. + + Minimal "trigger" shape: it names what to capture (an existing pod) and the + artifact identity (CheckpointID). Capture parameters the node agent needs at + dump time (target container, storage base path) are read from the referenced + pod's existing annotations and mounts, not duplicated here. The spec is + immutable after creation. + properties: + checkpointID: + description: |- + CheckpointID is the stable artifact identity and the on-PVC artifact + subdirectory name (//versions//). It is + the primary key of the storage contract shared with the restore path and + is immutable after creation. + maxLength: 253 + minLength: 1 + type: string + source: + description: |- + Source identifies the captured workload. It is a struct (rather than an + inlined reference) so future source variants can be added additively. + properties: + podRef: + description: |- + PodRef references the pod, in the Snapshot's namespace, that is captured. + The operator prepares the pod (control volume, target-container annotation, + checkpoint storage mount) before creating the Snapshot. 
+ properties: + name: + description: Name of the source pod. + minLength: 1 + type: string + required: + - name + type: object + required: + - podRef + type: object + required: + - checkpointID + - source + type: object + status: + description: SnapshotStatus defines the observed state of Snapshot. + properties: + boundSnapshotContentName: + description: |- + BoundSnapshotContentName is the name of the cluster-scoped SnapshotContent + this Snapshot is bound to. It is nil until the agent has created the + content and recorded the binding. + type: string + conditions: + description: |- + Conditions reflect the latest observations of the Snapshot's state. + Standard types are Ready and Failed. + items: + description: Condition contains details for one aspect of the current state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. 
+ The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + type: object + type: object + x-kubernetes-validations: + - message: spec is immutable + rule: '!has(oldSelf.spec) || self.spec == oldSelf.spec' + served: true + storage: true + subresources: + status: {} diff --git a/deploy/operator/config/crd/kustomization.yaml b/deploy/operator/config/crd/kustomization.yaml index 315762dae038..4f6708220f69 100644 --- a/deploy/operator/config/crd/kustomization.yaml +++ b/deploy/operator/config/crd/kustomization.yaml @@ -21,6 +21,8 @@ resources: - bases/nvidia.com_dynamographdeployments.yaml - bases/nvidia.com_dynamomodels.yaml - bases/nvidia.com_dynamocheckpoints.yaml +- bases/nvidia.com_snapshots.yaml +- bases/nvidia.com_snapshotcontents.yaml #+kubebuilder:scaffold:crdkustomizeresource patches: []