diff --git a/.ai/spec/how/reconciler.md b/.ai/spec/how/reconciler.md index 62bd0997..c793b2c7 100644 --- a/.ai/spec/how/reconciler.md +++ b/.ai/spec/how/reconciler.md @@ -171,7 +171,7 @@ Audience: AI agents. Behavioral rules and phase semantics live in **what/** spec - **`SandboxProvider`:** Swappable claim/wait/release (tests can fake). Implementations: `SandboxManager` (sandbox-claim mode), `BarePodManager` (bare-pod mode). `SetStep` provides resolved step config before each `Claim` call. - **`PodSpecBuilder`:** Shared pod-spec assembly. Produces typed `corev1.PodSpec` from image + resolved step config. Used directly by `BarePodManager`; shared helper functions also used by `EnsureAgentTemplate` (unstructured path). - **`resolveProposal`:** Produces `resolvedWorkflow` with cached `Agent` + `LLMProvider` per name; applies per-stage agent overrides from `ProposalApproval` via `getStageOverrideAgent`; `Execution`/`Verification` steps nil when corresponding spec sections are zero. -- **`EnsureAgentTemplate`:** Deterministic derived `SandboxTemplate` name from hash of LLM spec, model, skills, MCP servers, required secrets, step, and **base template resourceVersion**. Patches pod template env/volumes for credentials, Vertex/Bedrock/Azure extras, skills image/paths, and MCP JSON env. GC older templates labeled for same agent+step. +- **`EnsureAgentTemplate`:** Deterministic derived `SandboxTemplate` name from hash of LLM spec, model, skills, MCP servers, required secrets, dataSource PVC, step, and **base template resourceVersion**. `dataSource` is extracted from `tools.DataSource` (set in `ToolsSpec`). Patches pod template env/volumes for credentials, Vertex/Bedrock/Azure extras, skills image/paths, MCP JSON env, `LIGHTSPEED_MODE`, and optional `/data/input` PVC mount. GC older templates labeled for same agent+step. --- diff --git a/.gitignore b/.gitignore index a9260aa3..5fcdbbbc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ # Binaries and local tool installs (Makefile uses ./bin for controller-gen, kustomize, etc.) /bin/ +/oc-agentic # IDE / OS .idea/ diff --git a/api/v1alpha1/proposal_types.go b/api/v1alpha1/proposal_types.go index 1c74ab53..869b3375 100644 --- a/api/v1alpha1/proposal_types.go +++ b/api/v1alpha1/proposal_types.go @@ -264,10 +264,42 @@ type ProposalStep struct { // for this step. Use this when different steps need different skills. // +optional Tools ToolsSpec `json:"tools,omitzero"` + + // timeoutMinutes sets the timeout for this step's sandbox agent call. + // This controls how long the operator waits for the sandbox pod to + // become ready and for the agent to complete its work. Increase this + // for long-running tools (e.g., IntelliAide RCA takes 10-30 minutes). + // Defaults to 5 minutes when omitted. + // + // Mutable: can be adjusted before approving a step. + // +optional + // +kubebuilder:validation:Minimum=1 + // +kubebuilder:validation:Maximum=60 + TimeoutMinutes int32 `json:"timeoutMinutes,omitempty"` } func (s ProposalStep) IsZero() bool { - return s.Agent == "" && s.Tools.IsZero() + return s.Agent == "" && s.Tools.IsZero() && s.TimeoutMinutes == 0 +} + +// DataSource references a pre-existing PersistentVolumeClaim containing +// input data for this proposal (e.g., must-gather bundles, diagnostic data). +// The PVC must already exist in the same namespace as the Proposal and be +// pre-populated with data before the Proposal is created. The operator +// mounts it read-only at a well-known path (/data/input) accessible to +// all skills in the sandbox pod. +type DataSource struct { + // claimName is the name of the PersistentVolumeClaim to mount. + // The PVC must exist in the same namespace as the Proposal. + // +required + // +kubebuilder:validation:MinLength=1 + // +kubebuilder:validation:MaxLength=253 + // +kubebuilder:validation:XValidation:rule="!format.dns1123Subdomain().validate(self).hasValue()",message="must be a valid DNS subdomain" + ClaimName string `json:"claimName,omitempty"` +} + +func (d DataSource) IsZero() bool { + return d.ClaimName == "" } // ProposalSpec defines the desired state of Proposal. @@ -281,9 +313,9 @@ func (s ProposalStep) IsZero() bool { // +kubebuilder:validation:XValidation:rule="!has(oldSelf.analysisOutput) || (has(self.analysisOutput) && self.analysisOutput == oldSelf.analysisOutput)",message="analysisOutput is immutable once set" // +kubebuilder:validation:XValidation:rule="!has(self.analysisOutput) || self.analysisOutput.mode != 'Minimal' || (!has(self.execution) && !has(self.verification))",message="analysisOutput mode Minimal is only allowed for analysis-only proposals (no execution or verification steps)" // +kubebuilder:validation:XValidation:rule="!has(oldSelf.tools) || (has(self.tools) && self.tools == oldSelf.tools)",message="tools is immutable once set" -// +kubebuilder:validation:XValidation:rule="!has(oldSelf.analysis) || (has(self.analysis) && self.analysis == oldSelf.analysis)",message="analysis is immutable once set" -// +kubebuilder:validation:XValidation:rule="!has(oldSelf.execution) || (has(self.execution) && self.execution == oldSelf.execution)",message="execution is immutable once set" -// +kubebuilder:validation:XValidation:rule="!has(oldSelf.verification) || (has(self.verification) && self.verification == oldSelf.verification)",message="verification is immutable once set" +// +kubebuilder:validation:XValidation:rule="!has(oldSelf.analysis) || (has(self.analysis) && self.analysis.agent == oldSelf.analysis.agent && self.analysis.tools == oldSelf.analysis.tools)",message="analysis agent and tools are immutable once set" +// +kubebuilder:validation:XValidation:rule="!has(oldSelf.execution) || (has(self.execution) && self.execution.agent == oldSelf.execution.agent && self.execution.tools == oldSelf.execution.tools)",message="execution agent and tools are immutable once set" +// +kubebuilder:validation:XValidation:rule="!has(oldSelf.verification) || (has(self.verification) && self.verification.agent == oldSelf.verification.agent && self.verification.tools == oldSelf.verification.tools)",message="verification agent and tools are immutable once set" type ProposalSpec struct { // request is the user's original request, alert description, or a // description of what triggered this proposal. This text is passed to @@ -331,9 +363,10 @@ type ProposalSpec struct { AnalysisOutput AnalysisOutput `json:"analysisOutput,omitzero"` // tools defines the default tools for all steps: skills images, - // MCP servers, and required secrets. Per-step tools - // (analysis.tools, execution.tools, verification.tools) replace - // this default for individual steps. + // MCP servers, required secrets, and an optional dataSource PVC. + // Per-step tools (analysis.tools, execution.tools, verification.tools) + // replace this default for individual steps, so a dataSource set in + // spec.analysis.tools is mounted only in the analysis sandbox. // // Immutable: the skills and secrets available to the agent are // fixed at creation. Changing tools mid-flight could violate the diff --git a/api/v1alpha1/tools_types.go b/api/v1alpha1/tools_types.go index 4955d28e..bd9135d6 100644 --- a/api/v1alpha1/tools_types.go +++ b/api/v1alpha1/tools_types.go @@ -109,12 +109,15 @@ type SecretRequirement struct { } // ToolsSpec defines the tools available to an agent in its sandbox pod. -// This includes skills images, MCP servers, and required secrets. +// This includes skills images, MCP servers, required secrets, and an +// optional data source PVC. // // ToolsSpec is specified on a Proposal either as a shared default // (spec.tools) or per-step (spec.analysis.tools, spec.execution.tools, // spec.verification.tools). Per-step tools replace the shared default -// for that step. +// for that step, so a dataSource set in spec.analysis.tools is mounted +// only in the analysis sandbox, while one in spec.tools is mounted in +// every step that does not override tools. // // +kubebuilder:validation:MinProperties=1 type ToolsSpec struct { @@ -147,8 +150,16 @@ type ToolsSpec struct { // +kubebuilder:validation:MinItems=1 // +kubebuilder:validation:MaxItems=20 RequiredSecrets []SecretRequirement `json:"requiredSecrets,omitempty"` + + // dataSource references a pre-existing PersistentVolumeClaim containing + // input data for this step (e.g., must-gather bundles, diagnostic data). + // The PVC must already exist in the same namespace as the Proposal and be + // pre-populated with data before the Proposal is created. The operator + // mounts it read-only at /data/input in the sandbox pod. + // +optional + DataSource DataSource `json:"dataSource,omitzero"` } func (t ToolsSpec) IsZero() bool { - return len(t.Skills) == 0 && len(t.MCPServers) == 0 && len(t.RequiredSecrets) == 0 + return len(t.Skills) == 0 && len(t.MCPServers) == 0 && len(t.RequiredSecrets) == 0 && t.DataSource.IsZero() } diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index f6720157..8016b467 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -544,6 +544,21 @@ func (in *AzureOpenAIConfig) DeepCopy() *AzureOpenAIConfig { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DataSource) DeepCopyInto(out *DataSource) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DataSource. +func (in *DataSource) DeepCopy() *DataSource { + if in == nil { + return nil + } + out := new(DataSource) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *DiagnosisResult) DeepCopyInto(out *DiagnosisResult) { *out = *in @@ -1601,6 +1616,7 @@ func (in *ToolsSpec) DeepCopyInto(out *ToolsSpec) { *out = make([]SecretRequirement, len(*in)) copy(*out, *in) } + out.DataSource = in.DataSource } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ToolsSpec. diff --git a/config/crd/bases/agentic.openshift.io_proposals.yaml b/config/crd/bases/agentic.openshift.io_proposals.yaml index da391135..aede511e 100644 --- a/config/crd/bases/agentic.openshift.io_proposals.yaml +++ b/config/crd/bases/agentic.openshift.io_proposals.yaml @@ -83,12 +83,46 @@ spec: - message: 'must be a valid DNS subdomain: lowercase alphanumeric characters, hyphens, and dots' rule: '!format.dns1123Subdomain().validate(self).hasValue()' + timeoutMinutes: + description: |- + timeoutMinutes sets the timeout for this step's sandbox agent call. + This controls how long the operator waits for the sandbox pod to + become ready and for the agent to complete its work. Increase this + for long-running tools (e.g., IntelliAide RCA takes 10-30 minutes). + Defaults to 5 minutes when omitted. + + Mutable: can be adjusted before approving a step. + format: int32 + maximum: 60 + minimum: 1 + type: integer tools: description: |- tools provides per-step tools that replace the shared spec.tools for this step. Use this when different steps need different skills. minProperties: 1 properties: + dataSource: + description: |- + dataSource references a pre-existing PersistentVolumeClaim containing + input data for this step (e.g., must-gather bundles, diagnostic data). + The PVC must already exist in the same namespace as the Proposal and be + pre-populated with data before the Proposal is created. The operator + mounts it read-only at /data/input in the sandbox pod. + properties: + claimName: + description: |- + claimName is the name of the PersistentVolumeClaim to mount. + The PVC must exist in the same namespace as the Proposal. + maxLength: 253 + minLength: 1 + type: string + x-kubernetes-validations: + - message: must be a valid DNS subdomain + rule: '!format.dns1123Subdomain().validate(self).hasValue()' + required: + - claimName + type: object mcpServers: description: |- mcpServers defines external MCP (Model Context Protocol) servers the @@ -500,12 +534,46 @@ spec: - message: 'must be a valid DNS subdomain: lowercase alphanumeric characters, hyphens, and dots' rule: '!format.dns1123Subdomain().validate(self).hasValue()' + timeoutMinutes: + description: |- + timeoutMinutes sets the timeout for this step's sandbox agent call. + This controls how long the operator waits for the sandbox pod to + become ready and for the agent to complete its work. Increase this + for long-running tools (e.g., IntelliAide RCA takes 10-30 minutes). + Defaults to 5 minutes when omitted. + + Mutable: can be adjusted before approving a step. + format: int32 + maximum: 60 + minimum: 1 + type: integer tools: description: |- tools provides per-step tools that replace the shared spec.tools for this step. Use this when different steps need different skills. minProperties: 1 properties: + dataSource: + description: |- + dataSource references a pre-existing PersistentVolumeClaim containing + input data for this step (e.g., must-gather bundles, diagnostic data). + The PVC must already exist in the same namespace as the Proposal and be + pre-populated with data before the Proposal is created. The operator + mounts it read-only at /data/input in the sandbox pod. + properties: + claimName: + description: |- + claimName is the name of the PersistentVolumeClaim to mount. + The PVC must exist in the same namespace as the Proposal. + maxLength: 253 + minLength: 1 + type: string + x-kubernetes-validations: + - message: must be a valid DNS subdomain + rule: '!format.dns1123Subdomain().validate(self).hasValue()' + required: + - claimName + type: object mcpServers: description: |- mcpServers defines external MCP (Model Context Protocol) servers the @@ -916,15 +984,37 @@ spec: tools: description: |- tools defines the default tools for all steps: skills images, - MCP servers, and required secrets. Per-step tools - (analysis.tools, execution.tools, verification.tools) replace - this default for individual steps. + MCP servers, required secrets, and an optional dataSource PVC. + Per-step tools (analysis.tools, execution.tools, verification.tools) + replace this default for individual steps, so a dataSource set in + spec.analysis.tools is mounted only in the analysis sandbox. Immutable: the skills and secrets available to the agent are fixed at creation. Changing tools mid-flight could violate the assumptions of an in-progress analysis or execution. minProperties: 1 properties: + dataSource: + description: |- + dataSource references a pre-existing PersistentVolumeClaim containing + input data for this step (e.g., must-gather bundles, diagnostic data). + The PVC must already exist in the same namespace as the Proposal and be + pre-populated with data before the Proposal is created. The operator + mounts it read-only at /data/input in the sandbox pod. + properties: + claimName: + description: |- + claimName is the name of the PersistentVolumeClaim to mount. + The PVC must exist in the same namespace as the Proposal. + maxLength: 253 + minLength: 1 + type: string + x-kubernetes-validations: + - message: must be a valid DNS subdomain + rule: '!format.dns1123Subdomain().validate(self).hasValue()' + required: + - claimName + type: object mcpServers: description: |- mcpServers defines external MCP (Model Context Protocol) servers the @@ -1294,12 +1384,46 @@ spec: - message: 'must be a valid DNS subdomain: lowercase alphanumeric characters, hyphens, and dots' rule: '!format.dns1123Subdomain().validate(self).hasValue()' + timeoutMinutes: + description: |- + timeoutMinutes sets the timeout for this step's sandbox agent call. + This controls how long the operator waits for the sandbox pod to + become ready and for the agent to complete its work. Increase this + for long-running tools (e.g., IntelliAide RCA takes 10-30 minutes). + Defaults to 5 minutes when omitted. + + Mutable: can be adjusted before approving a step. + format: int32 + maximum: 60 + minimum: 1 + type: integer tools: description: |- tools provides per-step tools that replace the shared spec.tools for this step. Use this when different steps need different skills. minProperties: 1 properties: + dataSource: + description: |- + dataSource references a pre-existing PersistentVolumeClaim containing + input data for this step (e.g., must-gather bundles, diagnostic data). + The PVC must already exist in the same namespace as the Proposal and be + pre-populated with data before the Proposal is created. The operator + mounts it read-only at /data/input in the sandbox pod. + properties: + claimName: + description: |- + claimName is the name of the PersistentVolumeClaim to mount. + The PVC must exist in the same namespace as the Proposal. + maxLength: 253 + minLength: 1 + type: string + x-kubernetes-validations: + - message: must be a valid DNS subdomain + rule: '!format.dns1123Subdomain().validate(self).hasValue()' + required: + - claimName + type: object mcpServers: description: |- mcpServers defines external MCP (Model Context Protocol) servers the @@ -1675,15 +1799,15 @@ spec: || (!has(self.execution) && !has(self.verification))' - message: tools is immutable once set rule: '!has(oldSelf.tools) || (has(self.tools) && self.tools == oldSelf.tools)' - - message: analysis is immutable once set - rule: '!has(oldSelf.analysis) || (has(self.analysis) && self.analysis - == oldSelf.analysis)' - - message: execution is immutable once set - rule: '!has(oldSelf.execution) || (has(self.execution) && self.execution - == oldSelf.execution)' - - message: verification is immutable once set - rule: '!has(oldSelf.verification) || (has(self.verification) && self.verification - == oldSelf.verification)' + - message: analysis agent and tools are immutable once set + rule: '!has(oldSelf.analysis) || (has(self.analysis) && self.analysis.agent + == oldSelf.analysis.agent && self.analysis.tools == oldSelf.analysis.tools)' + - message: execution agent and tools are immutable once set + rule: '!has(oldSelf.execution) || (has(self.execution) && self.execution.agent + == oldSelf.execution.agent && self.execution.tools == oldSelf.execution.tools)' + - message: verification agent and tools are immutable once set + rule: '!has(oldSelf.verification) || (has(self.verification) && self.verification.agent + == oldSelf.verification.agent && self.verification.tools == oldSelf.verification.tools)' status: description: status defines the observed state of Proposal. minProperties: 1 diff --git a/controller/proposal/agent.go b/controller/proposal/agent.go index e6698ae3..9b07b7b7 100644 --- a/controller/proposal/agent.go +++ b/controller/proposal/agent.go @@ -2,6 +2,7 @@ package proposal import ( "context" + "time" agenticv1alpha1 "github.com/openshift/lightspeed-agentic-operator/api/v1alpha1" ) @@ -42,10 +43,10 @@ type EscalationOutput struct { // HTTP implementations POST to /v1/agent/run — a step-agnostic // endpoint where all workflow context is in the request payload. type AgentCaller interface { - Analyze(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, requestText string, serviceAccount string) (*AnalysisOutput, error) - Execute(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, option *agenticv1alpha1.RemediationOption, serviceAccount string) (*ExecutionOutput, error) - Verify(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, option *agenticv1alpha1.RemediationOption, exec *ExecutionOutput, serviceAccount string) (*VerificationOutput, error) - Escalate(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, requestText string, serviceAccount string) (*EscalationOutput, error) + Analyze(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, requestText string, serviceAccount string, timeout time.Duration) (*AnalysisOutput, error) + Execute(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, option *agenticv1alpha1.RemediationOption, serviceAccount string, timeout time.Duration) (*ExecutionOutput, error) + Verify(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, option *agenticv1alpha1.RemediationOption, exec *ExecutionOutput, serviceAccount string, timeout time.Duration) (*VerificationOutput, error) + Escalate(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, requestText string, serviceAccount string, timeout time.Duration) (*EscalationOutput, error) ReleaseSandboxes(ctx context.Context, proposal *agenticv1alpha1.Proposal) error } @@ -53,7 +54,7 @@ type AgentCaller interface { // implementation (sandbox + HTTP) when the agent infrastructure is ready. type StubAgentCaller struct{} -func (s *StubAgentCaller) Analyze(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, _ string) (*AnalysisOutput, error) { +func (s *StubAgentCaller) Analyze(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, _ string, _ time.Duration) (*AnalysisOutput, error) { return &AnalysisOutput{ Success: true, Options: []agenticv1alpha1.RemediationOption{{ @@ -73,7 +74,7 @@ func (s *StubAgentCaller) Analyze(_ context.Context, _ *agenticv1alpha1.Proposal }, nil } -func (s *StubAgentCaller) Execute(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, _ string) (*ExecutionOutput, error) { +func (s *StubAgentCaller) Execute(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, _ string, _ time.Duration) (*ExecutionOutput, error) { return &ExecutionOutput{ Success: true, ActionsTaken: []agenticv1alpha1.ExecutionAction{{ @@ -88,7 +89,7 @@ func (s *StubAgentCaller) Execute(_ context.Context, _ *agenticv1alpha1.Proposal }, nil } -func (s *StubAgentCaller) Escalate(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, _ string) (*EscalationOutput, error) { +func (s *StubAgentCaller) Escalate(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, _ string, _ time.Duration) (*EscalationOutput, error) { return &EscalationOutput{ Success: true, Summary: "Stub escalation summary", @@ -100,7 +101,7 @@ func (s *StubAgentCaller) ReleaseSandboxes(_ context.Context, _ *agenticv1alpha1 return nil } -func (s *StubAgentCaller) Verify(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, _ *ExecutionOutput, _ string) (*VerificationOutput, error) { +func (s *StubAgentCaller) Verify(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, _ *ExecutionOutput, _ string, _ time.Duration) (*VerificationOutput, error) { return &VerificationOutput{ Success: true, Checks: []agenticv1alpha1.VerifyCheck{{ diff --git a/controller/proposal/client.go b/controller/proposal/client.go index 2ac08385..713b3335 100644 --- a/controller/proposal/client.go +++ b/controller/proposal/client.go @@ -79,10 +79,13 @@ type AgentHTTPClient struct { endpoint string } -func NewAgentHTTPClient(endpoint string) AgentHTTPClientInterface { +func NewAgentHTTPClient(endpoint string, timeout time.Duration) AgentHTTPClientInterface { + if timeout <= 0 { + timeout = defaultSandboxTimeout + } return &AgentHTTPClient{ httpClient: &http.Client{ - Timeout: 5 * time.Minute, + Timeout: timeout, Transport: &http.Transport{ TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, //nolint:gosec // internal cluster traffic }, @@ -92,11 +95,13 @@ func NewAgentHTTPClient(endpoint string) AgentHTTPClientInterface { } func (c *AgentHTTPClient) Run(ctx context.Context, systemPrompt, query string, outputSchema json.RawMessage, agentCtx *agentContext) (*agentRunResponse, error) { + timeoutMs := int64(c.httpClient.Timeout / time.Millisecond) req := agentRunRequest{ Query: query, SystemPrompt: systemPrompt, OutputSchema: outputSchema, Context: agentCtx, + TimeoutMs: &timeoutMs, } body, err := json.Marshal(req) diff --git a/controller/proposal/client_test.go b/controller/proposal/client_test.go index 51ef7e15..1515aef5 100644 --- a/controller/proposal/client_test.go +++ b/controller/proposal/client_test.go @@ -6,6 +6,7 @@ import ( "net/http" "net/http/httptest" "testing" + "time" agenticv1alpha1 "github.com/openshift/lightspeed-agentic-operator/api/v1alpha1" ) @@ -35,7 +36,7 @@ func TestAgentHTTPClient_RunSuccess(t *testing.T) { })) defer server.Close() - client := NewAgentHTTPClient(server.URL) + client := NewAgentHTTPClient(server.URL, 0) resp, err := client.Run(context.Background(), "You are an SRE agent", "check health", nil, nil) if err != nil { t.Fatalf("unexpected error: %v", err) @@ -52,7 +53,7 @@ func TestAgentHTTPClient_RunHTTPError(t *testing.T) { })) defer server.Close() - client := NewAgentHTTPClient(server.URL) + client := NewAgentHTTPClient(server.URL, 0) _, err := client.Run(context.Background(), "", "test", nil, nil) if err == nil { t.Fatal("expected error for HTTP 500") @@ -60,7 +61,7 @@ func TestAgentHTTPClient_RunHTTPError(t *testing.T) { } func TestAgentHTTPClient_RunConnectionError(t *testing.T) { - client := NewAgentHTTPClient("http://127.0.0.1:1") + client := NewAgentHTTPClient("http://127.0.0.1:1", 0) _, err := client.Run(context.Background(), "", "test", nil, nil) if err == nil { t.Fatal("expected error for connection failure") @@ -100,7 +101,7 @@ func TestAgentHTTPClient_RunWithExecutionResult(t *testing.T) { })) defer server.Close() - client := NewAgentHTTPClient(server.URL) + client := NewAgentHTTPClient(server.URL, 0) agentCtx := &agentContext{ TargetNamespaces: []string{"production"}, ExecutionResult: &agentExecutionResult{ @@ -135,7 +136,7 @@ func TestAgentHTTPClient_RunWithoutExecutionResult(t *testing.T) { })) defer server.Close() - client := NewAgentHTTPClient(server.URL) + client := NewAgentHTTPClient(server.URL, 0) agentCtx := &agentContext{ TargetNamespaces: []string{"production"}, } @@ -169,7 +170,7 @@ func TestAgentHTTPClient_RunWithContext(t *testing.T) { })) defer server.Close() - client := NewAgentHTTPClient(server.URL) + client := NewAgentHTTPClient(server.URL, 0) agentCtx := &agentContext{ TargetNamespaces: []string{"production"}, PreviousAttempts: []agentPreviousAttempt{{Attempt: 1, FailureReason: "timeout"}}, @@ -179,3 +180,64 @@ func TestAgentHTTPClient_RunWithContext(t *testing.T) { t.Fatalf("unexpected error: %v", err) } } + +// TestAgentHTTPClient_TimeoutMs_CustomTimeout verifies that a positive timeout +// is serialized as timeout_ms in the request body sent to the agent. +func TestAgentHTTPClient_TimeoutMs_CustomTimeout(t *testing.T) { + const customTimeout = 10 * time.Second + var gotTimeoutMs *int64 + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + var req agentRunRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + t.Fatalf("decode request: %v", err) + } + gotTimeoutMs = req.TimeoutMs + w.Header().Set("Content-Type", "application/json") + w.Write([]byte(`{"success": true}`)) + })) + defer server.Close() + + c := NewAgentHTTPClient(server.URL, customTimeout) + if _, err := c.Run(context.Background(), "", "ping", nil, nil); err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if gotTimeoutMs == nil { + t.Fatal("timeout_ms was not sent in request") + } + wantMs := int64(customTimeout / time.Millisecond) + if *gotTimeoutMs != wantMs { + t.Errorf("timeout_ms = %d, want %d", *gotTimeoutMs, wantMs) + } +} + +// TestAgentHTTPClient_TimeoutMs_ZeroFallsBackToDefault verifies that timeout <= 0 +// falls back to defaultSandboxTimeout and is reflected in timeout_ms. +func TestAgentHTTPClient_TimeoutMs_ZeroFallsBackToDefault(t *testing.T) { + var gotTimeoutMs *int64 + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + var req agentRunRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + t.Fatalf("decode request: %v", err) + } + gotTimeoutMs = req.TimeoutMs + w.Header().Set("Content-Type", "application/json") + w.Write([]byte(`{"success": true}`)) + })) + defer server.Close() + + c := NewAgentHTTPClient(server.URL, 0) // zero should fall back to defaultSandboxTimeout + if _, err := c.Run(context.Background(), "", "ping", nil, nil); err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if gotTimeoutMs == nil { + t.Fatal("timeout_ms was not sent in request") + } + wantMs := int64(defaultSandboxTimeout / time.Millisecond) + if *gotTimeoutMs != wantMs { + t.Errorf("timeout_ms = %d, want %d (defaultSandboxTimeout)", *gotTimeoutMs, wantMs) + } +} diff --git a/controller/proposal/handlers.go b/controller/proposal/handlers.go index b3662872..99e1d1eb 100644 --- a/controller/proposal/handlers.go +++ b/controller/proposal/handlers.go @@ -85,7 +85,8 @@ func (r *ProposalReconciler) handleAnalysis( return ctrl.Result{}, fmt.Errorf("%s: %w", ErrUpdateToAnalyzing, err) } - analysisResult, err := r.Agent.Analyze(ctx, proposal, resolved.Analysis, proposal.Spec.Request, defaultSandboxSA) + timeout := stepTimeout(resolved.Analysis) + analysisResult, err := r.Agent.Analyze(ctx, proposal, resolved.Analysis, proposal.Spec.Request, defaultSandboxSA, timeout) if err != nil { return r.failStep(ctx, proposal, agenticv1alpha1.ProposalConditionAnalyzed, err) } @@ -151,7 +152,8 @@ func (r *ProposalReconciler) handleRevision( revisionSuffix := buildRevisionContext(proposal) requestWithRevision := proposal.Spec.Request + "\n\n" + revisionSuffix - analysisResult, err := r.Agent.Analyze(ctx, proposal, resolved.Analysis, requestWithRevision, defaultSandboxSA) + timeout := stepTimeout(resolved.Analysis) + analysisResult, err := r.Agent.Analyze(ctx, proposal, resolved.Analysis, requestWithRevision, defaultSandboxSA, timeout) if err != nil { return r.failStep(ctx, proposal, agenticv1alpha1.ProposalConditionAnalyzed, err) } @@ -270,7 +272,8 @@ func (r *ProposalReconciler) handleExecution( return ctrl.Result{}, fmt.Errorf("%s: %w", ErrUpdateToExecuting, err) } - execResult, err := r.Agent.Execute(ctx, proposal, *resolved.Execution, selectedOption, execSA) + timeout := stepTimeout(*resolved.Execution) + execResult, err := r.Agent.Execute(ctx, proposal, *resolved.Execution, selectedOption, execSA, timeout) if err != nil { return r.failStep(ctx, proposal, agenticv1alpha1.ProposalConditionExecuted, err) } @@ -387,7 +390,8 @@ func (r *ProposalReconciler) handleVerification( } } - verifyResult, err := r.Agent.Verify(ctx, proposal, *resolved.Verification, selectedOption, execOutput, defaultSandboxSA) + timeout := stepTimeout(*resolved.Verification) + verifyResult, err := r.Agent.Verify(ctx, proposal, *resolved.Verification, selectedOption, execOutput, defaultSandboxSA, timeout) if err != nil { return r.failStep(ctx, proposal, agenticv1alpha1.ProposalConditionVerified, err) } @@ -586,7 +590,8 @@ func (r *ProposalReconciler) handleEscalation( } escalationText := buildEscalationRequest(proposal) - escalationResult, err := r.Agent.Escalate(ctx, proposal, step, escalationText, defaultSandboxSA) + timeout := stepTimeout(step) + escalationResult, err := r.Agent.Escalate(ctx, proposal, step, escalationText, defaultSandboxSA, timeout) if err != nil { return r.failStep(ctx, proposal, agenticv1alpha1.ProposalConditionEscalated, err) } diff --git a/controller/proposal/reconciler_test.go b/controller/proposal/reconciler_test.go index 4b7db746..2acf6620 100644 --- a/controller/proposal/reconciler_test.go +++ b/controller/proposal/reconciler_test.go @@ -31,36 +31,45 @@ type testAgentCaller struct { executeResult *ExecutionOutput verifyResult *VerificationOutput escalateResult *EscalationOutput + + lastAnalyzeTimeout time.Duration + lastExecuteTimeout time.Duration + lastVerifyTimeout time.Duration + lastEscalateTimeout time.Duration } func newTestAgentCaller() *testAgentCaller { stub := &StubAgentCaller{} - a, _ := stub.Analyze(context.Background(), nil, resolvedStep{}, "", "") - e, _ := stub.Execute(context.Background(), nil, resolvedStep{}, nil, "") - v, _ := stub.Verify(context.Background(), nil, resolvedStep{}, nil, nil, "") - esc, _ := stub.Escalate(context.Background(), nil, resolvedStep{}, "", "") + a, _ := stub.Analyze(context.Background(), nil, resolvedStep{}, "", "", 0) + e, _ := stub.Execute(context.Background(), nil, resolvedStep{}, nil, "", 0) + v, _ := stub.Verify(context.Background(), nil, resolvedStep{}, nil, nil, "", 0) + esc, _ := stub.Escalate(context.Background(), nil, resolvedStep{}, "", "", 0) return &testAgentCaller{analyzeResult: a, executeResult: e, verifyResult: v, escalateResult: esc} } -func (ta *testAgentCaller) Analyze(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, _ string) (*AnalysisOutput, error) { +func (ta *testAgentCaller) Analyze(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, _ string, timeout time.Duration) (*AnalysisOutput, error) { + ta.lastAnalyzeTimeout = timeout if ta.analyzeErr != nil { return nil, ta.analyzeErr } return ta.analyzeResult, nil } -func (ta *testAgentCaller) Execute(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, _ string) (*ExecutionOutput, error) { +func (ta *testAgentCaller) Execute(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, _ string, timeout time.Duration) (*ExecutionOutput, error) { + ta.lastExecuteTimeout = timeout if ta.executeErr != nil { return nil, ta.executeErr } return ta.executeResult, nil } -func (ta *testAgentCaller) Verify(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, _ *ExecutionOutput, _ string) (*VerificationOutput, error) { +func (ta *testAgentCaller) Verify(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, _ *ExecutionOutput, _ string, timeout time.Duration) (*VerificationOutput, error) { + ta.lastVerifyTimeout = timeout if ta.verifyErr != nil { return nil, ta.verifyErr } return ta.verifyResult, nil } -func (ta *testAgentCaller) Escalate(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, _ string) (*EscalationOutput, error) { +func (ta *testAgentCaller) Escalate(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, _ string, timeout time.Duration) (*EscalationOutput, error) { + ta.lastEscalateTimeout = timeout if ta.escalateErr != nil { return nil, ta.escalateErr } @@ -247,7 +256,7 @@ func newMockSandboxAgent(analysisJSON, executionJSON, verificationJSON string) ( caller := &SandboxAgentCaller{ Sandbox: sandbox, K8sClient: fc, - ClientFactory: func(_ string) AgentHTTPClientInterface { + ClientFactory: func(_ string, _ time.Duration) AgentHTTPClientInterface { resp := responses[callCount%len(responses)] callCount++ httpClient.response = &agentRunResponse{Response: json.RawMessage(resp)} @@ -496,3 +505,40 @@ func TestHandleSuspension(t *testing.T) { }) } } + +// TestReconcile_PropagatesStepTimeout verifies that timeoutMinutes set on a +// ProposalStep is resolved and forwarded to the AgentCaller as time.Duration. +func TestReconcile_PropagatesStepTimeout(t *testing.T) { + const wantMinutes int32 = 30 + + scheme := testScheme() + proposal := &agenticv1alpha1.Proposal{ + ObjectMeta: metav1.ObjectMeta{Name: "timeout-check", Namespace: "default"}, + Spec: agenticv1alpha1.ProposalSpec{ + Request: "Pod crashing", + Tools: testTools(), + Analysis: agenticv1alpha1.ProposalStep{ + Agent: "default", + TimeoutMinutes: wantMinutes, + }, + Execution: agenticv1alpha1.ProposalStep{Agent: "default"}, + Verification: agenticv1alpha1.ProposalStep{Agent: "default"}, + }, + } + + objs := append([]client.Object{proposal}, defaultObjects()...) + fc := fake.NewClientBuilder().WithScheme(scheme).WithObjects(objs...). + WithStatusSubresource(proposal, &agenticv1alpha1.AnalysisResult{}, &agenticv1alpha1.ExecutionResult{}, &agenticv1alpha1.VerificationResult{}, &agenticv1alpha1.EscalationResult{}).Build() + + caller := newTestAgentCaller() + r := &ProposalReconciler{Client: fc, Agent: caller, Namespace: "default"} + + if _, err := reconcileOnce(r, "timeout-check"); err != nil { + t.Fatalf("reconcile: %v", err) + } + + want := time.Duration(wantMinutes) * time.Minute + if caller.lastAnalyzeTimeout != want { + t.Errorf("Analyze timeout = %v, want %v", caller.lastAnalyzeTimeout, want) + } +} diff --git a/controller/proposal/resolve.go b/controller/proposal/resolve.go index 56c1fff2..3a754d66 100644 --- a/controller/proposal/resolve.go +++ b/controller/proposal/resolve.go @@ -19,9 +19,10 @@ const ( ) type resolvedStep struct { - Agent *agenticv1alpha1.Agent - LLM *agenticv1alpha1.LLMProvider - Tools *agenticv1alpha1.ToolsSpec + Agent *agenticv1alpha1.Agent + LLM *agenticv1alpha1.LLMProvider + Tools *agenticv1alpha1.ToolsSpec + TimeoutMinutes int32 } type resolvedWorkflow struct { @@ -80,14 +81,14 @@ func resolveProposal(ctx context.Context, c client.Client, proposal *agenticv1al if err != nil { return nil, fmt.Errorf("%s: %w", ErrResolveAnalysisStep, err) } - resolved.Analysis = resolvedStep{Agent: agent, LLM: llm, Tools: toolsForStep(proposal.Spec.Analysis)} + resolved.Analysis = resolvedStep{Agent: agent, LLM: llm, Tools: toolsForStep(proposal.Spec.Analysis), TimeoutMinutes: proposal.Spec.Analysis.TimeoutMinutes} if !proposal.Spec.Execution.IsZero() { agent, llm, err := resolveAgent(effectiveAgent(agenticv1alpha1.SandboxStepExecution, proposal.Spec.Execution)) if err != nil { return nil, fmt.Errorf("%s: %w", ErrResolveExecutionStep, err) } - resolved.Execution = &resolvedStep{Agent: agent, LLM: llm, Tools: toolsForStep(proposal.Spec.Execution)} + resolved.Execution = &resolvedStep{Agent: agent, LLM: llm, Tools: toolsForStep(proposal.Spec.Execution), TimeoutMinutes: proposal.Spec.Execution.TimeoutMinutes} } if !proposal.Spec.Verification.IsZero() { @@ -95,7 +96,7 @@ func resolveProposal(ctx context.Context, c client.Client, proposal *agenticv1al if err != nil { return nil, fmt.Errorf("%s: %w", ErrResolveVerificationStep, err) } - resolved.Verification = &resolvedStep{Agent: agent, LLM: llm, Tools: toolsForStep(proposal.Spec.Verification)} + resolved.Verification = &resolvedStep{Agent: agent, LLM: llm, Tools: toolsForStep(proposal.Spec.Verification), TimeoutMinutes: proposal.Spec.Verification.TimeoutMinutes} } return resolved, nil diff --git a/controller/proposal/sandbox_agent.go b/controller/proposal/sandbox_agent.go index fec57c4b..6ac58ed8 100644 --- a/controller/proposal/sandbox_agent.go +++ b/controller/proposal/sandbox_agent.go @@ -51,7 +51,7 @@ type verificationResponse struct { type SandboxAgentCaller struct { Sandbox SandboxProvider K8sClient client.Client - ClientFactory func(endpoint string) AgentHTTPClientInterface + ClientFactory func(endpoint string, timeout time.Duration) AgentHTTPClientInterface Namespace string Timeout time.Duration } @@ -59,7 +59,7 @@ type SandboxAgentCaller struct { func NewSandboxAgentCaller( sandbox SandboxProvider, k8sClient client.Client, - clientFactory func(endpoint string) AgentHTTPClientInterface, + clientFactory func(endpoint string, timeout time.Duration) AgentHTTPClientInterface, namespace string, ) *SandboxAgentCaller { return &SandboxAgentCaller{ @@ -71,13 +71,23 @@ func NewSandboxAgentCaller( } } +// stepTimeout returns the effective timeout for a single step's sandbox operation. +// Reads the step's timeoutMinutes when set; falls back to defaultSandboxTimeout. +// This is the single place where timeout policy is decided. +func stepTimeout(step resolvedStep) time.Duration { + if step.TimeoutMinutes > 0 { + return time.Duration(step.TimeoutMinutes) * time.Minute + } + return defaultSandboxTimeout +} + func stepString(step agenticv1alpha1.SandboxStep) string { return strings.ToLower(string(step)) } -func (s *SandboxAgentCaller) Analyze(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, requestText string, serviceAccount string) (*AnalysisOutput, error) { +func (s *SandboxAgentCaller) Analyze(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, requestText string, serviceAccount string, timeout time.Duration) (*AnalysisOutput, error) { query := buildAnalysisQuery(requestText, proposal) - raw, err := s.callWithSandbox(ctx, proposal, stepString(agenticv1alpha1.SandboxStepAnalysis), step, query, buildAgentContext(proposal), serviceAccount) + raw, err := s.callWithSandbox(ctx, proposal, stepString(agenticv1alpha1.SandboxStepAnalysis), step, query, buildAgentContext(proposal), serviceAccount, timeout) if err != nil { return nil, fmt.Errorf("%s: %w", ErrAnalysisAgentCall, err) } @@ -93,14 +103,14 @@ func (s *SandboxAgentCaller) Analyze(ctx context.Context, proposal *agenticv1alp }, nil } -func (s *SandboxAgentCaller) Execute(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, option *agenticv1alpha1.RemediationOption, serviceAccount string) (*ExecutionOutput, error) { +func (s *SandboxAgentCaller) Execute(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, option *agenticv1alpha1.RemediationOption, serviceAccount string, timeout time.Duration) (*ExecutionOutput, error) { agentCtx := buildAgentContext(proposal) if option != nil { agentCtx.ApprovedOption = option } query := buildExecutionQuery(option) - raw, err := s.callWithSandbox(ctx, proposal, stepString(agenticv1alpha1.SandboxStepExecution), step, query, agentCtx, serviceAccount) + raw, err := s.callWithSandbox(ctx, proposal, stepString(agenticv1alpha1.SandboxStepExecution), step, query, agentCtx, serviceAccount, timeout) if err != nil { return nil, fmt.Errorf("%s: %w", ErrExecutionAgentCall, err) } @@ -120,7 +130,7 @@ func (s *SandboxAgentCaller) Execute(ctx context.Context, proposal *agenticv1alp return out, nil } -func (s *SandboxAgentCaller) Verify(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, option *agenticv1alpha1.RemediationOption, exec *ExecutionOutput, serviceAccount string) (*VerificationOutput, error) { +func (s *SandboxAgentCaller) Verify(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, option *agenticv1alpha1.RemediationOption, exec *ExecutionOutput, serviceAccount string, timeout time.Duration) (*VerificationOutput, error) { agentCtx := buildAgentContext(proposal) if option != nil { agentCtx.ApprovedOption = option @@ -128,7 +138,7 @@ func (s *SandboxAgentCaller) Verify(ctx context.Context, proposal *agenticv1alph agentCtx.ExecutionResult = executionOutputToAgentResult(exec) query := buildVerificationQuery(option, exec) - raw, err := s.callWithSandbox(ctx, proposal, stepString(agenticv1alpha1.SandboxStepVerification), step, query, agentCtx, serviceAccount) + raw, err := s.callWithSandbox(ctx, proposal, stepString(agenticv1alpha1.SandboxStepVerification), step, query, agentCtx, serviceAccount, timeout) if err != nil { return nil, fmt.Errorf("%s: %w", ErrVerificationAgentCall, err) } @@ -145,9 +155,9 @@ func (s *SandboxAgentCaller) Verify(ctx context.Context, proposal *agenticv1alph }, nil } -func (s *SandboxAgentCaller) Escalate(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, requestText string, serviceAccount string) (*EscalationOutput, error) { +func (s *SandboxAgentCaller) Escalate(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, requestText string, serviceAccount string, timeout time.Duration) (*EscalationOutput, error) { agentCtx := buildAgentContext(proposal) - raw, err := s.callWithSandbox(ctx, proposal, stepString(agenticv1alpha1.SandboxStepEscalation), step, requestText, agentCtx, serviceAccount) + raw, err := s.callWithSandbox(ctx, proposal, stepString(agenticv1alpha1.SandboxStepEscalation), step, requestText, agentCtx, serviceAccount, timeout) if err != nil { return nil, fmt.Errorf("%s: %w", ErrEscalationAgentCall, err) } @@ -176,9 +186,14 @@ func (s *SandboxAgentCaller) callWithSandbox( query string, agentCtx *agentContext, serviceAccount string, + timeout time.Duration, ) (json.RawMessage, error) { s.Sandbox.SetStep(step.Agent, step.LLM, step.Tools, serviceAccount) + if timeout <= 0 { + timeout = defaultSandboxTimeout + } + claimName, err := s.Sandbox.Claim(ctx, proposal.Name, stepName, "") if err != nil { return nil, fmt.Errorf("%s: %w", ErrClaimSandbox, err) @@ -188,12 +203,11 @@ func (s *SandboxAgentCaller) callWithSandbox( // while the sandbox is still starting up s.patchSandboxInfo(ctx, proposal, stepName, claimName) - timeout := s.Timeout - if timeout == 0 { - timeout = defaultSandboxTimeout - } - - endpoint, err := s.Sandbox.WaitReady(ctx, claimName, timeout) + // Pod startup is an infrastructure concern unrelated to the agent's work + // budget. Using a fixed ceiling here ensures that the full user-configured + // timeout is available for the agent call itself, and avoids the effective + // wall-clock time being 2× the configured value. + endpoint, err := s.Sandbox.WaitReady(ctx, claimName, defaultSandboxTimeout) if err != nil { return nil, fmt.Errorf("%s: %w", ErrWaitForSandbox, err) } @@ -205,7 +219,7 @@ func (s *SandboxAgentCaller) callWithSandbox( schema := outputSchemaForStep(stepName, proposal) - client := s.ClientFactory(agentURL) + client := s.ClientFactory(agentURL, timeout) resp, err := client.Run(ctx, "", query, schema, agentCtx) if err != nil { return nil, err diff --git a/controller/proposal/sandbox_agent_test.go b/controller/proposal/sandbox_agent_test.go index 9a046058..d5052969 100644 --- a/controller/proposal/sandbox_agent_test.go +++ b/controller/proposal/sandbox_agent_test.go @@ -18,13 +18,14 @@ import ( // --- Hand-written mocks --- type mockSandboxProvider struct { - claimName string - claimErr error - endpoint string - readyErr error - releaseErr error - claimCalls int - releaseCalls int + claimName string + claimErr error + endpoint string + readyErr error + releaseErr error + claimCalls int + releaseCalls int + lastWaitReadyTimeout time.Duration } func (m *mockSandboxProvider) SetStep(_ *agenticv1alpha1.Agent, _ *agenticv1alpha1.LLMProvider, _ *agenticv1alpha1.ToolsSpec, _ string) { @@ -33,7 +34,8 @@ func (m *mockSandboxProvider) Claim(_ context.Context, _, _, _ string) (string, m.claimCalls++ return m.claimName, m.claimErr } -func (m *mockSandboxProvider) WaitReady(_ context.Context, _ string, _ time.Duration) (string, error) { +func (m *mockSandboxProvider) WaitReady(_ context.Context, _ string, d time.Duration) (string, error) { + m.lastWaitReadyTimeout = d return m.endpoint, m.readyErr } func (m *mockSandboxProvider) Release(_ context.Context, _ string) error { @@ -62,7 +64,7 @@ func newTestSandboxAgentCaller(sandbox *mockSandboxProvider, httpClient *mockHTT return &SandboxAgentCaller{ Sandbox: sandbox, K8sClient: fc, - ClientFactory: func(_ string) AgentHTTPClientInterface { return httpClient }, + ClientFactory: func(_ string, _ time.Duration) AgentHTTPClientInterface { return httpClient }, Namespace: "test-ns", Timeout: 5 * time.Minute, } @@ -77,7 +79,7 @@ func newTestSandboxAgentCallerWithProposal(sandbox *mockSandboxProvider, httpCli return &SandboxAgentCaller{ Sandbox: sandbox, K8sClient: fc, - ClientFactory: func(_ string) AgentHTTPClientInterface { return httpClient }, + ClientFactory: func(_ string, _ time.Duration) AgentHTTPClientInterface { return httpClient }, Namespace: "test-ns", Timeout: 5 * time.Minute, } @@ -107,7 +109,7 @@ func TestSandboxAgentCaller_Analyze_HappyPath(t *testing.T) { } caller := newTestSandboxAgentCaller(sandbox, httpClient) - result, err := caller.Analyze(context.Background(), testSandboxProposal(), testSandboxStep(), "Pod crashing", defaultSandboxSA) + result, err := caller.Analyze(context.Background(), testSandboxProposal(), testSandboxStep(), "Pod crashing", defaultSandboxSA, defaultSandboxTimeout) if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -132,7 +134,7 @@ func TestSandboxAgentCaller_Execute_HappyPath(t *testing.T) { caller := newTestSandboxAgentCaller(sandbox, httpClient) option := &agenticv1alpha1.RemediationOption{Title: "Fix it"} - result, err := caller.Execute(context.Background(), testSandboxProposal(), testSandboxStep(), option, defaultSandboxSA) + result, err := caller.Execute(context.Background(), testSandboxProposal(), testSandboxStep(), option, defaultSandboxSA, defaultSandboxTimeout) if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -156,7 +158,7 @@ func TestSandboxAgentCaller_Verify_HappyPath(t *testing.T) { } caller := newTestSandboxAgentCaller(sandbox, httpClient) - result, err := caller.Verify(context.Background(), testSandboxProposal(), testSandboxStep(), nil, nil, defaultSandboxSA) + result, err := caller.Verify(context.Background(), testSandboxProposal(), testSandboxStep(), nil, nil, defaultSandboxSA, defaultSandboxTimeout) if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -178,7 +180,7 @@ func TestSandboxAgentCaller_ClaimError(t *testing.T) { httpClient := &mockHTTPClient{} caller := newTestSandboxAgentCaller(sandbox, httpClient) - _, err := caller.Analyze(context.Background(), testSandboxProposal(), testSandboxStep(), "test", defaultSandboxSA) + _, err := caller.Analyze(context.Background(), testSandboxProposal(), testSandboxStep(), "test", defaultSandboxSA, defaultSandboxTimeout) if err == nil { t.Fatal("expected error") } @@ -195,7 +197,7 @@ func TestSandboxAgentCaller_WaitReadyError(t *testing.T) { httpClient := &mockHTTPClient{} caller := newTestSandboxAgentCaller(sandbox, httpClient) - _, err := caller.Execute(context.Background(), testSandboxProposal(), testSandboxStep(), nil, defaultSandboxSA) + _, err := caller.Execute(context.Background(), testSandboxProposal(), testSandboxStep(), nil, defaultSandboxSA, defaultSandboxTimeout) if err == nil { t.Fatal("expected error") } @@ -209,7 +211,7 @@ func TestSandboxAgentCaller_HTTPError(t *testing.T) { httpClient := &mockHTTPClient{err: fmt.Errorf("connection refused")} caller := newTestSandboxAgentCaller(sandbox, httpClient) - _, err := caller.Verify(context.Background(), testSandboxProposal(), testSandboxStep(), nil, nil, defaultSandboxSA) + _, err := caller.Verify(context.Background(), testSandboxProposal(), testSandboxStep(), nil, nil, defaultSandboxSA, defaultSandboxTimeout) if err == nil { t.Fatal("expected error") } @@ -225,7 +227,7 @@ func TestSandboxAgentCaller_ParseError(t *testing.T) { } caller := newTestSandboxAgentCaller(sandbox, httpClient) - _, err := caller.Analyze(context.Background(), testSandboxProposal(), testSandboxStep(), "test", defaultSandboxSA) + _, err := caller.Analyze(context.Background(), testSandboxProposal(), testSandboxStep(), "test", defaultSandboxSA, defaultSandboxTimeout) if err == nil { t.Fatal("expected parse error") } @@ -241,7 +243,7 @@ func TestSandboxAgentCaller_SandboxNotReleasedAfterCall(t *testing.T) { } caller := newTestSandboxAgentCaller(sandbox, httpClient) - _, _ = caller.Analyze(context.Background(), testSandboxProposal(), testSandboxStep(), "test", defaultSandboxSA) + _, _ = caller.Analyze(context.Background(), testSandboxProposal(), testSandboxStep(), "test", defaultSandboxSA, defaultSandboxTimeout) if sandbox.claimCalls != 1 { t.Errorf("Claim calls = %d, want 1", sandbox.claimCalls) @@ -282,7 +284,7 @@ func TestSandboxAgentCaller_ContextPropagation(t *testing.T) { }, } - _, _ = caller.Analyze(context.Background(), proposal, testSandboxStep(), "test", defaultSandboxSA) + _, _ = caller.Analyze(context.Background(), proposal, testSandboxStep(), "test", defaultSandboxSA, defaultSandboxTimeout) if httpClient.lastCtx == nil { t.Fatal("expected context to be set") @@ -318,7 +320,7 @@ func TestSandboxAgentCaller_VerifyPassesExecutionResult(t *testing.T) { }, } - _, _ = caller.Verify(context.Background(), testSandboxProposal(), testSandboxStep(), option, exec, defaultSandboxSA) + _, _ = caller.Verify(context.Background(), testSandboxProposal(), testSandboxStep(), option, exec, defaultSandboxSA, defaultSandboxTimeout) if httpClient.lastCtx == nil { t.Fatal("expected context to be set") @@ -353,7 +355,7 @@ func TestSandboxAgentCaller_VerifyNilExecLeavesExecutionResultNil(t *testing.T) } caller := newTestSandboxAgentCaller(sandbox, httpClient) - _, _ = caller.Verify(context.Background(), testSandboxProposal(), testSandboxStep(), nil, nil, defaultSandboxSA) + _, _ = caller.Verify(context.Background(), testSandboxProposal(), testSandboxStep(), nil, nil, defaultSandboxSA, defaultSandboxTimeout) if httpClient.lastCtx == nil { t.Fatal("expected context to be set") @@ -377,7 +379,7 @@ func TestSandboxAgentCaller_VerifyExecWithoutInlineVerification(t *testing.T) { }, } - _, _ = caller.Verify(context.Background(), testSandboxProposal(), testSandboxStep(), nil, exec, defaultSandboxSA) + _, _ = caller.Verify(context.Background(), testSandboxProposal(), testSandboxStep(), nil, exec, defaultSandboxSA, defaultSandboxTimeout) if httpClient.lastCtx.ExecutionResult == nil { t.Fatal("expected executionResult in context") @@ -395,7 +397,7 @@ func TestSandboxAgentCaller_ExecutePassesApprovedOption(t *testing.T) { caller := newTestSandboxAgentCaller(sandbox, httpClient) option := &agenticv1alpha1.RemediationOption{Title: "Scale up replicas"} - _, _ = caller.Execute(context.Background(), testSandboxProposal(), testSandboxStep(), option, defaultSandboxSA) + _, _ = caller.Execute(context.Background(), testSandboxProposal(), testSandboxStep(), option, defaultSandboxSA, defaultSandboxTimeout) if httpClient.lastCtx == nil || httpClient.lastCtx.ApprovedOption == nil { t.Fatal("expected approved option in context") @@ -414,7 +416,7 @@ func TestSandboxAgentCaller_AnalysisQueryFraming(t *testing.T) { } caller := newTestSandboxAgentCaller(sandbox, httpClient) - _, _ = caller.Analyze(context.Background(), testSandboxProposal(), testSandboxStep(), "Pod crashing with OOMKilled", defaultSandboxSA) + _, _ = caller.Analyze(context.Background(), testSandboxProposal(), testSandboxStep(), "Pod crashing with OOMKilled", defaultSandboxSA, defaultSandboxTimeout) if !strings.Contains(httpClient.lastQuery, "analysis agent") { t.Error("analysis query should contain role framing") @@ -444,7 +446,7 @@ func TestSandboxAgentCaller_ExecutionQueryFraming(t *testing.T) { } proposal := testSandboxProposal() proposal.Spec.Request = "Pod crashing with OOMKilled" - _, _ = caller.Execute(context.Background(), proposal, testSandboxStep(), option, defaultSandboxSA) + _, _ = caller.Execute(context.Background(), proposal, testSandboxStep(), option, defaultSandboxSA, defaultSandboxTimeout) if !strings.Contains(httpClient.lastQuery, "execution agent") { t.Error("execution query should contain role framing") @@ -473,7 +475,7 @@ func TestSandboxAgentCaller_VerificationQueryFraming(t *testing.T) { } proposal := testSandboxProposal() proposal.Spec.Request = "Pod crashing with OOMKilled" - _, _ = caller.Verify(context.Background(), proposal, testSandboxStep(), option, exec, defaultSandboxSA) + _, _ = caller.Verify(context.Background(), proposal, testSandboxStep(), option, exec, defaultSandboxSA, defaultSandboxTimeout) if !strings.Contains(httpClient.lastQuery, "verification agent") { t.Error("verification query should contain role framing") @@ -496,7 +498,7 @@ func TestSandboxAgentCaller_ExecutionQueryNilOption(t *testing.T) { } caller := newTestSandboxAgentCaller(sandbox, httpClient) - _, _ = caller.Execute(context.Background(), testSandboxProposal(), testSandboxStep(), nil, defaultSandboxSA) + _, _ = caller.Execute(context.Background(), testSandboxProposal(), testSandboxStep(), nil, defaultSandboxSA, defaultSandboxTimeout) if !strings.Contains(httpClient.lastQuery, "execution agent") { t.Error("execution query should still contain role framing with nil option") @@ -519,7 +521,7 @@ func TestSandboxAgentCaller_Analyze_PatchesSandboxInfo(t *testing.T) { proposal := testSandboxProposal() caller := newTestSandboxAgentCallerWithProposal(sandbox, httpClient, proposal) - _, err := caller.Analyze(context.Background(), proposal, testSandboxStep(), "Pod crashing", defaultSandboxSA) + _, err := caller.Analyze(context.Background(), proposal, testSandboxStep(), "Pod crashing", defaultSandboxSA, defaultSandboxTimeout) if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -548,7 +550,7 @@ func TestSandboxAgentCaller_Execute_PatchesSandboxInfo(t *testing.T) { proposal := testSandboxProposal() caller := newTestSandboxAgentCallerWithProposal(sandbox, httpClient, proposal) - _, err := caller.Execute(context.Background(), proposal, testSandboxStep(), nil, defaultSandboxSA) + _, err := caller.Execute(context.Background(), proposal, testSandboxStep(), nil, defaultSandboxSA, defaultSandboxTimeout) if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -574,7 +576,7 @@ func TestSandboxAgentCaller_Verify_PatchesSandboxInfo(t *testing.T) { proposal := testSandboxProposal() caller := newTestSandboxAgentCallerWithProposal(sandbox, httpClient, proposal) - _, err := caller.Verify(context.Background(), proposal, testSandboxStep(), nil, nil, defaultSandboxSA) + _, err := caller.Verify(context.Background(), proposal, testSandboxStep(), nil, nil, defaultSandboxSA, defaultSandboxTimeout) if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -602,7 +604,7 @@ func TestSandboxAgentCaller_SandboxInfoPatch_DoesNotBlockOnError(t *testing.T) { caller := newTestSandboxAgentCaller(sandbox, httpClient) proposal := testSandboxProposal() - _, err := caller.Analyze(context.Background(), proposal, testSandboxStep(), "test", defaultSandboxSA) + _, err := caller.Analyze(context.Background(), proposal, testSandboxStep(), "test", defaultSandboxSA, defaultSandboxTimeout) if err != nil { t.Fatalf("analysis should succeed even when sandbox info patch fails: %v", err) } @@ -730,3 +732,49 @@ func (m *trackingMockSandbox) Release(_ context.Context, claimName string) error } return nil } + +// TestSandboxAgentCaller_TimeoutPropagation verifies the two-phase timeout +// design: WaitReady (pod startup) always uses the fixed defaultSandboxTimeout +// regardless of the step's configured timeout, while ClientFactory receives the +// full user-configured timeout for the agent's work. +func TestSandboxAgentCaller_TimeoutPropagation(t *testing.T) { + const customTimeout = 20 * time.Minute + + sandbox := &mockSandboxProvider{ + claimName: "ls-analysis-test", + endpoint: "http://sandbox:8080", + } + httpClient := &mockHTTPClient{ + response: &agentRunResponse{ + Response: json.RawMessage(`{"success": true, "options": []}`), + }, + } + + var lastFactoryTimeout time.Duration + fc := fake.NewClientBuilder().WithScheme(testScheme()).Build() + _ = fc.Create(context.Background(), fakeBaseTemplate()) + caller := &SandboxAgentCaller{ + Sandbox: sandbox, + K8sClient: fc, + ClientFactory: func(_ string, d time.Duration) AgentHTTPClientInterface { + lastFactoryTimeout = d + return httpClient + }, + Namespace: "test-ns", + Timeout: defaultSandboxTimeout, + } + + _, err := caller.Analyze(context.Background(), testSandboxProposal(), testSandboxStep(), "test", "", customTimeout) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Pod startup always uses the fixed ceiling, not the step-level timeout. + if sandbox.lastWaitReadyTimeout != defaultSandboxTimeout { + t.Errorf("WaitReady timeout = %v, want defaultSandboxTimeout (%v)", sandbox.lastWaitReadyTimeout, defaultSandboxTimeout) + } + // The agent's work budget is the full configured timeout. + if lastFactoryTimeout != customTimeout { + t.Errorf("ClientFactory timeout = %v, want %v", lastFactoryTimeout, customTimeout) + } +} diff --git a/controller/proposal/sandbox_templates.go b/controller/proposal/sandbox_templates.go index fcb5d3ce..175b5321 100644 --- a/controller/proposal/sandbox_templates.go +++ b/controller/proposal/sandbox_templates.go @@ -73,10 +73,15 @@ var sandboxTemplateGVK = schema.GroupVersionKind{ } const ( - llmCredsMountPath = "/var/run/secrets/llm-credentials" - llmCredsVolumeName = "llm-credentials" - mcpHeadersMountRoot = "/var/secrets/mcp" - mcpServersEnvVar = "LIGHTSPEED_MCP_SERVERS" + agentModeEnvVar = "LIGHTSPEED_MODE" + vertexCredsMountPath = "/var/secrets/google" + vertexCredsFileName = "credentials.json" + llmCredsVolumeName = "llm-credentials" + llmCredsMountPath = "/var/run/secrets/llm-credentials" + mcpHeadersMountRoot = "/var/secrets/mcp" + mcpServersEnvVar = "LIGHTSPEED_MCP_SERVERS" + dataSourceMountPath = "/data/input" + dataSourceVolumeName = "lightspeed-data-source" LabelManaged = "agentic.openshift.io/managed" LabelBaseTemplate = "agentic.openshift.io/base-template" @@ -92,6 +97,7 @@ type templateHashInput struct { Skills []agenticv1alpha1.SkillsSource `json:"skills"` MCPServers []agenticv1alpha1.MCPServerConfig `json:"mcpServers,omitempty"` RequiredSecrets []agenticv1alpha1.SecretRequirement `json:"requiredSecrets,omitempty"` + DataSource *agenticv1alpha1.DataSource `json:"dataSource,omitempty"` Step string `json:"step"` BaseResourceVersion string `json:"baseRV"` ServiceAccount string `json:"serviceAccount"` @@ -103,6 +109,7 @@ func computeTemplateHash( skills []agenticv1alpha1.SkillsSource, mcpServers []agenticv1alpha1.MCPServerConfig, requiredSecrets []agenticv1alpha1.SecretRequirement, + dataSource *agenticv1alpha1.DataSource, step string, baseResourceVersion string, serviceAccount string, @@ -113,6 +120,7 @@ func computeTemplateHash( Skills: skills, MCPServers: mcpServers, RequiredSecrets: requiredSecrets, + DataSource: dataSource, Step: step, BaseResourceVersion: baseResourceVersion, ServiceAccount: serviceAccount, @@ -130,7 +138,8 @@ func agentTemplateName(step, agentName, hash string) string { } // EnsureAgentTemplate creates a SandboxTemplate derived from the base template -// with skills, LLM credentials, MCP servers, and required secrets from the CRD chain. +// with skills, LLM credentials, MCP servers, required secrets, and an optional +// dataSource PVC from the CRD chain. // Template name includes a config hash — same input = same template = no-op. // Old templates for the same agent+phase are garbage-collected. func EnsureAgentTemplate( @@ -162,13 +171,17 @@ func EnsureAgentTemplate( var skills []agenticv1alpha1.SkillsSource var mcpServers []agenticv1alpha1.MCPServerConfig var requiredSecrets []agenticv1alpha1.SecretRequirement + var dataSource *agenticv1alpha1.DataSource if tools != nil { skills = tools.Skills mcpServers = tools.MCPServers requiredSecrets = tools.RequiredSecrets + if !tools.DataSource.IsZero() { + dataSource = &tools.DataSource + } } - hash, err := computeTemplateHash(llm, agent.Spec.Model, skills, mcpServers, requiredSecrets, step, base.GetResourceVersion(), serviceAccount) + hash, err := computeTemplateHash(llm, agent.Spec.Model, skills, mcpServers, requiredSecrets, dataSource, step, base.GetResourceVersion(), serviceAccount) if err != nil { return "", fmt.Errorf("%s: %w", ErrComputeTemplateHash, err) } @@ -234,6 +247,12 @@ func EnsureAgentTemplate( } } + if dataSource != nil { + if err := patchDataSource(derived, dataSource); err != nil { + return "", fmt.Errorf("patch data source: %w", err) + } + } + if err := patchProbes(derived); err != nil { return "", fmt.Errorf("%s: %w", ErrPatchProbes, err) } @@ -616,6 +635,36 @@ func addSecretVolume(tmpl *unstructured.Unstructured, volumeName, secretName str return unstructured.SetNestedSlice(tmpl.Object, volumes, "spec", "podTemplate", "spec", "volumes") } +func addPVCVolume(tmpl *unstructured.Unstructured, volumeName, claimName string) error { + volumes, _, _ := unstructured.NestedSlice(tmpl.Object, "spec", "podTemplate", "spec", "volumes") + vol := map[string]any{ + "name": volumeName, + "persistentVolumeClaim": map[string]any{ + "claimName": claimName, + }, + } + for i, v := range volumes { + existing, ok := v.(map[string]any) + if !ok { + continue + } + if existing["name"] == volumeName { + volumes[i] = vol + return unstructured.SetNestedSlice(tmpl.Object, volumes, "spec", "podTemplate", "spec", "volumes") + } + } + volumes = append(volumes, vol) + return unstructured.SetNestedSlice(tmpl.Object, volumes, "spec", "podTemplate", "spec", "volumes") +} + +func patchDataSource(tmpl *unstructured.Unstructured, ds *agenticv1alpha1.DataSource) error { + volName := dataSourceVolumeName + if err := addPVCVolume(tmpl, volName, ds.ClaimName); err != nil { + return fmt.Errorf("add data source PVC volume: %w", err) + } + return addVolumeMount(tmpl, volName, dataSourceMountPath, true) +} + func addVolumeMount(tmpl *unstructured.Unstructured, name, mountPath string, readOnly bool) error { container, containers, err := firstContainer(tmpl) if err != nil { diff --git a/controller/proposal/sandbox_templates_test.go b/controller/proposal/sandbox_templates_test.go index 4ffb3fbc..09bb3452 100644 --- a/controller/proposal/sandbox_templates_test.go +++ b/controller/proposal/sandbox_templates_test.go @@ -77,7 +77,7 @@ func emptyTemplate() *unstructured.Unstructured { func mustHash(t *testing.T, llm *agenticv1alpha1.LLMProvider, model string, skills []agenticv1alpha1.SkillsSource, requiredSecrets []agenticv1alpha1.SecretRequirement, phase string) string { t.Helper() - h, err := computeTemplateHash(llm, model, skills, nil, requiredSecrets, phase, "", "") + h, err := computeTemplateHash(llm, model, skills, nil, requiredSecrets, nil, phase, "", "") if err != nil { t.Fatalf("computeTemplateHash: %v", err) } @@ -627,11 +627,11 @@ func TestComputeTemplateHash_DifferentBaseResourceVersion(t *testing.T) { llm := testLLMProvider(agenticv1alpha1.LLMProviderGoogleCloudVertex) skills := []agenticv1alpha1.SkillsSource{{Image: "quay.io/test/skills:latest"}} - h1, err := computeTemplateHash(llm, "claude-opus-4-6", skills, nil, nil, "analysis", "1000", "") + h1, err := computeTemplateHash(llm, "claude-opus-4-6", skills, nil, nil, nil, "analysis", "1000", "") if err != nil { t.Fatal(err) } - h2, err := computeTemplateHash(llm, "claude-opus-4-6", skills, nil, nil, "analysis", "2000", "") + h2, err := computeTemplateHash(llm, "claude-opus-4-6", skills, nil, nil, nil, "analysis", "2000", "") if err != nil { t.Fatal(err) } @@ -645,11 +645,11 @@ func TestComputeTemplateHash_SameBaseResourceVersion(t *testing.T) { llm := testLLMProvider(agenticv1alpha1.LLMProviderGoogleCloudVertex) skills := []agenticv1alpha1.SkillsSource{{Image: "quay.io/test/skills:latest"}} - h1, err := computeTemplateHash(llm, "claude-opus-4-6", skills, nil, nil, "analysis", "1000", "") + h1, err := computeTemplateHash(llm, "claude-opus-4-6", skills, nil, nil, nil, "analysis", "1000", "") if err != nil { t.Fatal(err) } - h2, err := computeTemplateHash(llm, "claude-opus-4-6", skills, nil, nil, "analysis", "1000", "") + h2, err := computeTemplateHash(llm, "claude-opus-4-6", skills, nil, nil, nil, "analysis", "1000", "") if err != nil { t.Fatal(err) } @@ -730,3 +730,89 @@ func TestPatchProbes(t *testing.T) { } }) } + +// TestComputeTemplateHash_DataSource verifies that a non-nil dataSource changes +// the hash, and that two different claimNames produce two different hashes. +// DataSource is optional — nil is valid and covered by all other hash tests above. +func TestComputeTemplateHash_DataSource(t *testing.T) { + llm := testLLMProvider(agenticv1alpha1.LLMProviderGoogleCloudVertex) + skills := []agenticv1alpha1.SkillsSource{{Image: "quay.io/test/skills:latest"}} + + hNoDS, err := computeTemplateHash(llm, "claude-opus-4-6", skills, nil, nil, nil, "analysis", "", "") + if err != nil { + t.Fatalf("computeTemplateHash (nil dataSource): %v", err) + } + + ds1 := &agenticv1alpha1.DataSource{ClaimName: "pvc-alpha"} + hDS1, err := computeTemplateHash(llm, "claude-opus-4-6", skills, nil, nil, ds1, "analysis", "", "") + if err != nil { + t.Fatalf("computeTemplateHash (dataSource pvc-alpha): %v", err) + } + + ds2 := &agenticv1alpha1.DataSource{ClaimName: "pvc-beta"} + hDS2, err := computeTemplateHash(llm, "claude-opus-4-6", skills, nil, nil, ds2, "analysis", "", "") + if err != nil { + t.Fatalf("computeTemplateHash (dataSource pvc-beta): %v", err) + } + + if hNoDS == hDS1 { + t.Error("nil dataSource and non-nil dataSource should produce different hashes") + } + if hDS1 == hDS2 { + t.Error("different claimNames should produce different hashes") + } +} + +// TestPatchDataSource_MountsReadOnly verifies that patchDataSource adds a PVC +// volume with the correct claimName and mounts it read-only at /data/input. +func TestPatchDataSource_MountsReadOnly(t *testing.T) { + tmpl := emptyTemplate() + ds := &agenticv1alpha1.DataSource{ClaimName: "my-must-gather-pvc"} + + if err := patchDataSource(tmpl, ds); err != nil { + t.Fatalf("patchDataSource: %v", err) + } + + // Verify volume was added with the correct name and claimName. + volumes, _, _ := unstructured.NestedSlice(tmpl.Object, "spec", "podTemplate", "spec", "volumes") + var foundVol bool + for _, v := range volumes { + vol, ok := v.(map[string]any) + if !ok { + continue + } + if vol["name"] != dataSourceVolumeName { + continue + } + foundVol = true + pvc, ok := vol["persistentVolumeClaim"].(map[string]any) + if !ok { + t.Fatalf("volume %q has no persistentVolumeClaim", dataSourceVolumeName) + } + if got := pvc["claimName"]; got != "my-must-gather-pvc" { + t.Errorf("claimName = %q, want %q", got, "my-must-gather-pvc") + } + } + if !foundVol { + t.Errorf("volume %q not found after patchDataSource", dataSourceVolumeName) + } + + // Verify volumeMount is read-only at /data/input. + mounts := getVolumeMounts(tmpl) + var foundMount bool + for _, m := range mounts { + if m["mountPath"] != dataSourceMountPath { + continue + } + foundMount = true + if m["name"] != dataSourceVolumeName { + t.Errorf("mount name = %q, want %q", m["name"], dataSourceVolumeName) + } + if ro, _ := m["readOnly"].(bool); !ro { + t.Error("dataSource mount should be readOnly=true") + } + } + if !foundMount { + t.Errorf("volumeMount at %q not found after patchDataSource", dataSourceMountPath) + } +} diff --git a/examples/setup/09-intelliaide-proposals.yaml b/examples/setup/09-intelliaide-proposals.yaml new file mode 100644 index 00000000..5c488e78 --- /dev/null +++ b/examples/setup/09-intelliaide-proposals.yaml @@ -0,0 +1,231 @@ +# IntelliAide RCA Proposals +# +# IntelliAide is a live-cluster / must-gather RCA pipeline that runs a 3-pass +# analysis (High → Medium → Low priority) using Claude as the orchestrator. +# +# Integration model: +# User triggers a Proposal CR → operator launches analysis sandbox +# → Claude reads /app/skills/intelliaide/SKILL.md +# → Claude calls skill scripts (extract_cluster, select_files, analyze_data, perform_rca) +# → All LLM reasoning is performed by the orchestrating Claude session +# → Claude maps results into standard diagnosis/proposal fields +# +# Workflow shape: analysis ONLY (advisory). +# IntelliAide returns diagnosis and recommendations. There is no automated +# execution or verification step — humans decide on follow-up actions. +# +# Output rendering: +# RCA findings are mapped into the standard diagnosis/proposal fields so they +# render with MarkdownText in the console UI. No custom outputSchema is used. +# +# IntelliAide triggering: +# The operator is skill-agnostic — it mounts the skills image and passes +# spec.request to the agent. IntelliAide is invoked by the agent itself when +# it reads the SKILL.md from the mounted skills image. No keyword routing +# is done by the operator. Phrase the request to describe the problem; the +# agent decides which skill to invoke. +# +# Modes: +# Live mode (default) — IntelliAide queries live cluster state via kubectl/oc. +# No dataSource needed. +# Must-gather mode — Provide a pre-populated PVC via tools.dataSource. +# The operator mounts it read-only at /data/input. +# Mention the bundle path in spec.request so the agent +# knows to analyze it instead of collecting live data. +# +# Pre-requisites (cluster admin, one-time): +# 1. Build and push skills image: +# podman build -f Dockerfile.skills \ +# -t quay.io//intelliaide-skills:latest . +# podman push quay.io//intelliaide-skills:latest +# # Retag and push to the in-cluster registry so Proposals can pull it: +# podman tag quay.io//intelliaide-skills:latest \ +# image-registry.openshift-image-registry.svc:5000/openshift-lightspeed/lightspeed-skills:latest +# podman push \ +# image-registry.openshift-image-registry.svc:5000/openshift-lightspeed/lightspeed-skills:latest +# 2. Label openshift-lightspeed namespace (Kubernetes 1.21+ does this +# automatically via kubernetes.io/metadata.name): +# oc label namespace openshift-lightspeed \ +# kubernetes.io/metadata.name=openshift-lightspeed --overwrite +# +# Trigger: +# kubectl apply -f 09-intelliaide-proposals.yaml +# oc agentic proposal approve -n openshift-lightspeed --stage=analysis + +--- +# etcd RCA — etcd member health / leader election failures +apiVersion: agentic.openshift.io/v1alpha1 +kind: Proposal +metadata: + name: intelliaide-rca-etcd + namespace: openshift-lightspeed + labels: + agentic.openshift.io/source: intelliaide + agentic.openshift.io/component: etcd + agentic.openshift.io/mode: live +spec: + request: | + Perform root cause analysis for the following issue: + One or more etcd pods are not ready, or etcd endpoint health is degraded + in namespace openshift-etcd. Investigate the root cause using live cluster + data and provide remediation options. + + targetNamespaces: + - openshift-etcd + + tools: + skills: + - image: image-registry.openshift-image-registry.svc:5000/openshift-lightspeed/lightspeed-skills:latest + + analysis: + agent: smart + # IntelliAide RCA takes 10-30 minutes; override the default 5-min timeout. + timeoutMinutes: 30 + +--- +# Cluster update failure RCA +apiVersion: agentic.openshift.io/v1alpha1 +kind: Proposal +metadata: + name: intelliaide-rca-update-failure + namespace: openshift-lightspeed + labels: + agentic.openshift.io/source: intelliaide + agentic.openshift.io/component: cluster-version-operator + agentic.openshift.io/mode: live +spec: + request: | + Perform root cause analysis for the following issue: + OpenShift cluster update has stalled or failed. ClusterVersion reports + Progressing=True with no forward progress, or Degraded=True. Investigate + the root cause using live cluster data and recommend remediation steps. + + targetNamespaces: + - openshift-cluster-version + - openshift-machine-config-operator + + tools: + skills: + - image: image-registry.openshift-image-registry.svc:5000/openshift-lightspeed/lightspeed-skills:latest + + analysis: + agent: smart + timeoutMinutes: 30 + +--- +# Live-cluster RCA — intermittent router/ingress latency spikes +# +# Tricky because the symptoms (HTTP 504s, slow responses) appear at the +# application layer but the underlying cause is almost always one of: +# - HAProxy router thread saturation from a single high-fan-out Service +# - A misconfigured keepalive / timeout on one backend pool causing cascading +# connection exhaustion across unrelated routes +# - An mTLS re-handshake storm triggered by certificate rotation on a subset +# of pods, which the router cannot distinguish from normal slow backends +# +# IntelliAide runs in LIVE mode so it reads current router metrics, endpoint +# slice states, and HAProxy stats in real time without collecting a must-gather. +apiVersion: agentic.openshift.io/v1alpha1 +kind: Proposal +metadata: + name: intelliaide-rca-router-latency + namespace: openshift-lightspeed + labels: + agentic.openshift.io/source: intelliaide + agentic.openshift.io/component: ingress + agentic.openshift.io/mode: live +spec: + request: | + Perform root cause analysis for the following issue: + Applications behind the OpenShift router are intermittently experiencing + HTTP 504 Gateway Timeout and elevated P99 latency (>2 s) during normal + business hours, even though individual pod health checks report healthy. + The issue is not reproducible on every request and affects multiple routes + in the same namespace. No recent changes were made to the Deployments or + Services involved. Investigate the root cause using live cluster data and + provide remediation options. + + targetNamespaces: + - openshift-ingress + - openshift-ingress-operator + + tools: + skills: + - image: image-registry.openshift-image-registry.svc:5000/openshift-lightspeed/lightspeed-skills:latest + + analysis: + agent: smart + timeoutMinutes: 30 + +--- +# General RCA — operator-managed workload degradation (no dedicated template) +# +# Use this as a starting point when no specific proposal template fits. +# Covers scenarios such as: Deployment rollout stuck, pods CrashLoopBackOff, +# OOMKill cycles, PVC mount failures, or admission webhook rejections that +# are not caused by a single obvious misconfiguration. IntelliAide examines +# the affected namespace holistically and correlates events, logs, and +# resource state across the 3-pass priority model. +apiVersion: agentic.openshift.io/v1alpha1 +kind: Proposal +metadata: + name: intelliaide-rca-general + namespace: openshift-lightspeed + labels: + agentic.openshift.io/source: intelliaide + agentic.openshift.io/mode: live +spec: + request: | + Perform a deeper root cause analysis for the following issue: + One or more workloads in the cluster are degraded or unavailable and the + cause is not immediately obvious from describe output or recent events. + Use IntelliAide to collect and correlate live cluster data across + namespaces, then identify the root cause and recommend remediation steps. + + targetNamespaces: [] # fill in the affected namespace(s) before applying + + tools: + skills: + - image: image-registry.openshift-image-registry.svc:5000/openshift-lightspeed/lightspeed-skills:latest + + analysis: + agent: smart + timeoutMinutes: 30 + +--- +# Must-gather mode RCA — analyze a pre-collected must-gather bundle +# +# Use this when live cluster access is not available or the issue is +# transient and you have already collected a must-gather bundle. +# The PVC must exist in the same namespace as the Proposal and be +# pre-populated with the must-gather data before applying this file. +# +# Pre-requisites: +# 1. Collect a must-gather bundle: +# oc adm must-gather --dest-dir=/tmp/must-gather +# 2. Create a PVC and populate it with the bundle data. +# 3. Apply this Proposal (after replacing ): +# kubectl apply -f 09-intelliaide-proposals.yaml +apiVersion: agentic.openshift.io/v1alpha1 +kind: Proposal +metadata: + name: intelliaide-rca-must-gather + namespace: openshift-lightspeed + labels: + agentic.openshift.io/source: intelliaide + agentic.openshift.io/mode: must-gather +spec: + request: | + Perform root cause analysis on the must-gather bundle mounted at + /data/input. Analyze the collected cluster state and identify the + root cause of the issue, then provide remediation options. + + tools: + skills: + - image: image-registry.openshift-image-registry.svc:5000/openshift-lightspeed/lightspeed-skills:latest + dataSource: + claimName: # replace with your PVC name + + analysis: + agent: smart + timeoutMinutes: 30