From 0468de85260207ad35d664d1c6dfd351b26dc94e Mon Sep 17 00:00:00 2001 From: Sakshi Patidar Date: Fri, 29 May 2026 14:37:33 +0530 Subject: [PATCH 1/3] feat: add per-proposal timeoutMinutes and configurable sandbox timeout --- .gitignore | 1 + api/v1alpha1/proposal_types.go | 10 +++ .../bases/agentic.openshift.io_proposals.yaml | 38 ++++++++++++ config/rbac/role.yaml | 8 +++ controller/proposal/agent.go | 17 ++--- controller/proposal/client.go | 9 ++- controller/proposal/client_test.go | 12 ++-- controller/proposal/handlers.go | 15 +++-- controller/proposal/reconciler_test.go | 18 +++--- controller/proposal/sandbox_agent.go | 47 ++++++++------ controller/proposal/sandbox_agent_test.go | 62 +++++++++---------- 11 files changed, 158 insertions(+), 79 deletions(-) diff --git a/.gitignore b/.gitignore index a9260aa3..5fcdbbbc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ # Binaries and local tool installs (Makefile uses ./bin for controller-gen, kustomize, etc.) /bin/ +/oc-agentic # IDE / OS .idea/ diff --git a/api/v1alpha1/proposal_types.go b/api/v1alpha1/proposal_types.go index 1c74ab53..56f47417 100644 --- a/api/v1alpha1/proposal_types.go +++ b/api/v1alpha1/proposal_types.go @@ -373,6 +373,16 @@ type ProposalSpec struct { // +kubebuilder:validation:MinLength=1 // +kubebuilder:validation:MaxLength=32768 RevisionFeedback string `json:"revisionFeedback,omitempty"` + + // timeoutMinutes overrides the default sandbox operation timeout for + // this proposal. When set, all sandbox wait and HTTP client timeouts + // use this value instead of the operator default (5 minutes). + // + // Immutable: timeout policy is fixed at creation. + // +optional + // +kubebuilder:validation:Minimum=1 + // +kubebuilder:validation:Maximum=120 + TimeoutMinutes *int32 `json:"timeoutMinutes,omitzero"` } // ProposalStatus defines the observed state of Proposal. All fields are diff --git a/config/crd/bases/agentic.openshift.io_proposals.yaml b/config/crd/bases/agentic.openshift.io_proposals.yaml index da391135..3dc17f0d 100644 --- a/config/crd/bases/agentic.openshift.io_proposals.yaml +++ b/config/crd/bases/agentic.openshift.io_proposals.yaml @@ -481,6 +481,28 @@ spec: x-kubernetes-validations: - message: schema is required when mode is Minimal rule: self.mode != 'Minimal' || has(self.schema) + dataSource: + description: |- + dataSource references a PVC containing pre-populated input data + (e.g., must-gather bundles, diagnostic data). The operator mounts + it read-only at /data/input in the sandbox pod. Skills discover + input data at this standard location. + + Immutable: input data source is fixed at creation. + properties: + claimName: + description: |- + claimName is the name of the PersistentVolumeClaim to mount. + The PVC must exist in the same namespace as the Proposal. + maxLength: 253 + minLength: 1 + type: string + x-kubernetes-validations: + - message: must be a valid DNS subdomain + rule: '!format.dns1123Subdomain().validate(self).hasValue()' + required: + - claimName + type: object execution: description: |- execution defines per-step configuration for the execution step. @@ -913,6 +935,19 @@ spec: x-kubernetes-validations: - message: each namespace must be a valid DNS label rule: self.all(ns, !format.dns1123Label().validate(ns).hasValue()) + timeoutMinutes: + description: |- + timeoutMinutes sets the per-step timeout for sandbox agent calls. + This controls how long the operator waits for the sandbox pod to + become ready and for the agent to complete its work. Increase this + for long-running tools (e.g., IntelliAide RCA takes 10-30 minutes). + Defaults to 5 minutes when omitted. + + Mutable: can be adjusted before approving a step. + format: int32 + maximum: 60 + minimum: 1 + type: integer tools: description: |- tools defines the default tools for all steps: skills images, @@ -1684,6 +1719,9 @@ spec: - message: verification is immutable once set rule: '!has(oldSelf.verification) || (has(self.verification) && self.verification == oldSelf.verification)' + - message: dataSource is immutable once set + rule: '!has(oldSelf.dataSource) || (has(self.dataSource) && self.dataSource + == oldSelf.dataSource)' status: description: status defines the observed state of Proposal. minProperties: 1 diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 718b2ff3..795513c6 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -33,6 +33,14 @@ rules: - create - delete - get +- apiGroups: + - "" + resources: + - persistentvolumeclaims + verbs: + - get + - list + - watch - apiGroups: - agentic.openshift.io resources: diff --git a/controller/proposal/agent.go b/controller/proposal/agent.go index e6698ae3..9b07b7b7 100644 --- a/controller/proposal/agent.go +++ b/controller/proposal/agent.go @@ -2,6 +2,7 @@ package proposal import ( "context" + "time" agenticv1alpha1 "github.com/openshift/lightspeed-agentic-operator/api/v1alpha1" ) @@ -42,10 +43,10 @@ type EscalationOutput struct { // HTTP implementations POST to /v1/agent/run — a step-agnostic // endpoint where all workflow context is in the request payload. type AgentCaller interface { - Analyze(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, requestText string, serviceAccount string) (*AnalysisOutput, error) - Execute(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, option *agenticv1alpha1.RemediationOption, serviceAccount string) (*ExecutionOutput, error) - Verify(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, option *agenticv1alpha1.RemediationOption, exec *ExecutionOutput, serviceAccount string) (*VerificationOutput, error) - Escalate(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, requestText string, serviceAccount string) (*EscalationOutput, error) + Analyze(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, requestText string, serviceAccount string, timeout time.Duration) (*AnalysisOutput, error) + Execute(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, option *agenticv1alpha1.RemediationOption, serviceAccount string, timeout time.Duration) (*ExecutionOutput, error) + Verify(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, option *agenticv1alpha1.RemediationOption, exec *ExecutionOutput, serviceAccount string, timeout time.Duration) (*VerificationOutput, error) + Escalate(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, requestText string, serviceAccount string, timeout time.Duration) (*EscalationOutput, error) ReleaseSandboxes(ctx context.Context, proposal *agenticv1alpha1.Proposal) error } @@ -53,7 +54,7 @@ type AgentCaller interface { // implementation (sandbox + HTTP) when the agent infrastructure is ready. type StubAgentCaller struct{} -func (s *StubAgentCaller) Analyze(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, _ string) (*AnalysisOutput, error) { +func (s *StubAgentCaller) Analyze(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, _ string, _ time.Duration) (*AnalysisOutput, error) { return &AnalysisOutput{ Success: true, Options: []agenticv1alpha1.RemediationOption{{ @@ -73,7 +74,7 @@ func (s *StubAgentCaller) Analyze(_ context.Context, _ *agenticv1alpha1.Proposal }, nil } -func (s *StubAgentCaller) Execute(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, _ string) (*ExecutionOutput, error) { +func (s *StubAgentCaller) Execute(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, _ string, _ time.Duration) (*ExecutionOutput, error) { return &ExecutionOutput{ Success: true, ActionsTaken: []agenticv1alpha1.ExecutionAction{{ @@ -88,7 +89,7 @@ func (s *StubAgentCaller) Execute(_ context.Context, _ *agenticv1alpha1.Proposal }, nil } -func (s *StubAgentCaller) Escalate(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, _ string) (*EscalationOutput, error) { +func (s *StubAgentCaller) Escalate(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, _ string, _ time.Duration) (*EscalationOutput, error) { return &EscalationOutput{ Success: true, Summary: "Stub escalation summary", @@ -100,7 +101,7 @@ func (s *StubAgentCaller) ReleaseSandboxes(_ context.Context, _ *agenticv1alpha1 return nil } -func (s *StubAgentCaller) Verify(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, _ *ExecutionOutput, _ string) (*VerificationOutput, error) { +func (s *StubAgentCaller) Verify(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, _ *ExecutionOutput, _ string, _ time.Duration) (*VerificationOutput, error) { return &VerificationOutput{ Success: true, Checks: []agenticv1alpha1.VerifyCheck{{ diff --git a/controller/proposal/client.go b/controller/proposal/client.go index 2ac08385..713b3335 100644 --- a/controller/proposal/client.go +++ b/controller/proposal/client.go @@ -79,10 +79,13 @@ type AgentHTTPClient struct { endpoint string } -func NewAgentHTTPClient(endpoint string) AgentHTTPClientInterface { +func NewAgentHTTPClient(endpoint string, timeout time.Duration) AgentHTTPClientInterface { + if timeout <= 0 { + timeout = defaultSandboxTimeout + } return &AgentHTTPClient{ httpClient: &http.Client{ - Timeout: 5 * time.Minute, + Timeout: timeout, Transport: &http.Transport{ TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, //nolint:gosec // internal cluster traffic }, @@ -92,11 +95,13 @@ func NewAgentHTTPClient(endpoint string) AgentHTTPClientInterface { } func (c *AgentHTTPClient) Run(ctx context.Context, systemPrompt, query string, outputSchema json.RawMessage, agentCtx *agentContext) (*agentRunResponse, error) { + timeoutMs := int64(c.httpClient.Timeout / time.Millisecond) req := agentRunRequest{ Query: query, SystemPrompt: systemPrompt, OutputSchema: outputSchema, Context: agentCtx, + TimeoutMs: &timeoutMs, } body, err := json.Marshal(req) diff --git a/controller/proposal/client_test.go b/controller/proposal/client_test.go index 51ef7e15..dee0cf5a 100644 --- a/controller/proposal/client_test.go +++ b/controller/proposal/client_test.go @@ -35,7 +35,7 @@ func TestAgentHTTPClient_RunSuccess(t *testing.T) { })) defer server.Close() - client := NewAgentHTTPClient(server.URL) + client := NewAgentHTTPClient(server.URL, 0) resp, err := client.Run(context.Background(), "You are an SRE agent", "check health", nil, nil) if err != nil { t.Fatalf("unexpected error: %v", err) @@ -52,7 +52,7 @@ func TestAgentHTTPClient_RunHTTPError(t *testing.T) { })) defer server.Close() - client := NewAgentHTTPClient(server.URL) + client := NewAgentHTTPClient(server.URL, 0) _, err := client.Run(context.Background(), "", "test", nil, nil) if err == nil { t.Fatal("expected error for HTTP 500") @@ -60,7 +60,7 @@ func TestAgentHTTPClient_RunHTTPError(t *testing.T) { } func TestAgentHTTPClient_RunConnectionError(t *testing.T) { - client := NewAgentHTTPClient("http://127.0.0.1:1") + client := NewAgentHTTPClient("http://127.0.0.1:1", 0) _, err := client.Run(context.Background(), "", "test", nil, nil) if err == nil { t.Fatal("expected error for connection failure") @@ -100,7 +100,7 @@ func TestAgentHTTPClient_RunWithExecutionResult(t *testing.T) { })) defer server.Close() - client := NewAgentHTTPClient(server.URL) + client := NewAgentHTTPClient(server.URL, 0) agentCtx := &agentContext{ TargetNamespaces: []string{"production"}, ExecutionResult: &agentExecutionResult{ @@ -135,7 +135,7 @@ func TestAgentHTTPClient_RunWithoutExecutionResult(t *testing.T) { })) defer server.Close() - client := NewAgentHTTPClient(server.URL) + client := NewAgentHTTPClient(server.URL, 0) agentCtx := &agentContext{ TargetNamespaces: []string{"production"}, } @@ -169,7 +169,7 @@ func TestAgentHTTPClient_RunWithContext(t *testing.T) { })) defer server.Close() - client := NewAgentHTTPClient(server.URL) + client := NewAgentHTTPClient(server.URL, 0) agentCtx := &agentContext{ TargetNamespaces: []string{"production"}, PreviousAttempts: []agentPreviousAttempt{{Attempt: 1, FailureReason: "timeout"}}, diff --git a/controller/proposal/handlers.go b/controller/proposal/handlers.go index b3662872..0db81c4f 100644 --- a/controller/proposal/handlers.go +++ b/controller/proposal/handlers.go @@ -85,7 +85,8 @@ func (r *ProposalReconciler) handleAnalysis( return ctrl.Result{}, fmt.Errorf("%s: %w", ErrUpdateToAnalyzing, err) } - analysisResult, err := r.Agent.Analyze(ctx, proposal, resolved.Analysis, proposal.Spec.Request, defaultSandboxSA) + timeout := proposalTimeout(proposal) + analysisResult, err := r.Agent.Analyze(ctx, proposal, resolved.Analysis, proposal.Spec.Request, defaultSandboxSA, timeout) if err != nil { return r.failStep(ctx, proposal, agenticv1alpha1.ProposalConditionAnalyzed, err) } @@ -151,7 +152,8 @@ func (r *ProposalReconciler) handleRevision( revisionSuffix := buildRevisionContext(proposal) requestWithRevision := proposal.Spec.Request + "\n\n" + revisionSuffix - analysisResult, err := r.Agent.Analyze(ctx, proposal, resolved.Analysis, requestWithRevision, defaultSandboxSA) + timeout := proposalTimeout(proposal) + analysisResult, err := r.Agent.Analyze(ctx, proposal, resolved.Analysis, requestWithRevision, defaultSandboxSA, timeout) if err != nil { return r.failStep(ctx, proposal, agenticv1alpha1.ProposalConditionAnalyzed, err) } @@ -270,7 +272,8 @@ func (r *ProposalReconciler) handleExecution( return ctrl.Result{}, fmt.Errorf("%s: %w", ErrUpdateToExecuting, err) } - execResult, err := r.Agent.Execute(ctx, proposal, *resolved.Execution, selectedOption, execSA) + timeout := proposalTimeout(proposal) + execResult, err := r.Agent.Execute(ctx, proposal, *resolved.Execution, selectedOption, execSA, timeout) if err != nil { return r.failStep(ctx, proposal, agenticv1alpha1.ProposalConditionExecuted, err) } @@ -387,7 +390,8 @@ func (r *ProposalReconciler) handleVerification( } } - verifyResult, err := r.Agent.Verify(ctx, proposal, *resolved.Verification, selectedOption, execOutput, defaultSandboxSA) + timeout := proposalTimeout(proposal) + verifyResult, err := r.Agent.Verify(ctx, proposal, *resolved.Verification, selectedOption, execOutput, defaultSandboxSA, timeout) if err != nil { return r.failStep(ctx, proposal, agenticv1alpha1.ProposalConditionVerified, err) } @@ -586,7 +590,8 @@ func (r *ProposalReconciler) handleEscalation( } escalationText := buildEscalationRequest(proposal) - escalationResult, err := r.Agent.Escalate(ctx, proposal, step, escalationText, defaultSandboxSA) + timeout := proposalTimeout(proposal) + escalationResult, err := r.Agent.Escalate(ctx, proposal, step, escalationText, defaultSandboxSA, timeout) if err != nil { return r.failStep(ctx, proposal, agenticv1alpha1.ProposalConditionEscalated, err) } diff --git a/controller/proposal/reconciler_test.go b/controller/proposal/reconciler_test.go index 4b7db746..8eec3125 100644 --- a/controller/proposal/reconciler_test.go +++ b/controller/proposal/reconciler_test.go @@ -35,32 +35,32 @@ type testAgentCaller struct { func newTestAgentCaller() *testAgentCaller { stub := &StubAgentCaller{} - a, _ := stub.Analyze(context.Background(), nil, resolvedStep{}, "", "") - e, _ := stub.Execute(context.Background(), nil, resolvedStep{}, nil, "") - v, _ := stub.Verify(context.Background(), nil, resolvedStep{}, nil, nil, "") - esc, _ := stub.Escalate(context.Background(), nil, resolvedStep{}, "", "") + a, _ := stub.Analyze(context.Background(), nil, resolvedStep{}, "", "", 0) + e, _ := stub.Execute(context.Background(), nil, resolvedStep{}, nil, "", 0) + v, _ := stub.Verify(context.Background(), nil, resolvedStep{}, nil, nil, "", 0) + esc, _ := stub.Escalate(context.Background(), nil, resolvedStep{}, "", "", 0) return &testAgentCaller{analyzeResult: a, executeResult: e, verifyResult: v, escalateResult: esc} } -func (ta *testAgentCaller) Analyze(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, _ string) (*AnalysisOutput, error) { +func (ta *testAgentCaller) Analyze(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, _ string, _ time.Duration) (*AnalysisOutput, error) { if ta.analyzeErr != nil { return nil, ta.analyzeErr } return ta.analyzeResult, nil } -func (ta *testAgentCaller) Execute(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, _ string) (*ExecutionOutput, error) { +func (ta *testAgentCaller) Execute(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, _ string, _ time.Duration) (*ExecutionOutput, error) { if ta.executeErr != nil { return nil, ta.executeErr } return ta.executeResult, nil } -func (ta *testAgentCaller) Verify(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, _ *ExecutionOutput, _ string) (*VerificationOutput, error) { +func (ta *testAgentCaller) Verify(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, _ *ExecutionOutput, _ string, _ time.Duration) (*VerificationOutput, error) { if ta.verifyErr != nil { return nil, ta.verifyErr } return ta.verifyResult, nil } -func (ta *testAgentCaller) Escalate(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, _ string) (*EscalationOutput, error) { +func (ta *testAgentCaller) Escalate(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, _ string, _ time.Duration) (*EscalationOutput, error) { if ta.escalateErr != nil { return nil, ta.escalateErr } @@ -247,7 +247,7 @@ func newMockSandboxAgent(analysisJSON, executionJSON, verificationJSON string) ( caller := &SandboxAgentCaller{ Sandbox: sandbox, K8sClient: fc, - ClientFactory: func(_ string) AgentHTTPClientInterface { + ClientFactory: func(_ string, _ time.Duration) AgentHTTPClientInterface { resp := responses[callCount%len(responses)] callCount++ httpClient.response = &agentRunResponse{Response: json.RawMessage(resp)} diff --git a/controller/proposal/sandbox_agent.go b/controller/proposal/sandbox_agent.go index fec57c4b..ae71444c 100644 --- a/controller/proposal/sandbox_agent.go +++ b/controller/proposal/sandbox_agent.go @@ -35,13 +35,13 @@ type analysisResponse struct { } type executionResponse struct { - Success bool `json:"success"` + Success bool `json:"success"` ActionsTaken []agenticv1alpha1.ExecutionAction `json:"actionsTaken"` Verification *agenticv1alpha1.ExecutionVerification `json:"verification,omitempty"` } type verificationResponse struct { - Success bool `json:"success"` + Success bool `json:"success"` Checks []agenticv1alpha1.VerifyCheck `json:"checks"` Summary string `json:"summary"` } @@ -51,7 +51,7 @@ type verificationResponse struct { type SandboxAgentCaller struct { Sandbox SandboxProvider K8sClient client.Client - ClientFactory func(endpoint string) AgentHTTPClientInterface + ClientFactory func(endpoint string, timeout time.Duration) AgentHTTPClientInterface Namespace string Timeout time.Duration } @@ -59,7 +59,7 @@ type SandboxAgentCaller struct { func NewSandboxAgentCaller( sandbox SandboxProvider, k8sClient client.Client, - clientFactory func(endpoint string) AgentHTTPClientInterface, + clientFactory func(endpoint string, timeout time.Duration) AgentHTTPClientInterface, namespace string, ) *SandboxAgentCaller { return &SandboxAgentCaller{ @@ -71,13 +71,23 @@ func NewSandboxAgentCaller( } } +// proposalTimeout returns the effective timeout for sandbox operations. +// Reads spec.timeoutMinutes when set; falls back to defaultSandboxTimeout. +// This is the single place where timeout policy is decided. +func proposalTimeout(proposal *agenticv1alpha1.Proposal) time.Duration { + if proposal.Spec.TimeoutMinutes != nil && *proposal.Spec.TimeoutMinutes > 0 { + return time.Duration(*proposal.Spec.TimeoutMinutes) * time.Minute + } + return defaultSandboxTimeout +} + func stepString(step agenticv1alpha1.SandboxStep) string { return strings.ToLower(string(step)) } -func (s *SandboxAgentCaller) Analyze(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, requestText string, serviceAccount string) (*AnalysisOutput, error) { +func (s *SandboxAgentCaller) Analyze(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, requestText string, serviceAccount string, timeout time.Duration) (*AnalysisOutput, error) { query := buildAnalysisQuery(requestText, proposal) - raw, err := s.callWithSandbox(ctx, proposal, stepString(agenticv1alpha1.SandboxStepAnalysis), step, query, buildAgentContext(proposal), serviceAccount) + raw, err := s.callWithSandbox(ctx, proposal, stepString(agenticv1alpha1.SandboxStepAnalysis), step, query, buildAgentContext(proposal), serviceAccount, timeout) if err != nil { return nil, fmt.Errorf("%s: %w", ErrAnalysisAgentCall, err) } @@ -93,14 +103,14 @@ func (s *SandboxAgentCaller) Analyze(ctx context.Context, proposal *agenticv1alp }, nil } -func (s *SandboxAgentCaller) Execute(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, option *agenticv1alpha1.RemediationOption, serviceAccount string) (*ExecutionOutput, error) { +func (s *SandboxAgentCaller) Execute(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, option *agenticv1alpha1.RemediationOption, serviceAccount string, timeout time.Duration) (*ExecutionOutput, error) { agentCtx := buildAgentContext(proposal) if option != nil { agentCtx.ApprovedOption = option } query := buildExecutionQuery(option) - raw, err := s.callWithSandbox(ctx, proposal, stepString(agenticv1alpha1.SandboxStepExecution), step, query, agentCtx, serviceAccount) + raw, err := s.callWithSandbox(ctx, proposal, stepString(agenticv1alpha1.SandboxStepExecution), step, query, agentCtx, serviceAccount, timeout) if err != nil { return nil, fmt.Errorf("%s: %w", ErrExecutionAgentCall, err) } @@ -120,7 +130,7 @@ func (s *SandboxAgentCaller) Execute(ctx context.Context, proposal *agenticv1alp return out, nil } -func (s *SandboxAgentCaller) Verify(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, option *agenticv1alpha1.RemediationOption, exec *ExecutionOutput, serviceAccount string) (*VerificationOutput, error) { +func (s *SandboxAgentCaller) Verify(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, option *agenticv1alpha1.RemediationOption, exec *ExecutionOutput, serviceAccount string, timeout time.Duration) (*VerificationOutput, error) { agentCtx := buildAgentContext(proposal) if option != nil { agentCtx.ApprovedOption = option @@ -128,7 +138,7 @@ func (s *SandboxAgentCaller) Verify(ctx context.Context, proposal *agenticv1alph agentCtx.ExecutionResult = executionOutputToAgentResult(exec) query := buildVerificationQuery(option, exec) - raw, err := s.callWithSandbox(ctx, proposal, stepString(agenticv1alpha1.SandboxStepVerification), step, query, agentCtx, serviceAccount) + raw, err := s.callWithSandbox(ctx, proposal, stepString(agenticv1alpha1.SandboxStepVerification), step, query, agentCtx, serviceAccount, timeout) if err != nil { return nil, fmt.Errorf("%s: %w", ErrVerificationAgentCall, err) } @@ -145,9 +155,9 @@ func (s *SandboxAgentCaller) Verify(ctx context.Context, proposal *agenticv1alph }, nil } -func (s *SandboxAgentCaller) Escalate(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, requestText string, serviceAccount string) (*EscalationOutput, error) { +func (s *SandboxAgentCaller) Escalate(ctx context.Context, proposal *agenticv1alpha1.Proposal, step resolvedStep, requestText string, serviceAccount string, timeout time.Duration) (*EscalationOutput, error) { agentCtx := buildAgentContext(proposal) - raw, err := s.callWithSandbox(ctx, proposal, stepString(agenticv1alpha1.SandboxStepEscalation), step, requestText, agentCtx, serviceAccount) + raw, err := s.callWithSandbox(ctx, proposal, stepString(agenticv1alpha1.SandboxStepEscalation), step, requestText, agentCtx, serviceAccount, timeout) if err != nil { return nil, fmt.Errorf("%s: %w", ErrEscalationAgentCall, err) } @@ -176,9 +186,15 @@ func (s *SandboxAgentCaller) callWithSandbox( query string, agentCtx *agentContext, serviceAccount string, + timeout time.Duration, ) (json.RawMessage, error) { s.Sandbox.SetStep(step.Agent, step.LLM, step.Tools, serviceAccount) + if timeout <= 0 { + timeout = defaultSandboxTimeout + } + + claimName, err := s.Sandbox.Claim(ctx, proposal.Name, stepName, "") if err != nil { return nil, fmt.Errorf("%s: %w", ErrClaimSandbox, err) @@ -188,11 +204,6 @@ func (s *SandboxAgentCaller) callWithSandbox( // while the sandbox is still starting up s.patchSandboxInfo(ctx, proposal, stepName, claimName) - timeout := s.Timeout - if timeout == 0 { - timeout = defaultSandboxTimeout - } - endpoint, err := s.Sandbox.WaitReady(ctx, claimName, timeout) if err != nil { return nil, fmt.Errorf("%s: %w", ErrWaitForSandbox, err) @@ -205,7 +216,7 @@ func (s *SandboxAgentCaller) callWithSandbox( schema := outputSchemaForStep(stepName, proposal) - client := s.ClientFactory(agentURL) + client := s.ClientFactory(agentURL, timeout) resp, err := client.Run(ctx, "", query, schema, agentCtx) if err != nil { return nil, err diff --git a/controller/proposal/sandbox_agent_test.go b/controller/proposal/sandbox_agent_test.go index 9a046058..20ed0775 100644 --- a/controller/proposal/sandbox_agent_test.go +++ b/controller/proposal/sandbox_agent_test.go @@ -60,11 +60,11 @@ func newTestSandboxAgentCaller(sandbox *mockSandboxProvider, httpClient *mockHTT fc := fake.NewClientBuilder().WithScheme(testScheme()).Build() _ = fc.Create(context.Background(), fakeBaseTemplate()) return &SandboxAgentCaller{ - Sandbox: sandbox, - K8sClient: fc, - ClientFactory: func(_ string) AgentHTTPClientInterface { return httpClient }, - Namespace: "test-ns", - Timeout: 5 * time.Minute, + Sandbox: sandbox, + K8sClient: fc, + ClientFactory: func(_ string, _ time.Duration) AgentHTTPClientInterface { return httpClient }, + Namespace: "test-ns", + Timeout: 5 * time.Minute, } } @@ -75,11 +75,11 @@ func newTestSandboxAgentCallerWithProposal(sandbox *mockSandboxProvider, httpCli Build() _ = fc.Create(context.Background(), fakeBaseTemplate()) return &SandboxAgentCaller{ - Sandbox: sandbox, - K8sClient: fc, - ClientFactory: func(_ string) AgentHTTPClientInterface { return httpClient }, - Namespace: "test-ns", - Timeout: 5 * time.Minute, + Sandbox: sandbox, + K8sClient: fc, + ClientFactory: func(_ string, _ time.Duration) AgentHTTPClientInterface { return httpClient }, + Namespace: "test-ns", + Timeout: 5 * time.Minute, } } @@ -107,7 +107,7 @@ func TestSandboxAgentCaller_Analyze_HappyPath(t *testing.T) { } caller := newTestSandboxAgentCaller(sandbox, httpClient) - result, err := caller.Analyze(context.Background(), testSandboxProposal(), testSandboxStep(), "Pod crashing", defaultSandboxSA) + result, err := caller.Analyze(context.Background(), testSandboxProposal(), testSandboxStep(), "Pod crashing", defaultSandboxSA, defaultSandboxTimeout) if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -132,7 +132,7 @@ func TestSandboxAgentCaller_Execute_HappyPath(t *testing.T) { caller := newTestSandboxAgentCaller(sandbox, httpClient) option := &agenticv1alpha1.RemediationOption{Title: "Fix it"} - result, err := caller.Execute(context.Background(), testSandboxProposal(), testSandboxStep(), option, defaultSandboxSA) + result, err := caller.Execute(context.Background(), testSandboxProposal(), testSandboxStep(), option, defaultSandboxSA, defaultSandboxTimeout) if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -156,7 +156,7 @@ func TestSandboxAgentCaller_Verify_HappyPath(t *testing.T) { } caller := newTestSandboxAgentCaller(sandbox, httpClient) - result, err := caller.Verify(context.Background(), testSandboxProposal(), testSandboxStep(), nil, nil, defaultSandboxSA) + result, err := caller.Verify(context.Background(), testSandboxProposal(), testSandboxStep(), nil, nil, defaultSandboxSA, defaultSandboxTimeout) if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -178,7 +178,7 @@ func TestSandboxAgentCaller_ClaimError(t *testing.T) { httpClient := &mockHTTPClient{} caller := newTestSandboxAgentCaller(sandbox, httpClient) - _, err := caller.Analyze(context.Background(), testSandboxProposal(), testSandboxStep(), "test", defaultSandboxSA) + _, err := caller.Analyze(context.Background(), testSandboxProposal(), testSandboxStep(), "test", defaultSandboxSA, defaultSandboxTimeout) if err == nil { t.Fatal("expected error") } @@ -195,7 +195,7 @@ func TestSandboxAgentCaller_WaitReadyError(t *testing.T) { httpClient := &mockHTTPClient{} caller := newTestSandboxAgentCaller(sandbox, httpClient) - _, err := caller.Execute(context.Background(), testSandboxProposal(), testSandboxStep(), nil, defaultSandboxSA) + _, err := caller.Execute(context.Background(), testSandboxProposal(), testSandboxStep(), nil, defaultSandboxSA, defaultSandboxTimeout) if err == nil { t.Fatal("expected error") } @@ -209,7 +209,7 @@ func TestSandboxAgentCaller_HTTPError(t *testing.T) { httpClient := &mockHTTPClient{err: fmt.Errorf("connection refused")} caller := newTestSandboxAgentCaller(sandbox, httpClient) - _, err := caller.Verify(context.Background(), testSandboxProposal(), testSandboxStep(), nil, nil, defaultSandboxSA) + _, err := caller.Verify(context.Background(), testSandboxProposal(), testSandboxStep(), nil, nil, defaultSandboxSA, defaultSandboxTimeout) if err == nil { t.Fatal("expected error") } @@ -225,7 +225,7 @@ func TestSandboxAgentCaller_ParseError(t *testing.T) { } caller := newTestSandboxAgentCaller(sandbox, httpClient) - _, err := caller.Analyze(context.Background(), testSandboxProposal(), testSandboxStep(), "test", defaultSandboxSA) + _, err := caller.Analyze(context.Background(), testSandboxProposal(), testSandboxStep(), "test", defaultSandboxSA, defaultSandboxTimeout) if err == nil { t.Fatal("expected parse error") } @@ -241,7 +241,7 @@ func TestSandboxAgentCaller_SandboxNotReleasedAfterCall(t *testing.T) { } caller := newTestSandboxAgentCaller(sandbox, httpClient) - _, _ = caller.Analyze(context.Background(), testSandboxProposal(), testSandboxStep(), "test", defaultSandboxSA) + _, _ = caller.Analyze(context.Background(), testSandboxProposal(), testSandboxStep(), "test", defaultSandboxSA, defaultSandboxTimeout) if sandbox.claimCalls != 1 { t.Errorf("Claim calls = %d, want 1", sandbox.claimCalls) @@ -282,7 +282,7 @@ func TestSandboxAgentCaller_ContextPropagation(t *testing.T) { }, } - _, _ = caller.Analyze(context.Background(), proposal, testSandboxStep(), "test", defaultSandboxSA) + _, _ = caller.Analyze(context.Background(), proposal, testSandboxStep(), "test", defaultSandboxSA, defaultSandboxTimeout) if httpClient.lastCtx == nil { t.Fatal("expected context to be set") @@ -318,7 +318,7 @@ func TestSandboxAgentCaller_VerifyPassesExecutionResult(t *testing.T) { }, } - _, _ = caller.Verify(context.Background(), testSandboxProposal(), testSandboxStep(), option, exec, defaultSandboxSA) + _, _ = caller.Verify(context.Background(), testSandboxProposal(), testSandboxStep(), option, exec, defaultSandboxSA, defaultSandboxTimeout) if httpClient.lastCtx == nil { t.Fatal("expected context to be set") @@ -353,7 +353,7 @@ func TestSandboxAgentCaller_VerifyNilExecLeavesExecutionResultNil(t *testing.T) } caller := newTestSandboxAgentCaller(sandbox, httpClient) - _, _ = caller.Verify(context.Background(), testSandboxProposal(), testSandboxStep(), nil, nil, defaultSandboxSA) + _, _ = caller.Verify(context.Background(), testSandboxProposal(), testSandboxStep(), nil, nil, defaultSandboxSA, defaultSandboxTimeout) if httpClient.lastCtx == nil { t.Fatal("expected context to be set") @@ -377,7 +377,7 @@ func TestSandboxAgentCaller_VerifyExecWithoutInlineVerification(t *testing.T) { }, } - _, _ = caller.Verify(context.Background(), testSandboxProposal(), testSandboxStep(), nil, exec, defaultSandboxSA) + _, _ = caller.Verify(context.Background(), testSandboxProposal(), testSandboxStep(), nil, exec, defaultSandboxSA, defaultSandboxTimeout) if httpClient.lastCtx.ExecutionResult == nil { t.Fatal("expected executionResult in context") @@ -395,7 +395,7 @@ func TestSandboxAgentCaller_ExecutePassesApprovedOption(t *testing.T) { caller := newTestSandboxAgentCaller(sandbox, httpClient) option := &agenticv1alpha1.RemediationOption{Title: "Scale up replicas"} - _, _ = caller.Execute(context.Background(), testSandboxProposal(), testSandboxStep(), option, defaultSandboxSA) + _, _ = caller.Execute(context.Background(), testSandboxProposal(), testSandboxStep(), option, defaultSandboxSA, defaultSandboxTimeout) if httpClient.lastCtx == nil || httpClient.lastCtx.ApprovedOption == nil { t.Fatal("expected approved option in context") @@ -414,7 +414,7 @@ func TestSandboxAgentCaller_AnalysisQueryFraming(t *testing.T) { } caller := newTestSandboxAgentCaller(sandbox, httpClient) - _, _ = caller.Analyze(context.Background(), testSandboxProposal(), testSandboxStep(), "Pod crashing with OOMKilled", defaultSandboxSA) + _, _ = caller.Analyze(context.Background(), testSandboxProposal(), testSandboxStep(), "Pod crashing with OOMKilled", defaultSandboxSA, defaultSandboxTimeout) if !strings.Contains(httpClient.lastQuery, "analysis agent") { t.Error("analysis query should contain role framing") @@ -444,7 +444,7 @@ func TestSandboxAgentCaller_ExecutionQueryFraming(t *testing.T) { } proposal := testSandboxProposal() proposal.Spec.Request = "Pod crashing with OOMKilled" - _, _ = caller.Execute(context.Background(), proposal, testSandboxStep(), option, defaultSandboxSA) + _, _ = caller.Execute(context.Background(), proposal, testSandboxStep(), option, defaultSandboxSA, defaultSandboxTimeout) if !strings.Contains(httpClient.lastQuery, "execution agent") { t.Error("execution query should contain role framing") @@ -473,7 +473,7 @@ func TestSandboxAgentCaller_VerificationQueryFraming(t *testing.T) { } proposal := testSandboxProposal() proposal.Spec.Request = "Pod crashing with OOMKilled" - _, _ = caller.Verify(context.Background(), proposal, testSandboxStep(), option, exec, defaultSandboxSA) + _, _ = caller.Verify(context.Background(), proposal, testSandboxStep(), option, exec, defaultSandboxSA, defaultSandboxTimeout) if !strings.Contains(httpClient.lastQuery, "verification agent") { t.Error("verification query should contain role framing") @@ -496,7 +496,7 @@ func TestSandboxAgentCaller_ExecutionQueryNilOption(t *testing.T) { } caller := newTestSandboxAgentCaller(sandbox, httpClient) - _, _ = caller.Execute(context.Background(), testSandboxProposal(), testSandboxStep(), nil, defaultSandboxSA) + _, _ = caller.Execute(context.Background(), testSandboxProposal(), testSandboxStep(), nil, defaultSandboxSA, defaultSandboxTimeout) if !strings.Contains(httpClient.lastQuery, "execution agent") { t.Error("execution query should still contain role framing with nil option") @@ -519,7 +519,7 @@ func TestSandboxAgentCaller_Analyze_PatchesSandboxInfo(t *testing.T) { proposal := testSandboxProposal() caller := newTestSandboxAgentCallerWithProposal(sandbox, httpClient, proposal) - _, err := caller.Analyze(context.Background(), proposal, testSandboxStep(), "Pod crashing", defaultSandboxSA) + _, err := caller.Analyze(context.Background(), proposal, testSandboxStep(), "Pod crashing", defaultSandboxSA, defaultSandboxTimeout) if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -548,7 +548,7 @@ func TestSandboxAgentCaller_Execute_PatchesSandboxInfo(t *testing.T) { proposal := testSandboxProposal() caller := newTestSandboxAgentCallerWithProposal(sandbox, httpClient, proposal) - _, err := caller.Execute(context.Background(), proposal, testSandboxStep(), nil, defaultSandboxSA) + _, err := caller.Execute(context.Background(), proposal, testSandboxStep(), nil, defaultSandboxSA, defaultSandboxTimeout) if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -574,7 +574,7 @@ func TestSandboxAgentCaller_Verify_PatchesSandboxInfo(t *testing.T) { proposal := testSandboxProposal() caller := newTestSandboxAgentCallerWithProposal(sandbox, httpClient, proposal) - _, err := caller.Verify(context.Background(), proposal, testSandboxStep(), nil, nil, defaultSandboxSA) + _, err := caller.Verify(context.Background(), proposal, testSandboxStep(), nil, nil, defaultSandboxSA, defaultSandboxTimeout) if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -602,7 +602,7 @@ func TestSandboxAgentCaller_SandboxInfoPatch_DoesNotBlockOnError(t *testing.T) { caller := newTestSandboxAgentCaller(sandbox, httpClient) proposal := testSandboxProposal() - _, err := caller.Analyze(context.Background(), proposal, testSandboxStep(), "test", defaultSandboxSA) + _, err := caller.Analyze(context.Background(), proposal, testSandboxStep(), "test", defaultSandboxSA, defaultSandboxTimeout) if err != nil { t.Fatalf("analysis should succeed even when sandbox info patch fails: %v", err) } From b69c6585922c04f036681a0905e9848bb1d660d4 Mon Sep 17 00:00:00 2001 From: Sakshi Patidar Date: Fri, 29 May 2026 14:37:33 +0530 Subject: [PATCH 2/3] feat: add dataSource PVC mount and IntelliAide proposal templates --- api/v1alpha1/proposal_types.go | 47 ++++- api/v1alpha1/zz_generated.deepcopy.go | 5 + controller/proposal/sandbox.go | 2 +- controller/proposal/sandbox_templates.go | 54 ++++- controller/proposal/sandbox_templates_test.go | 14 +- .../proposal/templates/analysis_query.tmpl | 26 +++ examples/setup/09-intelliaide-proposals.yaml | 188 ++++++++++++++++++ 7 files changed, 314 insertions(+), 22 deletions(-) create mode 100644 examples/setup/09-intelliaide-proposals.yaml diff --git a/api/v1alpha1/proposal_types.go b/api/v1alpha1/proposal_types.go index 56f47417..dbfc503b 100644 --- a/api/v1alpha1/proposal_types.go +++ b/api/v1alpha1/proposal_types.go @@ -270,6 +270,22 @@ func (s ProposalStep) IsZero() bool { return s.Agent == "" && s.Tools.IsZero() } +// DataSource references a pre-existing PersistentVolumeClaim containing +// input data for this proposal (e.g., must-gather bundles, diagnostic data). +// The PVC must already exist in the same namespace as the Proposal and be +// pre-populated with data before the Proposal is created. The operator +// mounts it read-only at a well-known path (/data/input) accessible to +// all skills in the sandbox pod. +type DataSource struct { + // claimName is the name of the PersistentVolumeClaim to mount. + // The PVC must exist in the same namespace as the Proposal. + // +required + // +kubebuilder:validation:MinLength=1 + // +kubebuilder:validation:MaxLength=253 + // +kubebuilder:validation:XValidation:rule="!format.dns1123Subdomain().validate(self).hasValue()",message="must be a valid DNS subdomain" + ClaimName string `json:"claimName"` +} + // ProposalSpec defines the desired state of Proposal. // // A Proposal defines the workflow shape inline, specifying which steps @@ -284,6 +300,7 @@ func (s ProposalStep) IsZero() bool { // +kubebuilder:validation:XValidation:rule="!has(oldSelf.analysis) || (has(self.analysis) && self.analysis == oldSelf.analysis)",message="analysis is immutable once set" // +kubebuilder:validation:XValidation:rule="!has(oldSelf.execution) || (has(self.execution) && self.execution == oldSelf.execution)",message="execution is immutable once set" // +kubebuilder:validation:XValidation:rule="!has(oldSelf.verification) || (has(self.verification) && self.verification == oldSelf.verification)",message="verification is immutable once set" +// +kubebuilder:validation:XValidation:rule="!has(oldSelf.dataSource) || (has(self.dataSource) && self.dataSource == oldSelf.dataSource)",message="dataSource is immutable once set" type ProposalSpec struct { // request is the user's original request, alert description, or a // description of what triggered this proposal. This text is passed to @@ -341,6 +358,15 @@ type ProposalSpec struct { // +optional Tools ToolsSpec `json:"tools,omitzero"` + // dataSource references a PVC containing pre-populated input data + // (e.g., must-gather bundles, diagnostic data). The operator mounts + // it read-only at /data/input in the sandbox pod. Skills discover + // input data at this standard location. + // + // Immutable: input data source is fixed at creation. + // +optional + DataSource *DataSource `json:"dataSource,omitzero"` + // analysis defines per-step configuration for the analysis step, // including which agent handles it and any per-step tools. // @@ -362,6 +388,18 @@ type ProposalSpec struct { // +optional Verification ProposalStep `json:"verification,omitzero"` + // timeoutMinutes sets the per-step timeout for sandbox agent calls. + // This controls how long the operator waits for the sandbox pod to + // become ready and for the agent to complete its work. Increase this + // for long-running tools (e.g., IntelliAide RCA takes 10-30 minutes). + // Defaults to 5 minutes when omitted. + // + // Mutable: can be adjusted before approving a step. + // +optional + // +kubebuilder:validation:Minimum=1 + // +kubebuilder:validation:Maximum=60 + TimeoutMinutes *int32 `json:"timeoutMinutes,omitempty"` + // revisionFeedback is the user's free-text feedback requesting changes // to the analysis. Patching this field bumps metadata.generation, which // the operator detects (generation > observedGeneration) and triggers @@ -374,15 +412,6 @@ type ProposalSpec struct { // +kubebuilder:validation:MaxLength=32768 RevisionFeedback string `json:"revisionFeedback,omitempty"` - // timeoutMinutes overrides the default sandbox operation timeout for - // this proposal. When set, all sandbox wait and HTTP client timeouts - // use this value instead of the operator default (5 minutes). - // - // Immutable: timeout policy is fixed at creation. - // +optional - // +kubebuilder:validation:Minimum=1 - // +kubebuilder:validation:Maximum=120 - TimeoutMinutes *int32 `json:"timeoutMinutes,omitzero"` } // ProposalStatus defines the observed state of Proposal. All fields are diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index f6720157..41cbe9fe 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -1263,6 +1263,11 @@ func (in *ProposalSpec) DeepCopyInto(out *ProposalSpec) { in.Analysis.DeepCopyInto(&out.Analysis) in.Execution.DeepCopyInto(&out.Execution) in.Verification.DeepCopyInto(&out.Verification) + if in.TimeoutMinutes != nil { + in, out := &in.TimeoutMinutes, &out.TimeoutMinutes + *out = new(int32) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ProposalSpec. diff --git a/controller/proposal/sandbox.go b/controller/proposal/sandbox.go index 6dfe09a6..a4e47d8e 100644 --- a/controller/proposal/sandbox.go +++ b/controller/proposal/sandbox.go @@ -99,7 +99,7 @@ func (m *SandboxManager) buildClaim(claimName, proposalName, step, templateName func (m *SandboxManager) Claim(ctx context.Context, proposalName, step, _ string) (string, error) { log := logf.FromContext(ctx) - templateName, err := EnsureAgentTemplate(ctx, m.Client, m.BaseTemplateName, m.Namespace, step, m.agent, m.llm, m.tools, m.serviceAccount) + templateName, err := EnsureAgentTemplate(ctx, m.Client, m.BaseTemplateName, m.Namespace, step, m.agent, m.llm, m.tools, nil, m.serviceAccount) if err != nil { return "", fmt.Errorf("%s: %w", ErrEnsureAgentTemplate, err) } diff --git a/controller/proposal/sandbox_templates.go b/controller/proposal/sandbox_templates.go index fcb5d3ce..1c691a3f 100644 --- a/controller/proposal/sandbox_templates.go +++ b/controller/proposal/sandbox_templates.go @@ -73,10 +73,14 @@ var sandboxTemplateGVK = schema.GroupVersionKind{ } const ( - llmCredsMountPath = "/var/run/secrets/llm-credentials" - llmCredsVolumeName = "llm-credentials" - mcpHeadersMountRoot = "/var/secrets/mcp" - mcpServersEnvVar = "LIGHTSPEED_MCP_SERVERS" + agentModeEnvVar = "LIGHTSPEED_MODE" + llmCredsMountPath = "/var/run/secrets/llm-credentials" + vertexCredsMountPath = "/var/secrets/google" + vertexCredsFileName = "credentials.json" + llmCredsVolumeName = "llm-credentials" + mcpHeadersMountRoot = "/var/secrets/mcp" + mcpServersEnvVar = "LIGHTSPEED_MCP_SERVERS" + dataSourceMountPath = "/data/input" LabelManaged = "agentic.openshift.io/managed" LabelBaseTemplate = "agentic.openshift.io/base-template" @@ -92,6 +96,7 @@ type templateHashInput struct { Skills []agenticv1alpha1.SkillsSource `json:"skills"` MCPServers []agenticv1alpha1.MCPServerConfig `json:"mcpServers,omitempty"` RequiredSecrets []agenticv1alpha1.SecretRequirement `json:"requiredSecrets,omitempty"` + DataSource *agenticv1alpha1.DataSource `json:"dataSource,omitempty"` Step string `json:"step"` BaseResourceVersion string `json:"baseRV"` ServiceAccount string `json:"serviceAccount"` @@ -103,6 +108,7 @@ func computeTemplateHash( skills []agenticv1alpha1.SkillsSource, mcpServers []agenticv1alpha1.MCPServerConfig, requiredSecrets []agenticv1alpha1.SecretRequirement, + dataSource *agenticv1alpha1.DataSource, step string, baseResourceVersion string, serviceAccount string, @@ -113,6 +119,7 @@ func computeTemplateHash( Skills: skills, MCPServers: mcpServers, RequiredSecrets: requiredSecrets, + DataSource: dataSource, Step: step, BaseResourceVersion: baseResourceVersion, ServiceAccount: serviceAccount, @@ -142,6 +149,7 @@ func EnsureAgentTemplate( agent *agenticv1alpha1.Agent, llm *agenticv1alpha1.LLMProvider, tools *agenticv1alpha1.ToolsSpec, + dataSource *agenticv1alpha1.DataSource, serviceAccount string, ) (string, error) { log := logf.FromContext(ctx).WithName("sandbox-templates") @@ -168,7 +176,7 @@ func EnsureAgentTemplate( requiredSecrets = tools.RequiredSecrets } - hash, err := computeTemplateHash(llm, agent.Spec.Model, skills, mcpServers, requiredSecrets, step, base.GetResourceVersion(), serviceAccount) + hash, err := computeTemplateHash(llm, agent.Spec.Model, skills, mcpServers, requiredSecrets, dataSource, step, base.GetResourceVersion(), serviceAccount) if err != nil { return "", fmt.Errorf("%s: %w", ErrComputeTemplateHash, err) } @@ -234,6 +242,12 @@ func EnsureAgentTemplate( } } + if dataSource != nil { + if err := patchDataSource(derived, dataSource); err != nil { + return "", fmt.Errorf("patch data source: %w", err) + } + } + if err := patchProbes(derived); err != nil { return "", fmt.Errorf("%s: %w", ErrPatchProbes, err) } @@ -616,6 +630,36 @@ func addSecretVolume(tmpl *unstructured.Unstructured, volumeName, secretName str return unstructured.SetNestedSlice(tmpl.Object, volumes, "spec", "podTemplate", "spec", "volumes") } +func addPVCVolume(tmpl *unstructured.Unstructured, volumeName, claimName string) error { + volumes, _, _ := unstructured.NestedSlice(tmpl.Object, "spec", "podTemplate", "spec", "volumes") + vol := map[string]any{ + "name": volumeName, + "persistentVolumeClaim": map[string]any{ + "claimName": claimName, + }, + } + for i, v := range volumes { + existing, ok := v.(map[string]any) + if !ok { + continue + } + if existing["name"] == volumeName { + volumes[i] = vol + return unstructured.SetNestedSlice(tmpl.Object, volumes, "spec", "podTemplate", "spec", "volumes") + } + } + volumes = append(volumes, vol) + return unstructured.SetNestedSlice(tmpl.Object, volumes, "spec", "podTemplate", "spec", "volumes") +} + +func patchDataSource(tmpl *unstructured.Unstructured, ds *agenticv1alpha1.DataSource) error { + volName := "data-source" + if err := addPVCVolume(tmpl, volName, ds.ClaimName); err != nil { + return fmt.Errorf("add data source PVC volume: %w", err) + } + return addVolumeMount(tmpl, volName, dataSourceMountPath, true) +} + func addVolumeMount(tmpl *unstructured.Unstructured, name, mountPath string, readOnly bool) error { container, containers, err := firstContainer(tmpl) if err != nil { diff --git a/controller/proposal/sandbox_templates_test.go b/controller/proposal/sandbox_templates_test.go index 4ffb3fbc..9ca39adc 100644 --- a/controller/proposal/sandbox_templates_test.go +++ b/controller/proposal/sandbox_templates_test.go @@ -77,7 +77,7 @@ func emptyTemplate() *unstructured.Unstructured { func mustHash(t *testing.T, llm *agenticv1alpha1.LLMProvider, model string, skills []agenticv1alpha1.SkillsSource, requiredSecrets []agenticv1alpha1.SecretRequirement, phase string) string { t.Helper() - h, err := computeTemplateHash(llm, model, skills, nil, requiredSecrets, phase, "", "") + h, err := computeTemplateHash(llm, model, skills, nil, requiredSecrets, nil, phase, "", "") if err != nil { t.Fatalf("computeTemplateHash: %v", err) } @@ -517,14 +517,14 @@ func TestSetEnvVar_FailsOnNoContainers(t *testing.T) { } func TestEnsureAgentTemplate_NilAgent(t *testing.T) { - _, err := EnsureAgentTemplate(context.Background(), nil, "base", "ns", "analysis", nil, testLLMProvider(agenticv1alpha1.LLMProviderGoogleCloudVertex), nil, "lightspeed-agent") + _, err := EnsureAgentTemplate(context.Background(), nil, "base", "ns", "analysis", nil, testLLMProvider(agenticv1alpha1.LLMProviderGoogleCloudVertex), nil, nil, "lightspeed-agent") if err == nil { t.Error("expected error for nil agent") } } func TestEnsureAgentTemplate_NilLLM(t *testing.T) { - _, err := EnsureAgentTemplate(context.Background(), nil, "base", "ns", "analysis", testDefaultAgent(), nil, nil, "lightspeed-agent") + _, err := EnsureAgentTemplate(context.Background(), nil, "base", "ns", "analysis", testDefaultAgent(), nil, nil, nil,"lightspeed-agent") if err == nil { t.Error("expected error for nil LLM") } @@ -627,11 +627,11 @@ func TestComputeTemplateHash_DifferentBaseResourceVersion(t *testing.T) { llm := testLLMProvider(agenticv1alpha1.LLMProviderGoogleCloudVertex) skills := []agenticv1alpha1.SkillsSource{{Image: "quay.io/test/skills:latest"}} - h1, err := computeTemplateHash(llm, "claude-opus-4-6", skills, nil, nil, "analysis", "1000", "") + h1, err := computeTemplateHash(llm, "claude-opus-4-6", skills, nil, nil, nil, "analysis", "1000", "") if err != nil { t.Fatal(err) } - h2, err := computeTemplateHash(llm, "claude-opus-4-6", skills, nil, nil, "analysis", "2000", "") + h2, err := computeTemplateHash(llm, "claude-opus-4-6", skills, nil, nil, nil, "analysis", "2000", "") if err != nil { t.Fatal(err) } @@ -645,11 +645,11 @@ func TestComputeTemplateHash_SameBaseResourceVersion(t *testing.T) { llm := testLLMProvider(agenticv1alpha1.LLMProviderGoogleCloudVertex) skills := []agenticv1alpha1.SkillsSource{{Image: "quay.io/test/skills:latest"}} - h1, err := computeTemplateHash(llm, "claude-opus-4-6", skills, nil, nil, "analysis", "1000", "") + h1, err := computeTemplateHash(llm, "claude-opus-4-6", skills, nil, nil, nil, "analysis", "1000", "") if err != nil { t.Fatal(err) } - h2, err := computeTemplateHash(llm, "claude-opus-4-6", skills, nil, nil, "analysis", "1000", "") + h2, err := computeTemplateHash(llm, "claude-opus-4-6", skills, nil, nil, nil, "analysis", "1000", "") if err != nil { t.Fatal(err) } diff --git a/controller/proposal/templates/analysis_query.tmpl b/controller/proposal/templates/analysis_query.tmpl index cdc002f0..ff6eea11 100644 --- a/controller/proposal/templates/analysis_query.tmpl +++ b/controller/proposal/templates/analysis_query.tmpl @@ -1,5 +1,31 @@ You are an analysis agent. Your job is to diagnose the problem, determine the root cause, and propose one or more remediation options. Do NOT attempt to fix, patch, or execute any changes — only analyze and propose. +## Skills + +A specialist deep-RCA pipeline is available at `/app/skills/intelliaide/SKILL.md`. + +Use it ONLY when the request calls for: +- Root cause analysis (RCA) or deeper / ML-assisted troubleshooting of a cluster issue +- Must-gather collection or analysis +- Investigating pod failures, etcd degradation, networking problems, storage issues, etc. +- Any request that explicitly mentions "deeper analysis", "deeper troubleshooting", "root cause", "RCA", "must-gather", or "IntelliAide" + +For routine inspection (checking pod/node status, listing events, summarising resource state, +describing objects), use `kubectl`/`oc` commands directly — do NOT invoke the IntelliAide pipeline. + +**Decision rule — apply before doing anything else:** +1. Read the `## Request` section below. +2. If it is a routine inspection query → proceed with `kubectl`/`oc` directly. +3. If it is a deep-RCA or troubleshooting request → read the skill file with ONE atomic command: + ``` + cat /app/skills/intelliaide/SKILL.md + ``` + If the command returns one or more paths, read the most relevant SKILL.md with `cat` + and follow its workflow **exactly** instead of the instructions below. + If no SKILL.md files are found, stop immediately and return a JSON error response — skills are required and their absence is a fatal misconfiguration. + +## Analysis requirements + For each option you propose, include: - A diagnosis with root cause and confidence level - A detailed remediation plan with specific actions diff --git a/examples/setup/09-intelliaide-proposals.yaml b/examples/setup/09-intelliaide-proposals.yaml new file mode 100644 index 00000000..d7e177d4 --- /dev/null +++ b/examples/setup/09-intelliaide-proposals.yaml @@ -0,0 +1,188 @@ +# IntelliAide RCA Proposals +# +# IntelliAide is a live-cluster / must-gather RCA pipeline that runs a 3-pass +# analysis (High → Medium → Low priority) using Claude as the orchestrator. +# +# Integration model: +# User triggers a Proposal CR → operator launches analysis sandbox +# → Claude reads /app/skills/intelliaide/SKILL.md +# → Claude calls skill scripts (extract_cluster, select_files, analyze_data, perform_rca) +# → All LLM reasoning is performed by the orchestrating Claude session +# → Claude maps results into standard diagnosis/proposal fields +# +# Workflow shape: analysis ONLY (advisory). +# IntelliAide returns diagnosis and recommendations. There is no automated +# execution or verification step — humans decide on follow-up actions. +# +# Output rendering: +# RCA findings are mapped into the standard diagnosis/proposal fields so they +# render with MarkdownText in the console UI. No custom outputSchema is used. +# +# IntelliAide triggering: +# The analysis prompt (analysis_query.tmpl) reads the request text and invokes +# the IntelliAide pipeline when the request contains RCA keywords such as +# "root cause analysis", "RCA", "deeper analysis", "must-gather", or "IntelliAide". +# All requests in this file are phrased to trigger IntelliAide automatically. +# +# Modes: +# Live mode (default) — IntelliAide reads live cluster state via kubectl/oc. +# Must-gather mode — set RUN_MUST_GATHER=1 env var, or phrase the request +# with "using must-gather" / "collect must-gather". +# +# Pre-requisites (cluster admin, one-time): +# 1. Build and push skills image: +# podman build -f Dockerfile.skills \ +# -t quay.io//intelliaide-skills:latest . +# podman push quay.io//intelliaide-skills:latest +# 2. Label openshift-lightspeed namespace (Kubernetes 1.21+ does this +# automatically via kubernetes.io/metadata.name): +# oc label namespace openshift-lightspeed \ +# kubernetes.io/metadata.name=openshift-lightspeed --overwrite +# +# Trigger: +# kubectl apply -f 09-intelliaide-proposals.yaml +# oc agentic proposal approve -n openshift-lightspeed --stage=analysis + +--- +# etcd RCA — etcd member health / leader election failures +apiVersion: agentic.openshift.io/v1alpha1 +kind: Proposal +metadata: + name: intelliaide-rca-etcd + namespace: openshift-lightspeed + labels: + agentic.openshift.io/source: intelliaide + agentic.openshift.io/component: etcd + agentic.openshift.io/mode: live +spec: + request: | + Perform root cause analysis for the following issue: + One or more etcd pods are not ready, or etcd endpoint health is degraded + in namespace openshift-etcd. Investigate the root cause using live cluster + data and provide remediation options. + + targetNamespaces: + - openshift-etcd + + # IntelliAide RCA takes 10-30 minutes; override the default 5-min timeout. + timeoutMinutes: 30 + + tools: + skills: + - image: image-registry.openshift-image-registry.svc:5000/openshift-lightspeed/lightspeed-skills:latest + + analysis: + agent: smart + +--- +# Cluster update failure RCA +apiVersion: agentic.openshift.io/v1alpha1 +kind: Proposal +metadata: + name: intelliaide-rca-update-failure + namespace: openshift-lightspeed + labels: + agentic.openshift.io/source: intelliaide + agentic.openshift.io/component: cluster-version-operator + agentic.openshift.io/mode: live +spec: + request: | + Perform root cause analysis for the following issue: + OpenShift cluster update has stalled or failed. ClusterVersion reports + Progressing=True with no forward progress, or Degraded=True. Investigate + the root cause using live cluster data and recommend remediation steps. + + targetNamespaces: + - openshift-cluster-version + - openshift-machine-config-operator + + timeoutMinutes: 30 + + tools: + skills: + - image: image-registry.openshift-image-registry.svc:5000/openshift-lightspeed/lightspeed-skills:latest + + analysis: + agent: smart + +--- +# Live-cluster RCA — intermittent router/ingress latency spikes +# +# Tricky because the symptoms (HTTP 504s, slow responses) appear at the +# application layer but the underlying cause is almost always one of: +# - HAProxy router thread saturation from a single high-fan-out Service +# - A misconfigured keepalive / timeout on one backend pool causing cascading +# connection exhaustion across unrelated routes +# - An mTLS re-handshake storm triggered by certificate rotation on a subset +# of pods, which the router cannot distinguish from normal slow backends +# +# IntelliAide runs in LIVE mode so it reads current router metrics, endpoint +# slice states, and HAProxy stats in real time without collecting a must-gather. +apiVersion: agentic.openshift.io/v1alpha1 +kind: Proposal +metadata: + name: intelliaide-rca-router-latency + namespace: openshift-lightspeed + labels: + agentic.openshift.io/source: intelliaide + agentic.openshift.io/component: ingress + agentic.openshift.io/mode: live +spec: + request: | + Perform root cause analysis for the following issue: + Applications behind the OpenShift router are intermittently experiencing + HTTP 504 Gateway Timeout and elevated P99 latency (>2 s) during normal + business hours, even though individual pod health checks report healthy. + The issue is not reproducible on every request and affects multiple routes + in the same namespace. No recent changes were made to the Deployments or + Services involved. Investigate the root cause using live cluster data and + provide remediation options. + + targetNamespaces: + - openshift-ingress + - openshift-ingress-operator + + timeoutMinutes: 30 + + tools: + skills: + - image: image-registry.openshift-image-registry.svc:5000/openshift-lightspeed/lightspeed-skills:latest + + analysis: + agent: smart + +--- +# General RCA — operator-managed workload degradation (no dedicated template) +# +# Use this as a starting point when no specific proposal template fits. +# Covers scenarios such as: Deployment rollout stuck, pods CrashLoopBackOff, +# OOMKill cycles, PVC mount failures, or admission webhook rejections that +# are not caused by a single obvious misconfiguration. IntelliAide examines +# the affected namespace holistically and correlates events, logs, and +# resource state across the 3-pass priority model. +apiVersion: agentic.openshift.io/v1alpha1 +kind: Proposal +metadata: + name: intelliaide-rca-general + namespace: openshift-lightspeed + labels: + agentic.openshift.io/source: intelliaide + agentic.openshift.io/mode: live +spec: + request: | + Perform a deeper root cause analysis for the following issue: + One or more workloads in the cluster are degraded or unavailable and the + cause is not immediately obvious from describe output or recent events. + Use IntelliAide to collect and correlate live cluster data across + namespaces, then identify the root cause and recommend remediation steps. + + targetNamespaces: [] # fill in the affected namespace(s) before applying + + timeoutMinutes: 30 + + tools: + skills: + - image: image-registry.openshift-image-registry.svc:5000/openshift-lightspeed/lightspeed-skills:latest + + analysis: + agent: smart From 682b7fa6f6f4388f70ccd74a6bc15cc719e21871 Mon Sep 17 00:00:00 2001 From: Sakshi Patidar Date: Wed, 3 Jun 2026 17:12:37 +0530 Subject: [PATCH 3/3] fix: per-step timeoutMinutes, tools.dataSource, and review follow-ups --- .ai/spec/how/reconciler.md | 2 +- api/v1alpha1/proposal_types.go | 56 +++--- api/v1alpha1/tools_types.go | 17 +- api/v1alpha1/zz_generated.deepcopy.go | 21 +- .../bases/agentic.openshift.io_proposals.yaml | 186 +++++++++++++----- config/rbac/role.yaml | 8 - controller/proposal/client_test.go | 62 ++++++ controller/proposal/handlers.go | 10 +- controller/proposal/reconciler_test.go | 54 ++++- controller/proposal/resolve.go | 13 +- controller/proposal/sandbox.go | 2 +- controller/proposal/sandbox_agent.go | 21 +- controller/proposal/sandbox_agent_test.go | 84 ++++++-- controller/proposal/sandbox_templates.go | 15 +- controller/proposal/sandbox_templates_test.go | 90 ++++++++- .../proposal/templates/analysis_query.tmpl | 26 --- examples/setup/09-intelliaide-proposals.yaml | 73 +++++-- 17 files changed, 551 insertions(+), 189 deletions(-) diff --git a/.ai/spec/how/reconciler.md b/.ai/spec/how/reconciler.md index 62bd0997..c793b2c7 100644 --- a/.ai/spec/how/reconciler.md +++ b/.ai/spec/how/reconciler.md @@ -171,7 +171,7 @@ Audience: AI agents. Behavioral rules and phase semantics live in **what/** spec - **`SandboxProvider`:** Swappable claim/wait/release (tests can fake). Implementations: `SandboxManager` (sandbox-claim mode), `BarePodManager` (bare-pod mode). `SetStep` provides resolved step config before each `Claim` call. - **`PodSpecBuilder`:** Shared pod-spec assembly. Produces typed `corev1.PodSpec` from image + resolved step config. Used directly by `BarePodManager`; shared helper functions also used by `EnsureAgentTemplate` (unstructured path). - **`resolveProposal`:** Produces `resolvedWorkflow` with cached `Agent` + `LLMProvider` per name; applies per-stage agent overrides from `ProposalApproval` via `getStageOverrideAgent`; `Execution`/`Verification` steps nil when corresponding spec sections are zero. -- **`EnsureAgentTemplate`:** Deterministic derived `SandboxTemplate` name from hash of LLM spec, model, skills, MCP servers, required secrets, step, and **base template resourceVersion**. Patches pod template env/volumes for credentials, Vertex/Bedrock/Azure extras, skills image/paths, and MCP JSON env. GC older templates labeled for same agent+step. +- **`EnsureAgentTemplate`:** Deterministic derived `SandboxTemplate` name from hash of LLM spec, model, skills, MCP servers, required secrets, dataSource PVC, step, and **base template resourceVersion**. `dataSource` is extracted from `tools.DataSource` (set in `ToolsSpec`). Patches pod template env/volumes for credentials, Vertex/Bedrock/Azure extras, skills image/paths, MCP JSON env, `LIGHTSPEED_MODE`, and optional `/data/input` PVC mount. GC older templates labeled for same agent+step. --- diff --git a/api/v1alpha1/proposal_types.go b/api/v1alpha1/proposal_types.go index dbfc503b..869b3375 100644 --- a/api/v1alpha1/proposal_types.go +++ b/api/v1alpha1/proposal_types.go @@ -264,10 +264,22 @@ type ProposalStep struct { // for this step. Use this when different steps need different skills. // +optional Tools ToolsSpec `json:"tools,omitzero"` + + // timeoutMinutes sets the timeout for this step's sandbox agent call. + // This controls how long the operator waits for the sandbox pod to + // become ready and for the agent to complete its work. Increase this + // for long-running tools (e.g., IntelliAide RCA takes 10-30 minutes). + // Defaults to 5 minutes when omitted. + // + // Mutable: can be adjusted before approving a step. + // +optional + // +kubebuilder:validation:Minimum=1 + // +kubebuilder:validation:Maximum=60 + TimeoutMinutes int32 `json:"timeoutMinutes,omitempty"` } func (s ProposalStep) IsZero() bool { - return s.Agent == "" && s.Tools.IsZero() + return s.Agent == "" && s.Tools.IsZero() && s.TimeoutMinutes == 0 } // DataSource references a pre-existing PersistentVolumeClaim containing @@ -283,7 +295,11 @@ type DataSource struct { // +kubebuilder:validation:MinLength=1 // +kubebuilder:validation:MaxLength=253 // +kubebuilder:validation:XValidation:rule="!format.dns1123Subdomain().validate(self).hasValue()",message="must be a valid DNS subdomain" - ClaimName string `json:"claimName"` + ClaimName string `json:"claimName,omitempty"` +} + +func (d DataSource) IsZero() bool { + return d.ClaimName == "" } // ProposalSpec defines the desired state of Proposal. @@ -297,10 +313,9 @@ type DataSource struct { // +kubebuilder:validation:XValidation:rule="!has(oldSelf.analysisOutput) || (has(self.analysisOutput) && self.analysisOutput == oldSelf.analysisOutput)",message="analysisOutput is immutable once set" // +kubebuilder:validation:XValidation:rule="!has(self.analysisOutput) || self.analysisOutput.mode != 'Minimal' || (!has(self.execution) && !has(self.verification))",message="analysisOutput mode Minimal is only allowed for analysis-only proposals (no execution or verification steps)" // +kubebuilder:validation:XValidation:rule="!has(oldSelf.tools) || (has(self.tools) && self.tools == oldSelf.tools)",message="tools is immutable once set" -// +kubebuilder:validation:XValidation:rule="!has(oldSelf.analysis) || (has(self.analysis) && self.analysis == oldSelf.analysis)",message="analysis is immutable once set" -// +kubebuilder:validation:XValidation:rule="!has(oldSelf.execution) || (has(self.execution) && self.execution == oldSelf.execution)",message="execution is immutable once set" -// +kubebuilder:validation:XValidation:rule="!has(oldSelf.verification) || (has(self.verification) && self.verification == oldSelf.verification)",message="verification is immutable once set" -// +kubebuilder:validation:XValidation:rule="!has(oldSelf.dataSource) || (has(self.dataSource) && self.dataSource == oldSelf.dataSource)",message="dataSource is immutable once set" +// +kubebuilder:validation:XValidation:rule="!has(oldSelf.analysis) || (has(self.analysis) && self.analysis.agent == oldSelf.analysis.agent && self.analysis.tools == oldSelf.analysis.tools)",message="analysis agent and tools are immutable once set" +// +kubebuilder:validation:XValidation:rule="!has(oldSelf.execution) || (has(self.execution) && self.execution.agent == oldSelf.execution.agent && self.execution.tools == oldSelf.execution.tools)",message="execution agent and tools are immutable once set" +// +kubebuilder:validation:XValidation:rule="!has(oldSelf.verification) || (has(self.verification) && self.verification.agent == oldSelf.verification.agent && self.verification.tools == oldSelf.verification.tools)",message="verification agent and tools are immutable once set" type ProposalSpec struct { // request is the user's original request, alert description, or a // description of what triggered this proposal. This text is passed to @@ -348,9 +363,10 @@ type ProposalSpec struct { AnalysisOutput AnalysisOutput `json:"analysisOutput,omitzero"` // tools defines the default tools for all steps: skills images, - // MCP servers, and required secrets. Per-step tools - // (analysis.tools, execution.tools, verification.tools) replace - // this default for individual steps. + // MCP servers, required secrets, and an optional dataSource PVC. + // Per-step tools (analysis.tools, execution.tools, verification.tools) + // replace this default for individual steps, so a dataSource set in + // spec.analysis.tools is mounted only in the analysis sandbox. // // Immutable: the skills and secrets available to the agent are // fixed at creation. Changing tools mid-flight could violate the @@ -358,15 +374,6 @@ type ProposalSpec struct { // +optional Tools ToolsSpec `json:"tools,omitzero"` - // dataSource references a PVC containing pre-populated input data - // (e.g., must-gather bundles, diagnostic data). The operator mounts - // it read-only at /data/input in the sandbox pod. Skills discover - // input data at this standard location. - // - // Immutable: input data source is fixed at creation. - // +optional - DataSource *DataSource `json:"dataSource,omitzero"` - // analysis defines per-step configuration for the analysis step, // including which agent handles it and any per-step tools. // @@ -388,18 +395,6 @@ type ProposalSpec struct { // +optional Verification ProposalStep `json:"verification,omitzero"` - // timeoutMinutes sets the per-step timeout for sandbox agent calls. - // This controls how long the operator waits for the sandbox pod to - // become ready and for the agent to complete its work. Increase this - // for long-running tools (e.g., IntelliAide RCA takes 10-30 minutes). - // Defaults to 5 minutes when omitted. - // - // Mutable: can be adjusted before approving a step. - // +optional - // +kubebuilder:validation:Minimum=1 - // +kubebuilder:validation:Maximum=60 - TimeoutMinutes *int32 `json:"timeoutMinutes,omitempty"` - // revisionFeedback is the user's free-text feedback requesting changes // to the analysis. Patching this field bumps metadata.generation, which // the operator detects (generation > observedGeneration) and triggers @@ -411,7 +406,6 @@ type ProposalSpec struct { // +kubebuilder:validation:MinLength=1 // +kubebuilder:validation:MaxLength=32768 RevisionFeedback string `json:"revisionFeedback,omitempty"` - } // ProposalStatus defines the observed state of Proposal. All fields are diff --git a/api/v1alpha1/tools_types.go b/api/v1alpha1/tools_types.go index 4955d28e..bd9135d6 100644 --- a/api/v1alpha1/tools_types.go +++ b/api/v1alpha1/tools_types.go @@ -109,12 +109,15 @@ type SecretRequirement struct { } // ToolsSpec defines the tools available to an agent in its sandbox pod. -// This includes skills images, MCP servers, and required secrets. +// This includes skills images, MCP servers, required secrets, and an +// optional data source PVC. // // ToolsSpec is specified on a Proposal either as a shared default // (spec.tools) or per-step (spec.analysis.tools, spec.execution.tools, // spec.verification.tools). Per-step tools replace the shared default -// for that step. +// for that step, so a dataSource set in spec.analysis.tools is mounted +// only in the analysis sandbox, while one in spec.tools is mounted in +// every step that does not override tools. // // +kubebuilder:validation:MinProperties=1 type ToolsSpec struct { @@ -147,8 +150,16 @@ type ToolsSpec struct { // +kubebuilder:validation:MinItems=1 // +kubebuilder:validation:MaxItems=20 RequiredSecrets []SecretRequirement `json:"requiredSecrets,omitempty"` + + // dataSource references a pre-existing PersistentVolumeClaim containing + // input data for this step (e.g., must-gather bundles, diagnostic data). + // The PVC must already exist in the same namespace as the Proposal and be + // pre-populated with data before the Proposal is created. The operator + // mounts it read-only at /data/input in the sandbox pod. + // +optional + DataSource DataSource `json:"dataSource,omitzero"` } func (t ToolsSpec) IsZero() bool { - return len(t.Skills) == 0 && len(t.MCPServers) == 0 && len(t.RequiredSecrets) == 0 + return len(t.Skills) == 0 && len(t.MCPServers) == 0 && len(t.RequiredSecrets) == 0 && t.DataSource.IsZero() } diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 41cbe9fe..8016b467 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -544,6 +544,21 @@ func (in *AzureOpenAIConfig) DeepCopy() *AzureOpenAIConfig { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DataSource) DeepCopyInto(out *DataSource) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DataSource. +func (in *DataSource) DeepCopy() *DataSource { + if in == nil { + return nil + } + out := new(DataSource) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *DiagnosisResult) DeepCopyInto(out *DiagnosisResult) { *out = *in @@ -1263,11 +1278,6 @@ func (in *ProposalSpec) DeepCopyInto(out *ProposalSpec) { in.Analysis.DeepCopyInto(&out.Analysis) in.Execution.DeepCopyInto(&out.Execution) in.Verification.DeepCopyInto(&out.Verification) - if in.TimeoutMinutes != nil { - in, out := &in.TimeoutMinutes, &out.TimeoutMinutes - *out = new(int32) - **out = **in - } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ProposalSpec. @@ -1606,6 +1616,7 @@ func (in *ToolsSpec) DeepCopyInto(out *ToolsSpec) { *out = make([]SecretRequirement, len(*in)) copy(*out, *in) } + out.DataSource = in.DataSource } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ToolsSpec. diff --git a/config/crd/bases/agentic.openshift.io_proposals.yaml b/config/crd/bases/agentic.openshift.io_proposals.yaml index 3dc17f0d..aede511e 100644 --- a/config/crd/bases/agentic.openshift.io_proposals.yaml +++ b/config/crd/bases/agentic.openshift.io_proposals.yaml @@ -83,12 +83,46 @@ spec: - message: 'must be a valid DNS subdomain: lowercase alphanumeric characters, hyphens, and dots' rule: '!format.dns1123Subdomain().validate(self).hasValue()' + timeoutMinutes: + description: |- + timeoutMinutes sets the timeout for this step's sandbox agent call. + This controls how long the operator waits for the sandbox pod to + become ready and for the agent to complete its work. Increase this + for long-running tools (e.g., IntelliAide RCA takes 10-30 minutes). + Defaults to 5 minutes when omitted. + + Mutable: can be adjusted before approving a step. + format: int32 + maximum: 60 + minimum: 1 + type: integer tools: description: |- tools provides per-step tools that replace the shared spec.tools for this step. Use this when different steps need different skills. minProperties: 1 properties: + dataSource: + description: |- + dataSource references a pre-existing PersistentVolumeClaim containing + input data for this step (e.g., must-gather bundles, diagnostic data). + The PVC must already exist in the same namespace as the Proposal and be + pre-populated with data before the Proposal is created. The operator + mounts it read-only at /data/input in the sandbox pod. + properties: + claimName: + description: |- + claimName is the name of the PersistentVolumeClaim to mount. + The PVC must exist in the same namespace as the Proposal. + maxLength: 253 + minLength: 1 + type: string + x-kubernetes-validations: + - message: must be a valid DNS subdomain + rule: '!format.dns1123Subdomain().validate(self).hasValue()' + required: + - claimName + type: object mcpServers: description: |- mcpServers defines external MCP (Model Context Protocol) servers the @@ -481,28 +515,6 @@ spec: x-kubernetes-validations: - message: schema is required when mode is Minimal rule: self.mode != 'Minimal' || has(self.schema) - dataSource: - description: |- - dataSource references a PVC containing pre-populated input data - (e.g., must-gather bundles, diagnostic data). The operator mounts - it read-only at /data/input in the sandbox pod. Skills discover - input data at this standard location. - - Immutable: input data source is fixed at creation. - properties: - claimName: - description: |- - claimName is the name of the PersistentVolumeClaim to mount. - The PVC must exist in the same namespace as the Proposal. - maxLength: 253 - minLength: 1 - type: string - x-kubernetes-validations: - - message: must be a valid DNS subdomain - rule: '!format.dns1123Subdomain().validate(self).hasValue()' - required: - - claimName - type: object execution: description: |- execution defines per-step configuration for the execution step. @@ -522,12 +534,46 @@ spec: - message: 'must be a valid DNS subdomain: lowercase alphanumeric characters, hyphens, and dots' rule: '!format.dns1123Subdomain().validate(self).hasValue()' + timeoutMinutes: + description: |- + timeoutMinutes sets the timeout for this step's sandbox agent call. + This controls how long the operator waits for the sandbox pod to + become ready and for the agent to complete its work. Increase this + for long-running tools (e.g., IntelliAide RCA takes 10-30 minutes). + Defaults to 5 minutes when omitted. + + Mutable: can be adjusted before approving a step. + format: int32 + maximum: 60 + minimum: 1 + type: integer tools: description: |- tools provides per-step tools that replace the shared spec.tools for this step. Use this when different steps need different skills. minProperties: 1 properties: + dataSource: + description: |- + dataSource references a pre-existing PersistentVolumeClaim containing + input data for this step (e.g., must-gather bundles, diagnostic data). + The PVC must already exist in the same namespace as the Proposal and be + pre-populated with data before the Proposal is created. The operator + mounts it read-only at /data/input in the sandbox pod. + properties: + claimName: + description: |- + claimName is the name of the PersistentVolumeClaim to mount. + The PVC must exist in the same namespace as the Proposal. + maxLength: 253 + minLength: 1 + type: string + x-kubernetes-validations: + - message: must be a valid DNS subdomain + rule: '!format.dns1123Subdomain().validate(self).hasValue()' + required: + - claimName + type: object mcpServers: description: |- mcpServers defines external MCP (Model Context Protocol) servers the @@ -935,31 +981,40 @@ spec: x-kubernetes-validations: - message: each namespace must be a valid DNS label rule: self.all(ns, !format.dns1123Label().validate(ns).hasValue()) - timeoutMinutes: - description: |- - timeoutMinutes sets the per-step timeout for sandbox agent calls. - This controls how long the operator waits for the sandbox pod to - become ready and for the agent to complete its work. Increase this - for long-running tools (e.g., IntelliAide RCA takes 10-30 minutes). - Defaults to 5 minutes when omitted. - - Mutable: can be adjusted before approving a step. - format: int32 - maximum: 60 - minimum: 1 - type: integer tools: description: |- tools defines the default tools for all steps: skills images, - MCP servers, and required secrets. Per-step tools - (analysis.tools, execution.tools, verification.tools) replace - this default for individual steps. + MCP servers, required secrets, and an optional dataSource PVC. + Per-step tools (analysis.tools, execution.tools, verification.tools) + replace this default for individual steps, so a dataSource set in + spec.analysis.tools is mounted only in the analysis sandbox. Immutable: the skills and secrets available to the agent are fixed at creation. Changing tools mid-flight could violate the assumptions of an in-progress analysis or execution. minProperties: 1 properties: + dataSource: + description: |- + dataSource references a pre-existing PersistentVolumeClaim containing + input data for this step (e.g., must-gather bundles, diagnostic data). + The PVC must already exist in the same namespace as the Proposal and be + pre-populated with data before the Proposal is created. The operator + mounts it read-only at /data/input in the sandbox pod. + properties: + claimName: + description: |- + claimName is the name of the PersistentVolumeClaim to mount. + The PVC must exist in the same namespace as the Proposal. + maxLength: 253 + minLength: 1 + type: string + x-kubernetes-validations: + - message: must be a valid DNS subdomain + rule: '!format.dns1123Subdomain().validate(self).hasValue()' + required: + - claimName + type: object mcpServers: description: |- mcpServers defines external MCP (Model Context Protocol) servers the @@ -1329,12 +1384,46 @@ spec: - message: 'must be a valid DNS subdomain: lowercase alphanumeric characters, hyphens, and dots' rule: '!format.dns1123Subdomain().validate(self).hasValue()' + timeoutMinutes: + description: |- + timeoutMinutes sets the timeout for this step's sandbox agent call. + This controls how long the operator waits for the sandbox pod to + become ready and for the agent to complete its work. Increase this + for long-running tools (e.g., IntelliAide RCA takes 10-30 minutes). + Defaults to 5 minutes when omitted. + + Mutable: can be adjusted before approving a step. + format: int32 + maximum: 60 + minimum: 1 + type: integer tools: description: |- tools provides per-step tools that replace the shared spec.tools for this step. Use this when different steps need different skills. minProperties: 1 properties: + dataSource: + description: |- + dataSource references a pre-existing PersistentVolumeClaim containing + input data for this step (e.g., must-gather bundles, diagnostic data). + The PVC must already exist in the same namespace as the Proposal and be + pre-populated with data before the Proposal is created. The operator + mounts it read-only at /data/input in the sandbox pod. + properties: + claimName: + description: |- + claimName is the name of the PersistentVolumeClaim to mount. + The PVC must exist in the same namespace as the Proposal. + maxLength: 253 + minLength: 1 + type: string + x-kubernetes-validations: + - message: must be a valid DNS subdomain + rule: '!format.dns1123Subdomain().validate(self).hasValue()' + required: + - claimName + type: object mcpServers: description: |- mcpServers defines external MCP (Model Context Protocol) servers the @@ -1710,18 +1799,15 @@ spec: || (!has(self.execution) && !has(self.verification))' - message: tools is immutable once set rule: '!has(oldSelf.tools) || (has(self.tools) && self.tools == oldSelf.tools)' - - message: analysis is immutable once set - rule: '!has(oldSelf.analysis) || (has(self.analysis) && self.analysis - == oldSelf.analysis)' - - message: execution is immutable once set - rule: '!has(oldSelf.execution) || (has(self.execution) && self.execution - == oldSelf.execution)' - - message: verification is immutable once set - rule: '!has(oldSelf.verification) || (has(self.verification) && self.verification - == oldSelf.verification)' - - message: dataSource is immutable once set - rule: '!has(oldSelf.dataSource) || (has(self.dataSource) && self.dataSource - == oldSelf.dataSource)' + - message: analysis agent and tools are immutable once set + rule: '!has(oldSelf.analysis) || (has(self.analysis) && self.analysis.agent + == oldSelf.analysis.agent && self.analysis.tools == oldSelf.analysis.tools)' + - message: execution agent and tools are immutable once set + rule: '!has(oldSelf.execution) || (has(self.execution) && self.execution.agent + == oldSelf.execution.agent && self.execution.tools == oldSelf.execution.tools)' + - message: verification agent and tools are immutable once set + rule: '!has(oldSelf.verification) || (has(self.verification) && self.verification.agent + == oldSelf.verification.agent && self.verification.tools == oldSelf.verification.tools)' status: description: status defines the observed state of Proposal. minProperties: 1 diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 795513c6..718b2ff3 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -33,14 +33,6 @@ rules: - create - delete - get -- apiGroups: - - "" - resources: - - persistentvolumeclaims - verbs: - - get - - list - - watch - apiGroups: - agentic.openshift.io resources: diff --git a/controller/proposal/client_test.go b/controller/proposal/client_test.go index dee0cf5a..1515aef5 100644 --- a/controller/proposal/client_test.go +++ b/controller/proposal/client_test.go @@ -6,6 +6,7 @@ import ( "net/http" "net/http/httptest" "testing" + "time" agenticv1alpha1 "github.com/openshift/lightspeed-agentic-operator/api/v1alpha1" ) @@ -179,3 +180,64 @@ func TestAgentHTTPClient_RunWithContext(t *testing.T) { t.Fatalf("unexpected error: %v", err) } } + +// TestAgentHTTPClient_TimeoutMs_CustomTimeout verifies that a positive timeout +// is serialized as timeout_ms in the request body sent to the agent. +func TestAgentHTTPClient_TimeoutMs_CustomTimeout(t *testing.T) { + const customTimeout = 10 * time.Second + var gotTimeoutMs *int64 + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + var req agentRunRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + t.Fatalf("decode request: %v", err) + } + gotTimeoutMs = req.TimeoutMs + w.Header().Set("Content-Type", "application/json") + w.Write([]byte(`{"success": true}`)) + })) + defer server.Close() + + c := NewAgentHTTPClient(server.URL, customTimeout) + if _, err := c.Run(context.Background(), "", "ping", nil, nil); err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if gotTimeoutMs == nil { + t.Fatal("timeout_ms was not sent in request") + } + wantMs := int64(customTimeout / time.Millisecond) + if *gotTimeoutMs != wantMs { + t.Errorf("timeout_ms = %d, want %d", *gotTimeoutMs, wantMs) + } +} + +// TestAgentHTTPClient_TimeoutMs_ZeroFallsBackToDefault verifies that timeout <= 0 +// falls back to defaultSandboxTimeout and is reflected in timeout_ms. +func TestAgentHTTPClient_TimeoutMs_ZeroFallsBackToDefault(t *testing.T) { + var gotTimeoutMs *int64 + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + var req agentRunRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + t.Fatalf("decode request: %v", err) + } + gotTimeoutMs = req.TimeoutMs + w.Header().Set("Content-Type", "application/json") + w.Write([]byte(`{"success": true}`)) + })) + defer server.Close() + + c := NewAgentHTTPClient(server.URL, 0) // zero should fall back to defaultSandboxTimeout + if _, err := c.Run(context.Background(), "", "ping", nil, nil); err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if gotTimeoutMs == nil { + t.Fatal("timeout_ms was not sent in request") + } + wantMs := int64(defaultSandboxTimeout / time.Millisecond) + if *gotTimeoutMs != wantMs { + t.Errorf("timeout_ms = %d, want %d (defaultSandboxTimeout)", *gotTimeoutMs, wantMs) + } +} diff --git a/controller/proposal/handlers.go b/controller/proposal/handlers.go index 0db81c4f..99e1d1eb 100644 --- a/controller/proposal/handlers.go +++ b/controller/proposal/handlers.go @@ -85,7 +85,7 @@ func (r *ProposalReconciler) handleAnalysis( return ctrl.Result{}, fmt.Errorf("%s: %w", ErrUpdateToAnalyzing, err) } - timeout := proposalTimeout(proposal) + timeout := stepTimeout(resolved.Analysis) analysisResult, err := r.Agent.Analyze(ctx, proposal, resolved.Analysis, proposal.Spec.Request, defaultSandboxSA, timeout) if err != nil { return r.failStep(ctx, proposal, agenticv1alpha1.ProposalConditionAnalyzed, err) @@ -152,7 +152,7 @@ func (r *ProposalReconciler) handleRevision( revisionSuffix := buildRevisionContext(proposal) requestWithRevision := proposal.Spec.Request + "\n\n" + revisionSuffix - timeout := proposalTimeout(proposal) + timeout := stepTimeout(resolved.Analysis) analysisResult, err := r.Agent.Analyze(ctx, proposal, resolved.Analysis, requestWithRevision, defaultSandboxSA, timeout) if err != nil { return r.failStep(ctx, proposal, agenticv1alpha1.ProposalConditionAnalyzed, err) @@ -272,7 +272,7 @@ func (r *ProposalReconciler) handleExecution( return ctrl.Result{}, fmt.Errorf("%s: %w", ErrUpdateToExecuting, err) } - timeout := proposalTimeout(proposal) + timeout := stepTimeout(*resolved.Execution) execResult, err := r.Agent.Execute(ctx, proposal, *resolved.Execution, selectedOption, execSA, timeout) if err != nil { return r.failStep(ctx, proposal, agenticv1alpha1.ProposalConditionExecuted, err) @@ -390,7 +390,7 @@ func (r *ProposalReconciler) handleVerification( } } - timeout := proposalTimeout(proposal) + timeout := stepTimeout(*resolved.Verification) verifyResult, err := r.Agent.Verify(ctx, proposal, *resolved.Verification, selectedOption, execOutput, defaultSandboxSA, timeout) if err != nil { return r.failStep(ctx, proposal, agenticv1alpha1.ProposalConditionVerified, err) @@ -590,7 +590,7 @@ func (r *ProposalReconciler) handleEscalation( } escalationText := buildEscalationRequest(proposal) - timeout := proposalTimeout(proposal) + timeout := stepTimeout(step) escalationResult, err := r.Agent.Escalate(ctx, proposal, step, escalationText, defaultSandboxSA, timeout) if err != nil { return r.failStep(ctx, proposal, agenticv1alpha1.ProposalConditionEscalated, err) diff --git a/controller/proposal/reconciler_test.go b/controller/proposal/reconciler_test.go index 8eec3125..2acf6620 100644 --- a/controller/proposal/reconciler_test.go +++ b/controller/proposal/reconciler_test.go @@ -31,6 +31,11 @@ type testAgentCaller struct { executeResult *ExecutionOutput verifyResult *VerificationOutput escalateResult *EscalationOutput + + lastAnalyzeTimeout time.Duration + lastExecuteTimeout time.Duration + lastVerifyTimeout time.Duration + lastEscalateTimeout time.Duration } func newTestAgentCaller() *testAgentCaller { @@ -42,25 +47,29 @@ func newTestAgentCaller() *testAgentCaller { return &testAgentCaller{analyzeResult: a, executeResult: e, verifyResult: v, escalateResult: esc} } -func (ta *testAgentCaller) Analyze(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, _ string, _ time.Duration) (*AnalysisOutput, error) { +func (ta *testAgentCaller) Analyze(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, _ string, timeout time.Duration) (*AnalysisOutput, error) { + ta.lastAnalyzeTimeout = timeout if ta.analyzeErr != nil { return nil, ta.analyzeErr } return ta.analyzeResult, nil } -func (ta *testAgentCaller) Execute(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, _ string, _ time.Duration) (*ExecutionOutput, error) { +func (ta *testAgentCaller) Execute(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, _ string, timeout time.Duration) (*ExecutionOutput, error) { + ta.lastExecuteTimeout = timeout if ta.executeErr != nil { return nil, ta.executeErr } return ta.executeResult, nil } -func (ta *testAgentCaller) Verify(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, _ *ExecutionOutput, _ string, _ time.Duration) (*VerificationOutput, error) { +func (ta *testAgentCaller) Verify(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ *agenticv1alpha1.RemediationOption, _ *ExecutionOutput, _ string, timeout time.Duration) (*VerificationOutput, error) { + ta.lastVerifyTimeout = timeout if ta.verifyErr != nil { return nil, ta.verifyErr } return ta.verifyResult, nil } -func (ta *testAgentCaller) Escalate(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, _ string, _ time.Duration) (*EscalationOutput, error) { +func (ta *testAgentCaller) Escalate(_ context.Context, _ *agenticv1alpha1.Proposal, _ resolvedStep, _ string, _ string, timeout time.Duration) (*EscalationOutput, error) { + ta.lastEscalateTimeout = timeout if ta.escalateErr != nil { return nil, ta.escalateErr } @@ -496,3 +505,40 @@ func TestHandleSuspension(t *testing.T) { }) } } + +// TestReconcile_PropagatesStepTimeout verifies that timeoutMinutes set on a +// ProposalStep is resolved and forwarded to the AgentCaller as time.Duration. +func TestReconcile_PropagatesStepTimeout(t *testing.T) { + const wantMinutes int32 = 30 + + scheme := testScheme() + proposal := &agenticv1alpha1.Proposal{ + ObjectMeta: metav1.ObjectMeta{Name: "timeout-check", Namespace: "default"}, + Spec: agenticv1alpha1.ProposalSpec{ + Request: "Pod crashing", + Tools: testTools(), + Analysis: agenticv1alpha1.ProposalStep{ + Agent: "default", + TimeoutMinutes: wantMinutes, + }, + Execution: agenticv1alpha1.ProposalStep{Agent: "default"}, + Verification: agenticv1alpha1.ProposalStep{Agent: "default"}, + }, + } + + objs := append([]client.Object{proposal}, defaultObjects()...) + fc := fake.NewClientBuilder().WithScheme(scheme).WithObjects(objs...). + WithStatusSubresource(proposal, &agenticv1alpha1.AnalysisResult{}, &agenticv1alpha1.ExecutionResult{}, &agenticv1alpha1.VerificationResult{}, &agenticv1alpha1.EscalationResult{}).Build() + + caller := newTestAgentCaller() + r := &ProposalReconciler{Client: fc, Agent: caller, Namespace: "default"} + + if _, err := reconcileOnce(r, "timeout-check"); err != nil { + t.Fatalf("reconcile: %v", err) + } + + want := time.Duration(wantMinutes) * time.Minute + if caller.lastAnalyzeTimeout != want { + t.Errorf("Analyze timeout = %v, want %v", caller.lastAnalyzeTimeout, want) + } +} diff --git a/controller/proposal/resolve.go b/controller/proposal/resolve.go index 56c1fff2..3a754d66 100644 --- a/controller/proposal/resolve.go +++ b/controller/proposal/resolve.go @@ -19,9 +19,10 @@ const ( ) type resolvedStep struct { - Agent *agenticv1alpha1.Agent - LLM *agenticv1alpha1.LLMProvider - Tools *agenticv1alpha1.ToolsSpec + Agent *agenticv1alpha1.Agent + LLM *agenticv1alpha1.LLMProvider + Tools *agenticv1alpha1.ToolsSpec + TimeoutMinutes int32 } type resolvedWorkflow struct { @@ -80,14 +81,14 @@ func resolveProposal(ctx context.Context, c client.Client, proposal *agenticv1al if err != nil { return nil, fmt.Errorf("%s: %w", ErrResolveAnalysisStep, err) } - resolved.Analysis = resolvedStep{Agent: agent, LLM: llm, Tools: toolsForStep(proposal.Spec.Analysis)} + resolved.Analysis = resolvedStep{Agent: agent, LLM: llm, Tools: toolsForStep(proposal.Spec.Analysis), TimeoutMinutes: proposal.Spec.Analysis.TimeoutMinutes} if !proposal.Spec.Execution.IsZero() { agent, llm, err := resolveAgent(effectiveAgent(agenticv1alpha1.SandboxStepExecution, proposal.Spec.Execution)) if err != nil { return nil, fmt.Errorf("%s: %w", ErrResolveExecutionStep, err) } - resolved.Execution = &resolvedStep{Agent: agent, LLM: llm, Tools: toolsForStep(proposal.Spec.Execution)} + resolved.Execution = &resolvedStep{Agent: agent, LLM: llm, Tools: toolsForStep(proposal.Spec.Execution), TimeoutMinutes: proposal.Spec.Execution.TimeoutMinutes} } if !proposal.Spec.Verification.IsZero() { @@ -95,7 +96,7 @@ func resolveProposal(ctx context.Context, c client.Client, proposal *agenticv1al if err != nil { return nil, fmt.Errorf("%s: %w", ErrResolveVerificationStep, err) } - resolved.Verification = &resolvedStep{Agent: agent, LLM: llm, Tools: toolsForStep(proposal.Spec.Verification)} + resolved.Verification = &resolvedStep{Agent: agent, LLM: llm, Tools: toolsForStep(proposal.Spec.Verification), TimeoutMinutes: proposal.Spec.Verification.TimeoutMinutes} } return resolved, nil diff --git a/controller/proposal/sandbox.go b/controller/proposal/sandbox.go index a4e47d8e..6dfe09a6 100644 --- a/controller/proposal/sandbox.go +++ b/controller/proposal/sandbox.go @@ -99,7 +99,7 @@ func (m *SandboxManager) buildClaim(claimName, proposalName, step, templateName func (m *SandboxManager) Claim(ctx context.Context, proposalName, step, _ string) (string, error) { log := logf.FromContext(ctx) - templateName, err := EnsureAgentTemplate(ctx, m.Client, m.BaseTemplateName, m.Namespace, step, m.agent, m.llm, m.tools, nil, m.serviceAccount) + templateName, err := EnsureAgentTemplate(ctx, m.Client, m.BaseTemplateName, m.Namespace, step, m.agent, m.llm, m.tools, m.serviceAccount) if err != nil { return "", fmt.Errorf("%s: %w", ErrEnsureAgentTemplate, err) } diff --git a/controller/proposal/sandbox_agent.go b/controller/proposal/sandbox_agent.go index ae71444c..6ac58ed8 100644 --- a/controller/proposal/sandbox_agent.go +++ b/controller/proposal/sandbox_agent.go @@ -35,13 +35,13 @@ type analysisResponse struct { } type executionResponse struct { - Success bool `json:"success"` + Success bool `json:"success"` ActionsTaken []agenticv1alpha1.ExecutionAction `json:"actionsTaken"` Verification *agenticv1alpha1.ExecutionVerification `json:"verification,omitempty"` } type verificationResponse struct { - Success bool `json:"success"` + Success bool `json:"success"` Checks []agenticv1alpha1.VerifyCheck `json:"checks"` Summary string `json:"summary"` } @@ -71,12 +71,12 @@ func NewSandboxAgentCaller( } } -// proposalTimeout returns the effective timeout for sandbox operations. -// Reads spec.timeoutMinutes when set; falls back to defaultSandboxTimeout. +// stepTimeout returns the effective timeout for a single step's sandbox operation. +// Reads the step's timeoutMinutes when set; falls back to defaultSandboxTimeout. // This is the single place where timeout policy is decided. -func proposalTimeout(proposal *agenticv1alpha1.Proposal) time.Duration { - if proposal.Spec.TimeoutMinutes != nil && *proposal.Spec.TimeoutMinutes > 0 { - return time.Duration(*proposal.Spec.TimeoutMinutes) * time.Minute +func stepTimeout(step resolvedStep) time.Duration { + if step.TimeoutMinutes > 0 { + return time.Duration(step.TimeoutMinutes) * time.Minute } return defaultSandboxTimeout } @@ -194,7 +194,6 @@ func (s *SandboxAgentCaller) callWithSandbox( timeout = defaultSandboxTimeout } - claimName, err := s.Sandbox.Claim(ctx, proposal.Name, stepName, "") if err != nil { return nil, fmt.Errorf("%s: %w", ErrClaimSandbox, err) @@ -204,7 +203,11 @@ func (s *SandboxAgentCaller) callWithSandbox( // while the sandbox is still starting up s.patchSandboxInfo(ctx, proposal, stepName, claimName) - endpoint, err := s.Sandbox.WaitReady(ctx, claimName, timeout) + // Pod startup is an infrastructure concern unrelated to the agent's work + // budget. Using a fixed ceiling here ensures that the full user-configured + // timeout is available for the agent call itself, and avoids the effective + // wall-clock time being 2× the configured value. + endpoint, err := s.Sandbox.WaitReady(ctx, claimName, defaultSandboxTimeout) if err != nil { return nil, fmt.Errorf("%s: %w", ErrWaitForSandbox, err) } diff --git a/controller/proposal/sandbox_agent_test.go b/controller/proposal/sandbox_agent_test.go index 20ed0775..d5052969 100644 --- a/controller/proposal/sandbox_agent_test.go +++ b/controller/proposal/sandbox_agent_test.go @@ -18,13 +18,14 @@ import ( // --- Hand-written mocks --- type mockSandboxProvider struct { - claimName string - claimErr error - endpoint string - readyErr error - releaseErr error - claimCalls int - releaseCalls int + claimName string + claimErr error + endpoint string + readyErr error + releaseErr error + claimCalls int + releaseCalls int + lastWaitReadyTimeout time.Duration } func (m *mockSandboxProvider) SetStep(_ *agenticv1alpha1.Agent, _ *agenticv1alpha1.LLMProvider, _ *agenticv1alpha1.ToolsSpec, _ string) { @@ -33,7 +34,8 @@ func (m *mockSandboxProvider) Claim(_ context.Context, _, _, _ string) (string, m.claimCalls++ return m.claimName, m.claimErr } -func (m *mockSandboxProvider) WaitReady(_ context.Context, _ string, _ time.Duration) (string, error) { +func (m *mockSandboxProvider) WaitReady(_ context.Context, _ string, d time.Duration) (string, error) { + m.lastWaitReadyTimeout = d return m.endpoint, m.readyErr } func (m *mockSandboxProvider) Release(_ context.Context, _ string) error { @@ -60,11 +62,11 @@ func newTestSandboxAgentCaller(sandbox *mockSandboxProvider, httpClient *mockHTT fc := fake.NewClientBuilder().WithScheme(testScheme()).Build() _ = fc.Create(context.Background(), fakeBaseTemplate()) return &SandboxAgentCaller{ - Sandbox: sandbox, - K8sClient: fc, - ClientFactory: func(_ string, _ time.Duration) AgentHTTPClientInterface { return httpClient }, - Namespace: "test-ns", - Timeout: 5 * time.Minute, + Sandbox: sandbox, + K8sClient: fc, + ClientFactory: func(_ string, _ time.Duration) AgentHTTPClientInterface { return httpClient }, + Namespace: "test-ns", + Timeout: 5 * time.Minute, } } @@ -75,11 +77,11 @@ func newTestSandboxAgentCallerWithProposal(sandbox *mockSandboxProvider, httpCli Build() _ = fc.Create(context.Background(), fakeBaseTemplate()) return &SandboxAgentCaller{ - Sandbox: sandbox, - K8sClient: fc, - ClientFactory: func(_ string, _ time.Duration) AgentHTTPClientInterface { return httpClient }, - Namespace: "test-ns", - Timeout: 5 * time.Minute, + Sandbox: sandbox, + K8sClient: fc, + ClientFactory: func(_ string, _ time.Duration) AgentHTTPClientInterface { return httpClient }, + Namespace: "test-ns", + Timeout: 5 * time.Minute, } } @@ -730,3 +732,49 @@ func (m *trackingMockSandbox) Release(_ context.Context, claimName string) error } return nil } + +// TestSandboxAgentCaller_TimeoutPropagation verifies the two-phase timeout +// design: WaitReady (pod startup) always uses the fixed defaultSandboxTimeout +// regardless of the step's configured timeout, while ClientFactory receives the +// full user-configured timeout for the agent's work. +func TestSandboxAgentCaller_TimeoutPropagation(t *testing.T) { + const customTimeout = 20 * time.Minute + + sandbox := &mockSandboxProvider{ + claimName: "ls-analysis-test", + endpoint: "http://sandbox:8080", + } + httpClient := &mockHTTPClient{ + response: &agentRunResponse{ + Response: json.RawMessage(`{"success": true, "options": []}`), + }, + } + + var lastFactoryTimeout time.Duration + fc := fake.NewClientBuilder().WithScheme(testScheme()).Build() + _ = fc.Create(context.Background(), fakeBaseTemplate()) + caller := &SandboxAgentCaller{ + Sandbox: sandbox, + K8sClient: fc, + ClientFactory: func(_ string, d time.Duration) AgentHTTPClientInterface { + lastFactoryTimeout = d + return httpClient + }, + Namespace: "test-ns", + Timeout: defaultSandboxTimeout, + } + + _, err := caller.Analyze(context.Background(), testSandboxProposal(), testSandboxStep(), "test", "", customTimeout) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Pod startup always uses the fixed ceiling, not the step-level timeout. + if sandbox.lastWaitReadyTimeout != defaultSandboxTimeout { + t.Errorf("WaitReady timeout = %v, want defaultSandboxTimeout (%v)", sandbox.lastWaitReadyTimeout, defaultSandboxTimeout) + } + // The agent's work budget is the full configured timeout. + if lastFactoryTimeout != customTimeout { + t.Errorf("ClientFactory timeout = %v, want %v", lastFactoryTimeout, customTimeout) + } +} diff --git a/controller/proposal/sandbox_templates.go b/controller/proposal/sandbox_templates.go index 1c691a3f..175b5321 100644 --- a/controller/proposal/sandbox_templates.go +++ b/controller/proposal/sandbox_templates.go @@ -73,14 +73,15 @@ var sandboxTemplateGVK = schema.GroupVersionKind{ } const ( - agentModeEnvVar = "LIGHTSPEED_MODE" - llmCredsMountPath = "/var/run/secrets/llm-credentials" + agentModeEnvVar = "LIGHTSPEED_MODE" vertexCredsMountPath = "/var/secrets/google" vertexCredsFileName = "credentials.json" llmCredsVolumeName = "llm-credentials" + llmCredsMountPath = "/var/run/secrets/llm-credentials" mcpHeadersMountRoot = "/var/secrets/mcp" mcpServersEnvVar = "LIGHTSPEED_MCP_SERVERS" dataSourceMountPath = "/data/input" + dataSourceVolumeName = "lightspeed-data-source" LabelManaged = "agentic.openshift.io/managed" LabelBaseTemplate = "agentic.openshift.io/base-template" @@ -137,7 +138,8 @@ func agentTemplateName(step, agentName, hash string) string { } // EnsureAgentTemplate creates a SandboxTemplate derived from the base template -// with skills, LLM credentials, MCP servers, and required secrets from the CRD chain. +// with skills, LLM credentials, MCP servers, required secrets, and an optional +// dataSource PVC from the CRD chain. // Template name includes a config hash — same input = same template = no-op. // Old templates for the same agent+phase are garbage-collected. func EnsureAgentTemplate( @@ -149,7 +151,6 @@ func EnsureAgentTemplate( agent *agenticv1alpha1.Agent, llm *agenticv1alpha1.LLMProvider, tools *agenticv1alpha1.ToolsSpec, - dataSource *agenticv1alpha1.DataSource, serviceAccount string, ) (string, error) { log := logf.FromContext(ctx).WithName("sandbox-templates") @@ -170,10 +171,14 @@ func EnsureAgentTemplate( var skills []agenticv1alpha1.SkillsSource var mcpServers []agenticv1alpha1.MCPServerConfig var requiredSecrets []agenticv1alpha1.SecretRequirement + var dataSource *agenticv1alpha1.DataSource if tools != nil { skills = tools.Skills mcpServers = tools.MCPServers requiredSecrets = tools.RequiredSecrets + if !tools.DataSource.IsZero() { + dataSource = &tools.DataSource + } } hash, err := computeTemplateHash(llm, agent.Spec.Model, skills, mcpServers, requiredSecrets, dataSource, step, base.GetResourceVersion(), serviceAccount) @@ -653,7 +658,7 @@ func addPVCVolume(tmpl *unstructured.Unstructured, volumeName, claimName string) } func patchDataSource(tmpl *unstructured.Unstructured, ds *agenticv1alpha1.DataSource) error { - volName := "data-source" + volName := dataSourceVolumeName if err := addPVCVolume(tmpl, volName, ds.ClaimName); err != nil { return fmt.Errorf("add data source PVC volume: %w", err) } diff --git a/controller/proposal/sandbox_templates_test.go b/controller/proposal/sandbox_templates_test.go index 9ca39adc..09bb3452 100644 --- a/controller/proposal/sandbox_templates_test.go +++ b/controller/proposal/sandbox_templates_test.go @@ -517,14 +517,14 @@ func TestSetEnvVar_FailsOnNoContainers(t *testing.T) { } func TestEnsureAgentTemplate_NilAgent(t *testing.T) { - _, err := EnsureAgentTemplate(context.Background(), nil, "base", "ns", "analysis", nil, testLLMProvider(agenticv1alpha1.LLMProviderGoogleCloudVertex), nil, nil, "lightspeed-agent") + _, err := EnsureAgentTemplate(context.Background(), nil, "base", "ns", "analysis", nil, testLLMProvider(agenticv1alpha1.LLMProviderGoogleCloudVertex), nil, "lightspeed-agent") if err == nil { t.Error("expected error for nil agent") } } func TestEnsureAgentTemplate_NilLLM(t *testing.T) { - _, err := EnsureAgentTemplate(context.Background(), nil, "base", "ns", "analysis", testDefaultAgent(), nil, nil, nil,"lightspeed-agent") + _, err := EnsureAgentTemplate(context.Background(), nil, "base", "ns", "analysis", testDefaultAgent(), nil, nil, "lightspeed-agent") if err == nil { t.Error("expected error for nil LLM") } @@ -730,3 +730,89 @@ func TestPatchProbes(t *testing.T) { } }) } + +// TestComputeTemplateHash_DataSource verifies that a non-nil dataSource changes +// the hash, and that two different claimNames produce two different hashes. +// DataSource is optional — nil is valid and covered by all other hash tests above. +func TestComputeTemplateHash_DataSource(t *testing.T) { + llm := testLLMProvider(agenticv1alpha1.LLMProviderGoogleCloudVertex) + skills := []agenticv1alpha1.SkillsSource{{Image: "quay.io/test/skills:latest"}} + + hNoDS, err := computeTemplateHash(llm, "claude-opus-4-6", skills, nil, nil, nil, "analysis", "", "") + if err != nil { + t.Fatalf("computeTemplateHash (nil dataSource): %v", err) + } + + ds1 := &agenticv1alpha1.DataSource{ClaimName: "pvc-alpha"} + hDS1, err := computeTemplateHash(llm, "claude-opus-4-6", skills, nil, nil, ds1, "analysis", "", "") + if err != nil { + t.Fatalf("computeTemplateHash (dataSource pvc-alpha): %v", err) + } + + ds2 := &agenticv1alpha1.DataSource{ClaimName: "pvc-beta"} + hDS2, err := computeTemplateHash(llm, "claude-opus-4-6", skills, nil, nil, ds2, "analysis", "", "") + if err != nil { + t.Fatalf("computeTemplateHash (dataSource pvc-beta): %v", err) + } + + if hNoDS == hDS1 { + t.Error("nil dataSource and non-nil dataSource should produce different hashes") + } + if hDS1 == hDS2 { + t.Error("different claimNames should produce different hashes") + } +} + +// TestPatchDataSource_MountsReadOnly verifies that patchDataSource adds a PVC +// volume with the correct claimName and mounts it read-only at /data/input. +func TestPatchDataSource_MountsReadOnly(t *testing.T) { + tmpl := emptyTemplate() + ds := &agenticv1alpha1.DataSource{ClaimName: "my-must-gather-pvc"} + + if err := patchDataSource(tmpl, ds); err != nil { + t.Fatalf("patchDataSource: %v", err) + } + + // Verify volume was added with the correct name and claimName. + volumes, _, _ := unstructured.NestedSlice(tmpl.Object, "spec", "podTemplate", "spec", "volumes") + var foundVol bool + for _, v := range volumes { + vol, ok := v.(map[string]any) + if !ok { + continue + } + if vol["name"] != dataSourceVolumeName { + continue + } + foundVol = true + pvc, ok := vol["persistentVolumeClaim"].(map[string]any) + if !ok { + t.Fatalf("volume %q has no persistentVolumeClaim", dataSourceVolumeName) + } + if got := pvc["claimName"]; got != "my-must-gather-pvc" { + t.Errorf("claimName = %q, want %q", got, "my-must-gather-pvc") + } + } + if !foundVol { + t.Errorf("volume %q not found after patchDataSource", dataSourceVolumeName) + } + + // Verify volumeMount is read-only at /data/input. + mounts := getVolumeMounts(tmpl) + var foundMount bool + for _, m := range mounts { + if m["mountPath"] != dataSourceMountPath { + continue + } + foundMount = true + if m["name"] != dataSourceVolumeName { + t.Errorf("mount name = %q, want %q", m["name"], dataSourceVolumeName) + } + if ro, _ := m["readOnly"].(bool); !ro { + t.Error("dataSource mount should be readOnly=true") + } + } + if !foundMount { + t.Errorf("volumeMount at %q not found after patchDataSource", dataSourceMountPath) + } +} diff --git a/controller/proposal/templates/analysis_query.tmpl b/controller/proposal/templates/analysis_query.tmpl index ff6eea11..cdc002f0 100644 --- a/controller/proposal/templates/analysis_query.tmpl +++ b/controller/proposal/templates/analysis_query.tmpl @@ -1,31 +1,5 @@ You are an analysis agent. Your job is to diagnose the problem, determine the root cause, and propose one or more remediation options. Do NOT attempt to fix, patch, or execute any changes — only analyze and propose. -## Skills - -A specialist deep-RCA pipeline is available at `/app/skills/intelliaide/SKILL.md`. - -Use it ONLY when the request calls for: -- Root cause analysis (RCA) or deeper / ML-assisted troubleshooting of a cluster issue -- Must-gather collection or analysis -- Investigating pod failures, etcd degradation, networking problems, storage issues, etc. -- Any request that explicitly mentions "deeper analysis", "deeper troubleshooting", "root cause", "RCA", "must-gather", or "IntelliAide" - -For routine inspection (checking pod/node status, listing events, summarising resource state, -describing objects), use `kubectl`/`oc` commands directly — do NOT invoke the IntelliAide pipeline. - -**Decision rule — apply before doing anything else:** -1. Read the `## Request` section below. -2. If it is a routine inspection query → proceed with `kubectl`/`oc` directly. -3. If it is a deep-RCA or troubleshooting request → read the skill file with ONE atomic command: - ``` - cat /app/skills/intelliaide/SKILL.md - ``` - If the command returns one or more paths, read the most relevant SKILL.md with `cat` - and follow its workflow **exactly** instead of the instructions below. - If no SKILL.md files are found, stop immediately and return a JSON error response — skills are required and their absence is a fatal misconfiguration. - -## Analysis requirements - For each option you propose, include: - A diagnosis with root cause and confidence level - A detailed remediation plan with specific actions diff --git a/examples/setup/09-intelliaide-proposals.yaml b/examples/setup/09-intelliaide-proposals.yaml index d7e177d4..5c488e78 100644 --- a/examples/setup/09-intelliaide-proposals.yaml +++ b/examples/setup/09-intelliaide-proposals.yaml @@ -19,21 +19,30 @@ # render with MarkdownText in the console UI. No custom outputSchema is used. # # IntelliAide triggering: -# The analysis prompt (analysis_query.tmpl) reads the request text and invokes -# the IntelliAide pipeline when the request contains RCA keywords such as -# "root cause analysis", "RCA", "deeper analysis", "must-gather", or "IntelliAide". -# All requests in this file are phrased to trigger IntelliAide automatically. +# The operator is skill-agnostic — it mounts the skills image and passes +# spec.request to the agent. IntelliAide is invoked by the agent itself when +# it reads the SKILL.md from the mounted skills image. No keyword routing +# is done by the operator. Phrase the request to describe the problem; the +# agent decides which skill to invoke. # # Modes: -# Live mode (default) — IntelliAide reads live cluster state via kubectl/oc. -# Must-gather mode — set RUN_MUST_GATHER=1 env var, or phrase the request -# with "using must-gather" / "collect must-gather". +# Live mode (default) — IntelliAide queries live cluster state via kubectl/oc. +# No dataSource needed. +# Must-gather mode — Provide a pre-populated PVC via tools.dataSource. +# The operator mounts it read-only at /data/input. +# Mention the bundle path in spec.request so the agent +# knows to analyze it instead of collecting live data. # # Pre-requisites (cluster admin, one-time): # 1. Build and push skills image: # podman build -f Dockerfile.skills \ # -t quay.io//intelliaide-skills:latest . # podman push quay.io//intelliaide-skills:latest +# # Retag and push to the in-cluster registry so Proposals can pull it: +# podman tag quay.io//intelliaide-skills:latest \ +# image-registry.openshift-image-registry.svc:5000/openshift-lightspeed/lightspeed-skills:latest +# podman push \ +# image-registry.openshift-image-registry.svc:5000/openshift-lightspeed/lightspeed-skills:latest # 2. Label openshift-lightspeed namespace (Kubernetes 1.21+ does this # automatically via kubernetes.io/metadata.name): # oc label namespace openshift-lightspeed \ @@ -64,15 +73,14 @@ spec: targetNamespaces: - openshift-etcd - # IntelliAide RCA takes 10-30 minutes; override the default 5-min timeout. - timeoutMinutes: 30 - tools: skills: - image: image-registry.openshift-image-registry.svc:5000/openshift-lightspeed/lightspeed-skills:latest analysis: agent: smart + # IntelliAide RCA takes 10-30 minutes; override the default 5-min timeout. + timeoutMinutes: 30 --- # Cluster update failure RCA @@ -96,14 +104,13 @@ spec: - openshift-cluster-version - openshift-machine-config-operator - timeoutMinutes: 30 - tools: skills: - image: image-registry.openshift-image-registry.svc:5000/openshift-lightspeed/lightspeed-skills:latest analysis: agent: smart + timeoutMinutes: 30 --- # Live-cluster RCA — intermittent router/ingress latency spikes @@ -142,14 +149,13 @@ spec: - openshift-ingress - openshift-ingress-operator - timeoutMinutes: 30 - tools: skills: - image: image-registry.openshift-image-registry.svc:5000/openshift-lightspeed/lightspeed-skills:latest analysis: agent: smart + timeoutMinutes: 30 --- # General RCA — operator-managed workload degradation (no dedicated template) @@ -178,11 +184,48 @@ spec: targetNamespaces: [] # fill in the affected namespace(s) before applying - timeoutMinutes: 30 + tools: + skills: + - image: image-registry.openshift-image-registry.svc:5000/openshift-lightspeed/lightspeed-skills:latest + + analysis: + agent: smart + timeoutMinutes: 30 + +--- +# Must-gather mode RCA — analyze a pre-collected must-gather bundle +# +# Use this when live cluster access is not available or the issue is +# transient and you have already collected a must-gather bundle. +# The PVC must exist in the same namespace as the Proposal and be +# pre-populated with the must-gather data before applying this file. +# +# Pre-requisites: +# 1. Collect a must-gather bundle: +# oc adm must-gather --dest-dir=/tmp/must-gather +# 2. Create a PVC and populate it with the bundle data. +# 3. Apply this Proposal (after replacing ): +# kubectl apply -f 09-intelliaide-proposals.yaml +apiVersion: agentic.openshift.io/v1alpha1 +kind: Proposal +metadata: + name: intelliaide-rca-must-gather + namespace: openshift-lightspeed + labels: + agentic.openshift.io/source: intelliaide + agentic.openshift.io/mode: must-gather +spec: + request: | + Perform root cause analysis on the must-gather bundle mounted at + /data/input. Analyze the collected cluster state and identify the + root cause of the issue, then provide remediation options. tools: skills: - image: image-registry.openshift-image-registry.svc:5000/openshift-lightspeed/lightspeed-skills:latest + dataSource: + claimName: # replace with your PVC name analysis: agent: smart + timeoutMinutes: 30