From e9fa284f2c067ac68efa4d3e65f823a22351d9bf Mon Sep 17 00:00:00 2001
From: Andy Xie <anxie@redhat.com>
Date: Tue, 9 Jun 2026 02:10:37 +0800
Subject: [PATCH 1/2] feat(agent): add AgentRunner runtime dispatcher (Phase 2
 foundation)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Route a sub-agent task to the legacy Anthropic loop or the Claude Agent
SDK adapter based on agent.runtime (legacy|sdk, default legacy), returning
the same structured result dict either way.

Additive and dormant: nothing in the request path imports it yet (mirrors
how the #24 adapter shipped behind the flag). A follow-up PR wires the
Icinga sub-agent to dispatch through it. With the default legacy runtime
it is a transparent pass-through to src.agent.agents.run_sub_agent — zero
behavior change. The SDK branch surfaces token/cost/cache usage under
data.usage, which the Phase-2 cost benchmark compares against legacy.

11 tests: runtime resolution, legacy pass-through, SDK result
normalization, and SDK-unavailable error handling.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 src/agent/runner.py        | 198 +++++++++++++++++++++++++++++++++++
 tests/test_agent_runner.py | 204 +++++++++++++++++++++++++++++++++++++
 2 files changed, 402 insertions(+)
 create mode 100644 src/agent/runner.py
 create mode 100644 tests/test_agent_runner.py

diff --git a/src/agent/runner.py b/src/agent/runner.py
new file mode 100644
index 0000000..51785bc
--- /dev/null
+++ b/src/agent/runner.py
@@ -0,0 +1,198 @@
+"""AgentRunner — route a sub-agent task to the legacy loop or the Claude Agent SDK.
+
+Phase-2 foundation. The orchestrator currently calls :func:`src.agent.agents.run_sub_agent`
+directly. This module introduces a single dispatch seam that reads ``agent.runtime``
+(``legacy|sdk``, default ``legacy``) once and routes each sub-agent task to the matching
+runtime, returning the **same structured result dict** either way so callers don't care
+which runtime answered.
+
+It is intentionally *additive and dormant*: nothing in the request path imports it yet
+(mirroring how the #24 adapter shipped behind the flag). A later PR wires a specific
+sub-agent (Icinga) to dispatch through it. With the default ``legacy`` runtime,
+:meth:`AgentRunner.run_sub_agent` is a transparent pass-through to the existing loop —
+zero behavior change.
+
+The SDK branch here is deliberately minimal: it runs the agent's system prompt through
+:meth:`AgentSdkClient.complete` and normalizes the outcome. Per-agent *skill + tool*
+wiring (e.g. the Icinga ``query_icinga`` tool surface and ``icinga-triage`` SKILL.md)
+lands in the follow-up PR; this module only owns the routing.
+"""
+
+from __future__ import annotations
+
+import logging
+import time
+from typing import Any
+
+from src.llm import RUNTIME_SDK, RuntimeName, get_runtime
+
+logger = logging.getLogger(__name__)
+
+
+class AgentRunner:
+    """Dispatches a sub-agent task to the configured runtime.
+
+    Resolve the runtime once (at construction) so a single request doesn't
+    re-read config per sub-agent call. Stateless apart from the resolved
+    runtime + config handle, so it's safe to build per request or reuse.
+    """
+
+    def __init__(self, config: Any, runtime: RuntimeName | None = None) -> None:
+        """Store ``config`` and resolve the runtime once, at construction.
+
+        Args:
+            config: Dynaconf-style config (or plain dict in tests).
+            runtime: Forced runtime for tests/benchmarks; falsy defers to :func:`get_runtime`.
+        """
+        self._config = config
+        self._runtime: RuntimeName = runtime or get_runtime(config)
+        logger.debug("AgentRunner initialized with runtime=%s", self._runtime)
+
+    @property
+    def runtime(self) -> RuntimeName:
+        """The resolved runtime (``legacy`` or ``sdk``)."""
+        return self._runtime
+
+    async def run_sub_agent(
+        self,
+        agent_type: str,
+        task: str,
+        context: dict | None = None,
+        client: Any = None,
+        event_queue: Any = None,
+        conversation_history: list | None = None,
+    ) -> dict:
+        """Run one sub-agent task on the runtime resolved at construction.
+
+        Signature mirrors :func:`src.agent.agents.run_sub_agent` so the runner
+        is a drop-in seam. Only ``agent_type``/``task``/``context`` reach the SDK
+        path; ``client``/``event_queue``/``conversation_history`` are forwarded to
+        the legacy loop only (the SDK runs its own loop and streams internally).
+
+        Returns:
+            The legacy result dict (``agent``/``status``/``summary``/``findings``/
+            ``data``/``tool_calls``/``duration_seconds`` …). The SDK path produces
+            the same shape, with token/cost/cache usage under ``data["usage"]``.
+        """
+        if self._runtime == RUNTIME_SDK:
+            return await self._run_via_sdk(agent_type=agent_type, task=task, context=context)
+        return await self._run_via_legacy(
+            agent_type=agent_type,
+            task=task,
+            context=context,
+            client=client,
+            event_queue=event_queue,
+            conversation_history=conversation_history,
+        )
+
+    # ----------------------------------------------------------------- legacy
+
+    async def _run_via_legacy(
+        self,
+        *,
+        agent_type: str,
+        task: str,
+        context: dict | None,
+        client: Any,
+        event_queue: Any,
+        conversation_history: list | None,
+    ) -> dict:
+        """Delegate to the legacy ``run_sub_agent`` loop, forwarding every argument as-is."""
+        # Lazy import: agents.py pulls in the full tool/orchestrator graph, which an
+        # SDK-runtime runner should not have to load just to be constructed.
+        from src.agent.agents import run_sub_agent as legacy_run_sub_agent
+
+        return await legacy_run_sub_agent(
+            agent_type,
+            task,
+            context=context,
+            client=client,
+            event_queue=event_queue,
+            conversation_history=conversation_history,
+        )
+
+    # -------------------------------------------------------------------- sdk
+
+    async def _run_via_sdk(self, *, agent_type: str, task: str, context: dict | None) -> dict:
+        """Run the task through the Claude Agent SDK adapter and normalize it.
+
+        Minimal by design (see module docstring): no per-agent skill/tool wiring
+        yet. The agent's system prompt is loaded the same way the legacy loop
+        loads it, so the two paths share prompt content for a fair benchmark.
+        """
+        from src.agent.system_prompt import get_agent_prompt
+        from src.llm import AgentSdkClient, AgentSdkUnavailableError
+
+        start = time.monotonic()
+        system = get_agent_prompt(agent_type) or None
+        prompt = _with_context(task, context)
+
+        try:
+            sdk_client = AgentSdkClient.from_config(self._config)
+            result = await sdk_client.complete(prompt=prompt, system=system)
+        except AgentSdkUnavailableError as exc:
+            logger.warning("SDK runtime requested but unavailable: %s", exc)
+            return _error_result(agent_type, str(exc), round(time.monotonic() - start, 1))
+
+        return _sdk_result_to_dict(agent_type, result, round(time.monotonic() - start, 1))
+
+
+# --------------------------------------------------------------------- helpers
+
+
+def _with_context(task: str, context: dict | None) -> str:
+    """Append ``context`` to ``task`` as a fenced JSON block (legacy framing); no-op if empty."""
+    if not context:
+        return task
+    import json
+
+    return f"{task}\n\n**Context from orchestrator:**\n```json\n{json.dumps(context, default=str)}\n```"
+
+
+def _sdk_result_to_dict(agent_type: str, result: Any, duration_seconds: float) -> dict:
+    """Normalize an :class:`SdkResult` into the legacy ``run_sub_agent`` result shape.
+
+    ``data.usage`` carries the token/cost/cache counters — exactly the fields
+    the Phase-2 cost benchmark compares against the legacy path.
+    """
+    usage, invocations = result.usage, result.tool_invocations
+    return {
+        "agent": agent_type,
+        "status": "success" if result.succeeded else "error",
+        "summary": result.text or result.error_message or "",
+        "findings": [result.text] if result.text else [],
+        "data": {
+            "runtime": "sdk",
+            "model": result.model,
+            "session_id": result.session_id,
+            "usage": {
+                "input_tokens": usage.input_tokens,
+                "output_tokens": usage.output_tokens,
+                "cache_creation_input_tokens": usage.cache_creation_input_tokens,
+                "cache_read_input_tokens": usage.cache_read_input_tokens,
+                "total_cost_usd": usage.total_cost_usd,
+                "num_turns": usage.num_turns,
+            },
+        },
+        "tool_calls": len(invocations),
+        "tool_errors": sum(bool(inv.get("is_error")) for inv in invocations),
+        "rounds_used": usage.num_turns,
+        "duration_seconds": duration_seconds,
+        "error": result.error_message,
+    }
+
+
+def _error_result(agent_type: str, message: str, duration_seconds: float = 0.0) -> dict:
+    """Return a legacy-shaped SDK error result; ``message`` fills ``summary`` and ``error``."""
+    return {
+        "agent": agent_type,
+        "status": "error",
+        "summary": message,
+        "findings": [],
+        "data": {"runtime": "sdk"},
+        "tool_calls": 0,
+        "tool_errors": 0,
+        "rounds_used": 0,
+        "duration_seconds": duration_seconds,
+        "error": message,
+    }
diff --git a/tests/test_agent_runner.py b/tests/test_agent_runner.py
new file mode 100644
index 0000000..76320b1
--- /dev/null
+++ b/tests/test_agent_runner.py
@@ -0,0 +1,204 @@
+"""Tests for src.agent.runner.AgentRunner.
+
+The runner is a routing seam between the legacy Anthropic loop
+(``src.agent.agents.run_sub_agent``) and the Claude Agent SDK adapter
+(``src.llm.AgentSdkClient``). These tests inject fakes for both heavy
+dependencies via ``monkeypatch`` so the routing/normalization logic can be
+exercised in isolation (no anthropic SDK, no claude_agent_sdk, no config files).
+"""
+
+from __future__ import annotations
+
+import sys
+import types
+from typing import Any
+
+import pytest
+
+from src.agent.runner import (
+    AgentRunner,
+    _error_result,
+    _sdk_result_to_dict,
+    _with_context,
+)
+from src.llm import RUNTIME_LEGACY, RUNTIME_SDK, AgentSdkUnavailableError, SdkResult, SdkUsage
+
+# --------------------------------------------------------------- runtime wiring
+
+
+def test_runtime_defaults_to_legacy() -> None:
+    assert AgentRunner({}).runtime == RUNTIME_LEGACY
+
+
+def test_runtime_reads_flag_from_config() -> None:
+    assert AgentRunner({"agent": {"runtime": "sdk"}}).runtime == RUNTIME_SDK
+
+
+def test_runtime_explicit_override_wins() -> None:
+    # Even with legacy in config, an explicit override forces the runtime
+    # (used by the benchmark harness to drive both paths from one config).
+    runner = AgentRunner({"agent": {"runtime": "legacy"}}, runtime=RUNTIME_SDK)
+    assert runner.runtime == RUNTIME_SDK
+
+
+def test_unknown_runtime_falls_back_to_legacy() -> None:
+    assert AgentRunner({"agent": {"runtime": "bogus"}}).runtime == RUNTIME_LEGACY
+
+
+# ------------------------------------------------------------- legacy dispatch
+
+
+async def test_legacy_dispatch_passes_through(monkeypatch: pytest.MonkeyPatch) -> None:
+    captured: dict[str, Any] = {}
+
+    async def _fake_run_sub_agent(agent_type: str, task: str, **kwargs: Any) -> dict:
+        captured["agent_type"] = agent_type
+        captured["task"] = task
+        captured["kwargs"] = kwargs
+        return {"agent": agent_type, "status": "success", "summary": "legacy answer"}
+
+    fake_agents = types.ModuleType("src.agent.agents")
+    fake_agents.run_sub_agent = _fake_run_sub_agent  # type: ignore[attr-defined]
+    monkeypatch.setitem(sys.modules, "src.agent.agents", fake_agents)
+
+    runner = AgentRunner({"agent": {"runtime": "legacy"}})
+    out = await runner.run_sub_agent(
+        "icinga", "triage alert X", context={"host": "h1"}, conversation_history=[{"x": 1}]
+    )
+
+    assert out["summary"] == "legacy answer"
+    assert captured["agent_type"] == "icinga"
+    assert captured["task"] == "triage alert X"
+    # context/conversation_history forwarded verbatim to the legacy loop
+    assert captured["kwargs"]["context"] == {"host": "h1"}
+    assert captured["kwargs"]["conversation_history"] == [{"x": 1}]
+
+
+# ---------------------------------------------------------------- sdk dispatch
+
+
+class _FakeSdkClient:
+    """Stand-in for AgentSdkClient: records the call and returns a real SdkResult."""
+
+    calls: list[dict[str, Any]] = []
+
+    def __init__(self, result: SdkResult) -> None:
+        self._result = result
+
+    @classmethod
+    def from_config(cls, config: Any) -> _FakeSdkClient:
+        return cls(
+            SdkResult(
+                text="sdk answer",
+                tool_invocations=(
+                    {"name": "query_icinga", "input": {}, "is_error": False},
+                    {"name": "fetch_github_file", "input": {}, "is_error": True},
+                ),
+                model="claude-sonnet-4-5",
+                session_id="sess-1",
+                usage=SdkUsage(
+                    input_tokens=1000,
+                    output_tokens=200,
+                    cache_creation_input_tokens=50,
+                    cache_read_input_tokens=800,
+                    total_cost_usd=0.0123,
+                    num_turns=3,
+                ),
+            )
+        )
+
+    async def complete(self, *, prompt: str, system: str | None = None, **kwargs: Any) -> SdkResult:
+        _FakeSdkClient.calls.append({"prompt": prompt, "system": system})
+        return self._result
+
+
+def _inject_prompt_loader(
+    monkeypatch: pytest.MonkeyPatch, prompt: str = "ICINGA SYSTEM PROMPT"
+) -> None:
+    fake_sp = types.ModuleType("src.agent.system_prompt")
+    fake_sp.get_agent_prompt = lambda agent_type: prompt  # type: ignore[attr-defined]
+    monkeypatch.setitem(sys.modules, "src.agent.system_prompt", fake_sp)
+
+
+async def test_sdk_dispatch_maps_result(monkeypatch: pytest.MonkeyPatch) -> None:
+    _FakeSdkClient.calls = []
+    _inject_prompt_loader(monkeypatch)
+    monkeypatch.setattr("src.llm.AgentSdkClient", _FakeSdkClient)
+
+    runner = AgentRunner({"agent": {"runtime": "sdk"}})
+    out = await runner.run_sub_agent("icinga", "triage alert X", context={"host": "h1"})
+
+    # routing reached the SDK and forwarded the agent's system prompt + task(+context)
+    assert _FakeSdkClient.calls[0]["system"] == "ICINGA SYSTEM PROMPT"
+    assert _FakeSdkClient.calls[0]["prompt"].startswith("triage alert X")
+    assert "Context from orchestrator" in _FakeSdkClient.calls[0]["prompt"]
+
+    # result normalized onto the legacy dict shape
+    assert out["agent"] == "icinga"
+    assert out["status"] == "success"
+    assert out["summary"] == "sdk answer"
+    assert out["tool_calls"] == 2
+    assert out["tool_errors"] == 1
+    assert out["rounds_used"] == 3
+    # cost/cache usage surfaced for the benchmark
+    assert out["data"]["runtime"] == "sdk"
+    assert out["data"]["usage"]["cache_read_input_tokens"] == 800
+    assert out["data"]["usage"]["total_cost_usd"] == pytest.approx(0.0123)
+
+
+async def test_sdk_unavailable_returns_error_not_raises(monkeypatch: pytest.MonkeyPatch) -> None:
+    _inject_prompt_loader(monkeypatch)
+
+    class _Unavailable:
+        @classmethod
+        def from_config(cls, config: Any) -> Any:
+            raise AgentSdkUnavailableError("claude_agent_sdk is not installed")
+
+    monkeypatch.setattr("src.llm.AgentSdkClient", _Unavailable)
+
+    runner = AgentRunner({"agent": {"runtime": "sdk"}})
+    out = await runner.run_sub_agent("icinga", "triage")
+
+    assert out["status"] == "error"
+    assert "not installed" in out["summary"]
+    assert out["data"]["runtime"] == "sdk"
+
+
+# ------------------------------------------------------------------- helpers
+
+
+def test_with_context_noop_when_empty() -> None:
+    assert _with_context("do thing", None) == "do thing"
+    assert _with_context("do thing", {}) == "do thing"
+
+
+def test_with_context_appends_json() -> None:
+    out = _with_context("do thing", {"account": "123"})
+    assert out.startswith("do thing")
+    assert "Context from orchestrator" in out
+    assert '"account": "123"' in out
+
+
+def test_error_result_shape() -> None:
+    out = _error_result("icinga", "boom", 1.5)
+    assert out == {
+        "agent": "icinga",
+        "status": "error",
+        "summary": "boom",
+        "findings": [],
+        "data": {"runtime": "sdk"},
+        "tool_calls": 0,
+        "tool_errors": 0,
+        "rounds_used": 0,
+        "duration_seconds": 1.5,
+        "error": "boom",
+    }
+
+
+def test_sdk_result_to_dict_error_status() -> None:
+    result = SdkResult(text="", is_error=True, error_message="timeout")
+    out = _sdk_result_to_dict("icinga", result, 2.0)
+    assert out["status"] == "error"
+    assert out["summary"] == "timeout"
+    assert out["findings"] == []
+    assert out["duration_seconds"] == 2.0

From 7475b4a77dd23852b9693e6ea917990551e08cda Mon Sep 17 00:00:00 2001
From: Andy Xie <anxie@redhat.com>
Date: Tue, 9 Jun 2026 02:21:56 +0800
Subject: [PATCH 2/2] feat(agent): port Icinga sub-agent to the Agent SDK
 (phase 2 pilot)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Splits the Icinga sub-agent into a first-class skill + a thin SDK profile,
the Phase-2 pilot for running a sub-agent on the Claude Agent SDK.

- skills/icinga-triage/SKILL.md: the Icinga triage workflow as an Agent
  Skill (state model, the two monitoring GitHub repos, the Step-0->diagnose
  procedure, gated write ops). Loads strict with zero warnings; parsec-native,
  domain=icinga. This is the "skill = reusable capability" half.
- src/agent/icinga_sdk.py: build_icinga_sdk_profile() returns the skill +
  the monitoring-mcp (SSE) and GitHub (HTTP) MCP servers the legacy
  query_icinga / github tools already use, so the SDK consumes the same
  backends directly. Config-only and SDK-import-light (unit-testable).
- runner.py: the SDK branch now applies the per-agent profile
  (sdk_profile_for) — Icinga loads its skill + servers; other agents get an
  empty profile. The "agent = running instance that loads the skill" half.

Still gated by agent.runtime: sdk (default legacy) -> zero behavior change.
Skill discovery in-cluster depends on the image baking skills/ (#27); the
runner seam depends on #30. End-to-end Icinga-on-SDK run is verified in the
personal NERC cluster (results to be commented on the PR).

7 Icinga tests + runner suite green; full suite passes.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 skills/icinga-triage/SKILL.md | 120 ++++++++++++++++++++++++++++++++++
 src/agent/icinga_sdk.py       |  85 ++++++++++++++++++++++++
 src/agent/runner.py           |  12 ++--
 tests/test_icinga_sdk.py      |  78 ++++++++++++++++++++++
 4 files changed, 291 insertions(+), 4 deletions(-)
 create mode 100644 skills/icinga-triage/SKILL.md
 create mode 100644 src/agent/icinga_sdk.py
 create mode 100644 tests/test_icinga_sdk.py

diff --git a/skills/icinga-triage/SKILL.md b/skills/icinga-triage/SKILL.md
new file mode 100644
index 0000000..dfada61
--- /dev/null
+++ b/skills/icinga-triage/SKILL.md
@@ -0,0 +1,120 @@
+---
+name: icinga-triage
+description: >
+  Triage and diagnose an Icinga2 monitoring alert by correlating live host/service
+  state with the check-script source and Icinga GitOps config from GitHub, then
+  produce a root cause and an action plan. Use when someone reports a monitoring
+  alert, a host or service is DOWN / CRITICAL / WARNING / UNKNOWN, or asks why an
+  Icinga check is failing.
+license: MIT
+allowed-tools:
+  - query_icinga
+  - fetch_github_file
+  - search_github_repo
+metadata:
+  author: parsec-team
+  maturity: sample
+parsec:
+  version: "1.0.0"
+  domain: icinga
+  requires_mcp:
+    - icinga
+    - github
+  cost_estimate_per_call_usd: 1.38
+---
+
+# Icinga Alert Triage
+
+You are an expert Icinga SRE. Diagnose Icinga monitoring alerts by combining **live
+Icinga state** with **check-script source** and **Icinga GitOps config** from GitHub.
+
+## When to use
+
+- A monitoring alert fired (host DOWN / service CRITICAL, WARNING, or UNKNOWN).
+- Someone asks "why is this Icinga check failing / red?" or pastes a dashboard alert.
+- You need to correlate a monitoring problem with the script or config that produced it.
+
+## Tools
+
+1. **query_icinga** — Icinga2 hosts, services, problems, downtimes, comments. Can also
+   acknowledge, schedule downtime, force a recheck (see Write Operations).
+2. **fetch_github_file** — fetch monitoring scripts and Icinga config from GitHub.
+3. **search_github_repo** — find paths in a repo by substring.
+
+## Reference repositories
+
+| Repo | Purpose | Key paths |
+|------|---------|-----------|
+| `rhpds/monitoring-scripts` | Custom check scripts (`.sh`/`.py`/`.pl`) | `monitoring/<script>` |
+| `rhpds/monitoring-config` | Icinga2 GitOps config (YAML) | `groups/<group>/{hosts,services,commands}.yaml` |
+
+Use `owner: "rhpds"` with the GitHub tools. The config repo is organized by **groups**:
+`ci`, `database`, `exams`, `external_apis`, `infra_rhdp`, `linux`, `openshift`,
+`projectzero`, `public_cloud`, `rhpds`, `rhpds_apis`.
+
+## State model
+
+- **Host states:** 0=UP, 1=DOWN, 2=UNREACHABLE.
+- **Service states:** 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN.
+- **State types:** SOFT (retrying) vs HARD (confirmed after max retries).
+
+## Workflow
+
+### Step 0 — Identify the alert
+Use `query_icinga` to find it. If host+service given, `get_services` with `host` + a
+`filter_expr` using `match()` on `service.display_name`/`service.name`. If only a host,
+list its services. If only a service name, `match("*keyword*", service.display_name)`
+across hosts. If ambiguous, `get_problems`. Dashboard display names (e.g. "Babylon Schema
+YAML Diff") differ from internal names — bridge with `match()` wildcards. Once found,
+extract `attrs.state`, `attrs.last_check_result.{output,command,exit_status}`,
+`attrs.acknowledgement`, `attrs.downtime_depth`, `attrs.host_name`, `attrs.name`. Also
+check `get_comments` and `get_downtimes` — if already in downtime, report that first.
+
+### Step 0.1 — Determine the platform
+Infer from host/display name: `ocpvirt*`/`ocpv*-hcp*`→CNV on IBM Cloud bare metal;
+`cnv-*`→NaaS (OCP VMs on CNV); `babylon-ocp-*`/`integration-ocp-*`→Babylon on AWS;
+`maas.*`→MaaS on IBM Cloud; `infra-*`→Infra. Confirm from the `openshift` subdir in
+`monitoring-config` (`virt/`, `naas/`, `babylon/`, `maas/`, `infra/`) and the
+`hosttype`/`bastion_user` host vars. Record the platform — include it in the output.
+
+### Step 0.5 — Locate and read the check script
+From `last_check_result.command[0]`, get the script path. Custom scripts live in
+`rhpds/monitoring-scripts` under `monitoring/<name>` — fetch with `fetch_github_file`.
+Standard Nagios plugins (`/usr/lib*/nagios/plugins/`) are explained from their args.
+Walk the script's code path that matches the current output + exit status.
+
+### Step 0.75 — Look up the Icinga config
+`search_github_repo` in `rhpds/monitoring-config` for the host/service to find the group,
+then `fetch_github_file` for `groups/<group>/{services,commands,hosts}.yaml`. Trace how
+host vars → service vars → command args → script params connect; note YAML-level
+thresholds (tunable without script changes).
+
+### Step 1 — Triage
+State (OK/WARNING/CRITICAL/UNKNOWN); severity (HARD vs SOFT via `state_type`); scope
+(host/service/cluster); acknowledged or in downtime.
+
+### Step 2 — Diagnose
+Parse `last_check_result.output`; walk the script path that produced the exit status;
+verify args match script expectations; check config thresholds and `assign_where` rules;
+note `check_interval`/`retry_interval` (a long interval can explain stale results).
+
+### Step 3 — Troubleshoot (action plan)
+Immediate mitigations; investigation commands (e.g. `reschedule_check`); long-term
+config/script improvements.
+
+## Efficiency
+Use `detailed=true` on the follow-up `query_icinga` after locating the alert to get
+output+command+config+thresholds in one call. Don't search GitHub for config on simple
+resource alerts (disk/CPU/memory) — the service output is enough. Only read
+`monitoring-config`/`monitoring-scripts` when you need thresholds or check logic.
+
+## Write Operations (gated)
+Only when the user **explicitly** requests them: `acknowledge_problem`,
+`schedule_downtime`, `reschedule_check`, `add_comment`, `remove_comment`,
+`remove_downtime`. These touch live production monitoring — never perform them
+proactively, and confirm host/service identity first.
+
+## Output
+Report: **platform**, **state/severity/scope**, **root cause** (the specific
+script condition or threshold that triggered it, with the config values), and a
+**3-tier action plan** (immediate / investigate / long-term).
diff --git a/src/agent/icinga_sdk.py b/src/agent/icinga_sdk.py
new file mode 100644
index 0000000..12fc9ac
--- /dev/null
+++ b/src/agent/icinga_sdk.py
@@ -0,0 +1,85 @@
+"""SDK invocation profile for the Icinga sub-agent.
+
+When Icinga runs on the Agent SDK (``agent.runtime: sdk``), it loads the
+``icinga-triage`` SKILL.md and talks to the **same backends** the legacy
+``query_icinga`` / GitHub tools use: the ``monitoring-mcp`` sidecar and the
+GitHub MCP server. Both are real MCP servers, so the SDK can consume them
+directly via ``ClaudeAgentOptions(mcp_servers=...)`` — no per-tool shim.
+
+This module builds the ``skills`` / ``allowed_tools`` / ``mcp_servers`` kwargs
+that :meth:`AgentSdkClient.complete` passes through. It is config-only and
+import-light (no SDK dependency) so it is unit-testable without the SDK; the
+exact MCP-server wire format is verified in-cluster.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+#: Sub-agent that has an SDK profile today (the Phase-2 pilot).
+ICINGA_AGENT = "icinga"
+ICINGA_SKILL = "icinga-triage"
+
+
+def sdk_profile_for(agent_type: str, config: Any) -> dict[str, Any]:
+    """Return the keyword arguments the runner splats into ``complete()`` for ``agent_type``.
+
+    Only Icinga (``ICINGA_AGENT``) has an SDK profile in Phase 2; every other agent
+    gets ``{}`` — no skill/tool specialization — so the runner stays generic.
+    """
+    if agent_type == ICINGA_AGENT:
+        return build_icinga_sdk_profile(config)
+    return {}
+
+
+def build_icinga_sdk_profile(config: Any) -> dict[str, Any]:
+    """Build the Icinga SDK profile: the skill + the Icinga/GitHub MCP servers.
+
+    Reads ``icinga.mcp_url`` (the monitoring-mcp sidecar, SSE) and ``github.mcp_url``
+    (+ ``github.token`` if present, for auth). A server is only added when its URL
+    is configured, so a partial config degrades gracefully.
+    """
+    icinga_cfg = _section(config, "icinga")
+    github_cfg = _section(config, "github")
+
+    mcp_servers: dict[str, Any] = {}
+
+    icinga_url = str(icinga_cfg.get("mcp_url", "") or "").strip()
+    if icinga_url:
+        mcp_servers["icinga"] = {"type": "sse", "url": icinga_url}
+
+    github_url = str(github_cfg.get("mcp_url", "") or "").strip()
+    if github_url:
+        server: dict[str, Any] = {"type": "http", "url": github_url}
+        token = str(github_cfg.get("token", "") or "").strip()
+        if token:
+            server["headers"] = {"Authorization": f"Bearer {token}"}
+        mcp_servers["github"] = server
+    # The skill is always requested; servers and their tool allow-list only when configured.
+    profile: dict[str, Any] = {"skills": [ICINGA_SKILL]}
+    if mcp_servers:
+        profile["mcp_servers"] = mcp_servers
+        profile["allowed_tools"] = _allowed_tools(mcp_servers)
+    # Log server names only: the GitHub entry may carry a bearer token in its headers.
+    logger.debug("Icinga SDK profile: skill=%s servers=%s", ICINGA_SKILL, list(mcp_servers))
+    return profile
+
+
+def _allowed_tools(mcp_servers: dict[str, Any]) -> list[str]:
+    """Return one server-level ``mcp__<server>`` allow-list entry per configured MCP server."""
+    return [f"mcp__{name}" for name in mcp_servers]
+
+
+def _section(config: Any, key: str) -> dict[str, Any]:
+    """Return config sub-section ``key`` as a plain dict (``{}`` if missing or not a mapping)."""
+    raw = config.get(key, {}) if hasattr(config, "get") else getattr(config, key, {})
+    if hasattr(raw, "to_dict"):
+        return raw.to_dict()
+    try:
+        return dict(raw or {})
+    except (TypeError, ValueError):
+        logger.warning("Config section %r is not a mapping; ignoring it", key)
+        return {}
diff --git a/src/agent/runner.py b/src/agent/runner.py
index 51785bc..49a6374 100644
--- a/src/agent/runner.py
+++ b/src/agent/runner.py
@@ -116,20 +116,24 @@ async def _run_via_legacy(
     async def _run_via_sdk(self, *, agent_type: str, task: str, context: dict | None) -> dict:
         """Run the task through the Claude Agent SDK adapter and normalize it.
 
-        Minimal by design (see module docstring): no per-agent skill/tool wiring
-        yet. The agent's system prompt is loaded the same way the legacy loop
-        loads it, so the two paths share prompt content for a fair benchmark.
+        The agent's system prompt is loaded the same way the legacy loop loads
+        it, so the two paths share prompt content for a fair benchmark. Per-agent
+        skill + MCP-tool specialization comes from :func:`sdk_profile_for`
+        (Icinga loads ``icinga-triage`` + the monitoring-mcp/GitHub servers;
+        other agents get an empty profile).
         """
+        from src.agent.icinga_sdk import sdk_profile_for
         from src.agent.system_prompt import get_agent_prompt
         from src.llm import AgentSdkClient, AgentSdkUnavailableError
 
         start = time.monotonic()
         system = get_agent_prompt(agent_type) or None
         prompt = _with_context(task, context)
+        profile = sdk_profile_for(agent_type, self._config)
 
         try:
             sdk_client = AgentSdkClient.from_config(self._config)
-            result = await sdk_client.complete(prompt=prompt, system=system)
+            result = await sdk_client.complete(prompt=prompt, system=system, **profile)
         except AgentSdkUnavailableError as exc:
             logger.warning("SDK runtime requested but unavailable: %s", exc)
             return _error_result(agent_type, str(exc), round(time.monotonic() - start, 1))
diff --git a/tests/test_icinga_sdk.py b/tests/test_icinga_sdk.py
new file mode 100644
index 0000000..9c8d775
--- /dev/null
+++ b/tests/test_icinga_sdk.py
@@ -0,0 +1,78 @@
+"""Tests for the Icinga SDK profile + the icinga-triage skill."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from src.agent.icinga_sdk import (
+    ICINGA_SKILL,
+    build_icinga_sdk_profile,
+    sdk_profile_for,
+)
+from src.skills.loader import SkillLoader, SkillSource
+
+# --------------------------------------------------------- profile builder
+
+
+def test_profile_both_servers() -> None:
+    profile = build_icinga_sdk_profile(
+        {
+            "icinga": {"mcp_url": "http://icinga-mcp:8080/sse"},
+            "github": {"mcp_url": "https://api.githubcopilot.com/mcp/"},
+        }
+    )
+    assert profile["skills"] == [ICINGA_SKILL]
+    assert profile["mcp_servers"]["icinga"] == {"type": "sse", "url": "http://icinga-mcp:8080/sse"}
+    assert profile["mcp_servers"]["github"]["url"] == "https://api.githubcopilot.com/mcp/"
+    assert set(profile["allowed_tools"]) == {"mcp__icinga", "mcp__github"}
+
+
+def test_profile_github_token_becomes_auth_header() -> None:
+    profile = build_icinga_sdk_profile(
+        {"github": {"mcp_url": "https://gh/mcp", "token": "ght_abc"}}
+    )
+    assert profile["mcp_servers"]["github"]["headers"] == {"Authorization": "Bearer ght_abc"}
+
+
+def test_profile_no_servers_only_skill() -> None:
+    # Skill still loads even with no MCP configured (degrades gracefully).
+    profile = build_icinga_sdk_profile({})
+    assert profile == {"skills": [ICINGA_SKILL]}
+    assert "mcp_servers" not in profile
+    assert "allowed_tools" not in profile
+
+
+def test_profile_only_icinga_configured() -> None:
+    profile = build_icinga_sdk_profile({"icinga": {"mcp_url": "http://i/sse"}})
+    assert profile["allowed_tools"] == ["mcp__icinga"]
+    assert "github" not in profile["mcp_servers"]
+
+
+# ------------------------------------------------------------ dispatch helper
+
+
+def test_sdk_profile_for_icinga() -> None:
+    assert sdk_profile_for("icinga", {})["skills"] == [ICINGA_SKILL]
+
+
+def test_sdk_profile_for_other_agent_is_empty() -> None:
+    assert sdk_profile_for("cost", {"icinga": {"mcp_url": "x"}}) == {}
+
+
+# ---------------------------------------------------------- the skill itself
+
+
+def test_icinga_triage_skill_loads_strict() -> None:
+    """The shipped icinga-triage SKILL.md must load with zero warnings."""
+    root = Path(__file__).resolve().parent.parent / "skills"
+    loader = SkillLoader([SkillSource(label="project", root=root)])
+    manifests = {m.name: m for m in loader.load_strict()}
+
+    assert "icinga-triage" in manifests
+    skill = manifests["icinga-triage"]
+    assert skill.warnings == ()
+    assert skill.parsec is not None
+    assert skill.parsec.domain == "icinga"
+    assert set(skill.allowed_tools) == {"query_icinga", "fetch_github_file", "search_github_repo"}
+    # description drives SDK auto-discovery — must mention the trigger
+    assert "alert" in (skill.description or "").lower()