diff --git a/.github/workflows/eval-baseline.yml b/.github/workflows/eval-baseline.yml
deleted file mode 100644
index 3cce4010..00000000
--- a/.github/workflows/eval-baseline.yml
+++ /dev/null
@@ -1,140 +0,0 @@
-# Runs all skill evals on push to main and commits baseline results.
-#
-# See docs/specs/2026-04-21-eval-skills-ci-workflow-design.md for full design.
-
-name: Eval Baseline
-
-on:
-  push:
-    branches: [main]
-    paths:
-      - 'plugins/sdlc-workflow/skills/**/*.md'
-      - 'evals/**/evals.json'
-
-jobs:
-  eval-baseline:
-    name: Create Eval Baselines
-    runs-on: ubuntu-latest
-    permissions:
-      contents: write
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-
-      - name: Authenticate to Google Cloud
-        uses: google-github-actions/auth@v3
-        with:
-          credentials_json: ${{ secrets.GCP_SA_KEY }}
-
-      - name: Install Claude Code
-        run: curl -fsSL https://claude.ai/install.sh | bash
-
-      - name: Discover all eval suites
-        id: discover
-        run: |
-          skills=""
-          for f in evals/*/evals.json; do
-            [ -f "$f" ] || continue
-            skill=$(basename "$(dirname "$f")")
-            skills="${skills:+${skills},}${skill}"
-          done
-          commit_hash=$(git rev-parse --short HEAD)
-          echo "skills=${skills}" >> "$GITHUB_OUTPUT"
-          echo "commit_hash=${commit_hash}" >> "$GITHUB_OUTPUT"
-          echo "Discovered eval suites: ${skills:-none}"
-          echo "Commit hash: ${commit_hash}"
-
-      - name: Run evals and create baselines
-        if: steps.discover.outputs.skills != ''
-        env:
-          CLAUDE_CODE_USE_VERTEX: "1"
-          CLOUD_ML_REGION: ${{ secrets.GCP_CLOUD_ML_REGION }}
-          ANTHROPIC_DEFAULT_SONNET_MODEL: "claude-sonnet-4-6"
-          ANTHROPIC_DEFAULT_OPUS_MODEL: "claude-opus-4-6"
-          ANTHROPIC_MODEL: "claude-opus-4-6"
-        run: |
-          COMMIT_HASH="${{ steps.discover.outputs.commit_hash }}"
-          IFS=',' read -ra SKILLS <<< "${{ steps.discover.outputs.skills }}"
-
-          for skill in "${SKILLS[@]}"; do
-            workspace="/tmp/${skill}-eval-baseline"
-            baseline_dir="evals/${skill}/baselines/${COMMIT_HASH}"
-
-            echo "=== Running evals for ${skill} ==="
-            echo "Workspace: ${workspace}"
-            echo "Baseline: ${baseline_dir}"
-
-            claude -p "$(cat <<PROMPT
-          Use the sdlc-workflow:run-evals skill for skill /${skill}.
-          Evals path: evals/${skill}/evals.json
-          Workspace: ${workspace}
-          PROMPT
-            )" --permission-mode dontAsk \
-              --allowedTools Read Write Bash Skill Agent Glob \
-              --verbose 2>&1 || {
-              echo "::warning::Eval run failed for ${skill} (exit $?)"
-              continue
-            }
-
-            echo "--- Workspace contents ---"
-            find "${workspace}" -type f 2>/dev/null | head -80 || true
-
-            # Copy results to baseline directory
-            mkdir -p "${baseline_dir}"
-            for eval_dir in "${workspace}"/eval-*/; do
-              [ -d "${eval_dir}" ] || continue
-              cp -r "${eval_dir}" "${baseline_dir}/"
-            done
-            for f in benchmark.json feedback.json summary.md; do
-              [ -f "${workspace}/${f}" ] && cp "${workspace}/${f}" "${baseline_dir}/"
-            done
-
-            # Update latest symlink
-            ln -sfn "${COMMIT_HASH}" "evals/${skill}/baselines/latest"
-
-            echo "--- Baseline contents ---"
-            find "${baseline_dir}" -type f 2>/dev/null || true
-          done
-
-      - name: Commit baselines
-        if: steps.discover.outputs.skills != ''
-        run: |
-          git config user.name "github-actions[bot]"
-          git config user.email "github-actions[bot]@users.noreply.github.com"
-          git add evals/*/baselines/
-          if git diff --cached --quiet; then
-            echo "No baseline changes to commit"
-            exit 0
-          fi
-          git commit -m "chore(evals): create baselines for ${{ steps.discover.outputs.commit_hash }}"
-          for attempt in 1 2 3; do
-            if git push; then
-              echo "Push succeeded on attempt ${attempt}"
-              exit 0
-            fi
-            echo "Push failed (attempt ${attempt}/3), rebasing on latest main..."
-            if ! git pull --rebase -X theirs origin main; then
-              # In rebase, --theirs = the commit being replayed (our local baseline commit).
-              # Resolve 'latest' symlink conflicts in favor of our commit since it's newer.
-              while IFS= read -r conflict; do
-                case "$conflict" in
-                  evals/*/baselines/latest)
-                    git checkout --theirs "$conflict"
-                    git add "$conflict"
-                    ;;
-                  *)
-                    echo "::error::Unexpected conflict in ${conflict} on attempt ${attempt}"
-                    git rebase --abort 2>/dev/null || true
-                    exit 1
-                    ;;
-                esac
-              done < <(git diff --name-only --diff-filter=U 2>/dev/null)
-              if ! git rebase --continue --no-edit 2>/dev/null; then
-                echo "::error::Rebase --continue failed on attempt ${attempt}"
-                git rebase --abort 2>/dev/null || true
-                exit 1
-              fi
-            fi
-          done
-          echo "::error::Push failed after 3 attempts"
-          exit 1
diff --git a/.github/workflows/eval-pr.yml b/.github/workflows/eval-pr.yml
deleted file mode 100644
index 989941a2..00000000
--- a/.github/workflows/eval-pr.yml
+++ /dev/null
@@ -1,152 +0,0 @@
-# Runs skill evals on PRs that modify skill or eval files, compares results
-# against the stored baseline, and posts a benchmark delta as a PR review.
-#
-# See docs/specs/2026-04-21-eval-skills-ci-workflow-design.md for full design.
-
-name: Eval PR
-
-on:
-  pull_request:
-    branches: [main]
-    paths:
-      - 'plugins/sdlc-workflow/skills/**/*.md'
-      - 'evals/**/evals.json'
-      - '.github/workflows/eval-pr.yml'
-
-jobs:
-  eval-pr:
-    name: Eval PR Changes
-    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      pull-requests: write
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Authenticate to Google Cloud
-        uses: google-github-actions/auth@v3
-        with:
-          credentials_json: ${{ secrets.GCP_SA_KEY }}
-
-      - name: Install Claude Code
-        run: curl -fsSL https://claude.ai/install.sh | bash
-
-      - name: Discover changed skills
-        id: discover
-        env:
-          BASE_SHA: ${{ github.event.pull_request.base.sha }}
-        run: |
-          changed_files=$(git diff --name-only "$BASE_SHA"...HEAD)
-          echo "Changed files:"
-          echo "$changed_files"
-
-          skills=""
-          for file in $changed_files; do
-            skill=""
-            if [[ "$file" =~ ^plugins/sdlc-workflow/skills/([^/]+)/ ]]; then
-              skill="${BASH_REMATCH[1]}"
-            elif [[ "$file" =~ ^evals/([^/]+)/ ]]; then
-              skill="${BASH_REMATCH[1]}"
-            fi
-
-            if [ -n "$skill" ] && [ -f "evals/${skill}/evals.json" ]; then
-              if [[ ! ",$skills," == *",$skill,"* ]]; then
-                skills="${skills:+${skills},}${skill}"
-              fi
-            fi
-          done
-
-          echo "skills=${skills}" >> "$GITHUB_OUTPUT"
-          echo "Discovered changed skills with evals: ${skills:-none}"
-
-      - name: Run PR evals
-        if: steps.discover.outputs.skills != ''
-        env:
-          CLAUDE_CODE_USE_VERTEX: "1"
-          CLOUD_ML_REGION: ${{ secrets.GCP_CLOUD_ML_REGION }}
-          ANTHROPIC_DEFAULT_SONNET_MODEL: "claude-sonnet-4-6"
-          ANTHROPIC_DEFAULT_OPUS_MODEL: "claude-opus-4-6"
-          ANTHROPIC_MODEL: "claude-opus-4-6"
-        run: |
-          IFS=',' read -ra SKILLS <<< "${{ steps.discover.outputs.skills }}"
-
-          for skill in "${SKILLS[@]}"; do
-            eval_count=$(jq '.evals | length' "evals/${skill}/evals.json")
-            workspace="/tmp/${skill}-eval-pr"
-            mkdir -p "${workspace}"
-
-            echo "=== Running PR evals for ${skill} (${eval_count} cases) ==="
-            echo "Workspace: ${workspace}"
-
-            claude -p "$(cat <<PROMPT
-          Use the sdlc-workflow:run-evals skill for skill /${skill}.
-          Evals path: evals/${skill}/evals.json
-          Workspace: ${workspace}
-          PROMPT
-            )" --permission-mode dontAsk \
-              --allowedTools Read Write Bash Skill Agent Glob \
-              --verbose 2>&1 || {
-              echo "::warning::PR eval run failed for ${skill} (exit $?)"
-              continue
-            }
-
-            echo "--- Workspace contents ---"
-            find "${workspace}" -type f 2>/dev/null | head -80 || true
-          done
-
-      - name: Post eval results review
-        if: steps.discover.outputs.skills != ''
-        env:
-          SKILLS_CSV: ${{ steps.discover.outputs.skills }}
-        uses: actions/github-script@v7
-        with:
-          script: |
-            const fs = require('fs');
-            const skills = process.env.SKILLS_CSV.split(',').filter(Boolean);
-
-            let body = '';
-            for (const skill of skills) {
-              const summaryPath = `/tmp/${skill}-eval-pr/summary.md`;
-              if (fs.existsSync(summaryPath)) {
-                body += fs.readFileSync(summaryPath, 'utf8');
-              } else {
-                body += `### ${skill}\n\n> No results produced. See workflow logs.\n\n`;
-              }
-            }
-
-            const pluginJson = JSON.parse(fs.readFileSync('plugins/sdlc-workflow/.claude-plugin/plugin.json', 'utf8'));
-            const version = pluginJson.version;
-
-            body += '---\n';
-            body += `*Generated by [sdlc-workflow/run-evals](plugins/sdlc-workflow/skills/run-evals) v${version}*\n`;
-
-            const { data: reviews } = await github.rest.pulls.listReviews({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              pull_number: context.issue.number
-            });
-            const marker = '## Eval Results';
-            const existing = reviews.find(r =>
-              r.user?.login === 'github-actions[bot]' && r.body?.startsWith(marker)
-            );
-
-            if (existing) {
-              await github.rest.pulls.updateReview({
-                owner: context.repo.owner,
-                repo: context.repo.repo,
-                pull_number: context.issue.number,
-                review_id: existing.id,
-                body
-              });
-            } else {
-              await github.rest.pulls.createReview({
-                owner: context.repo.owner,
-                repo: context.repo.repo,
-                pull_number: context.issue.number,
-                event: 'COMMENT',
-                body
-              });
-            }
diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml
new file mode 100644
index 00000000..d7ea8f51
--- /dev/null
+++ b/.github/workflows/eval.yml
@@ -0,0 +1,35 @@
+name: Skill Evals
+
+on:
+  pull_request:  # no `branches` filter: PRs against any base branch qualify
+    paths:
+      - 'plugins/sdlc-workflow/skills/**/*.md'
+      - 'evals/**/evals.json'
+  push:
+    branches: [main]  # pushes are only evaluated on main
+    paths:
+      - 'plugins/sdlc-workflow/skills/**/*.md'
+      - 'evals/**/evals.json'
+
+jobs:
+  eval:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write  # README: baselines are committed on push to main
+      pull-requests: write  # README: results are posted as a PR review
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # fetch full history instead of a shallow clone
+
+      - uses: google-github-actions/auth@v3
+        with:
+          credentials_json: ${{ secrets.GCP_SA_KEY }}  # repository secret
+
+      - uses: mrizzi/skill-litmus@v0.1.5  # NOTE(review): tag pin; consider a commit SHA
+        env:
+          CLAUDE_CODE_USE_VERTEX: "1"  # run Claude Code against Vertex AI
+          CLOUD_ML_REGION: ${{ secrets.GCP_CLOUD_ML_REGION }}  # Vertex region
+          ANTHROPIC_DEFAULT_SONNET_MODEL: "claude-sonnet-4-6"
+          ANTHROPIC_DEFAULT_OPUS_MODEL: "claude-opus-4-6"
+          ANTHROPIC_MODEL: "claude-opus-4-6"  # model used by default
diff --git a/evals/README.md b/evals/README.md
index 51e030a6..2b5dc267 100644
--- a/evals/README.md
+++ b/evals/README.md
@@ -7,7 +7,7 @@ baselines.
 For architectural decisions and known limitations, see the
 [design spec](../docs/specs/2026-04-16-skill-eval-framework-design.md).
 
-## Why a custom eval skill
+## Why skill-litmus
 
 Anthropic's `skill-creator` includes eval capabilities, but it is designed
 for interactive skill development — running both with-skill and
@@ -16,16 +16,17 @@ iterating through improvement cycles with human review.
 
 CI evaluation has different requirements: deterministic output paths,
 single-configuration runs, no browser, and a summary that can be posted
-as a PR comment or displayed in a terminal. Rather than fighting
-skill-creator's interactive assumptions, `run-evals`
-(`/sdlc-workflow:run-evals`) is a purpose-built skill that:
+as a PR comment or displayed in a terminal.
+[skill-litmus](https://github.com/mrizzi/skill-litmus) is a standalone
+plugin (`/skill-litmus:run-evals`) and GitHub Action that provides a
+shell-driven eval engine. It:
 
 - Produces results in a fixed directory layout — no variation between runs
 - Runs only the current skill version (baselines are stored separately)
 - Grades assertions and aggregates metrics into `benchmark.json`
-- Renders a Markdown summary via `render_summary.py`, comparing against
-  stored baselines when available
+- Renders a Markdown summary comparing against stored baselines when available
 - Works identically in interactive and headless (`claude -p`) modes
+- Provides a reusable GitHub Action for CI integration
 
 ## Directory structure
 
@@ -59,6 +60,7 @@ Each skill defines its test cases in `evals.json`:
 ```json
 {
   "skill_name": "plan-feature",
+  "plugin": "sdlc-workflow",
   "evals": [
     {
       "id": 1,
@@ -77,6 +79,7 @@ Each skill defines its test cases in `evals.json`:
 | Field | Type | Description |
 |-------|------|-------------|
 | `skill_name` | string | Name of the skill being evaluated |
+| `plugin` | string | Plugin that owns the skill (e.g., `sdlc-workflow`) |
 | `evals[].id` | number | Unique identifier for the test case |
 | `evals[].prompt` | string | The prompt sent to the skill agent |
 | `evals[].expected_output` | string | Natural language description of expected behavior |
@@ -89,13 +92,13 @@ Start a Claude Code session in the repo root and invoke the `run-evals`
 skill:
 
 ```
-/sdlc-workflow:run-evals Run evals for plan-feature.
+/skill-litmus:run-evals Run evals for plan-feature.
 Evals path: evals/plan-feature/evals.json
 Workspace: /tmp/plan-feature-eval
 ```
 
 The skill:
-1. Reads `evals.json` and spawns a subagent per test case
+1. Reads `evals.json` and runs each test case via `claude -p`
 2. Grades each run's outputs against the assertions
 3. Aggregates results into `benchmark.json`
 4. Compares against the stored baseline at `evals/plan-feature/baselines/latest/`
@@ -137,35 +140,24 @@ Every run produces this exact layout:
 | `time_seconds` | Large increases may indicate the skill is doing unnecessary work. |
 | `tokens` | Token usage proxy for cost. Compare across runs. |
 
-## CI workflows
+## CI workflow
 
-Two GitHub Actions workflows automate eval execution. Neither gates
-merges — they report results only.
+A single GitHub Actions workflow (`eval.yml`) automates eval execution
+using the [`mrizzi/skill-litmus`](https://github.com/mrizzi/skill-litmus)
+composite action. It does not gate merges — it reports results only.
 
-### eval-pr.yml (pull request)
+The workflow triggers when files matching
+`plugins/sdlc-workflow/skills/**/*.md` or `evals/**/evals.json` are
+modified, on both pull requests and pushes to main.
 
-Triggers when a PR modifies `plugins/sdlc-workflow/skills/**/*.md` or
-`evals/**/evals.json`.
+- **On pull request** — discovers changed skills, runs their evals,
+  compares against stored baselines, and posts a PR review with the
+  results.
+- **On push to main** — runs all eval suites and commits baseline
+  results to `evals/<skill>/baselines/<hash>/`, updating the `latest`
+  symlink.
 
-1. **Discover changed skills** — `git diff` maps changed files to eval suites
-2. **Run evals** — invokes `run-evals` via `claude -p` for each changed skill
-3. **Post PR comment** — reads `summary.md` from the workspace and posts it
-
-The skill's `render_summary.py` script handles baseline comparison. If
-`evals/<skill>/baselines/latest/` exists, the summary includes a delta
-line. If not, it shows raw results only.
-
-### eval-baseline.yml (push to main)
-
-Triggers when skill or eval files are merged to main.
-
-1. **Discover all eval suites** — runs all skills, not just changed ones
-2. **Run evals** — invokes `run-evals` via `claude -p` for each skill
-3. **Store baselines** — commits results to `evals/<skill>/baselines/<hash>/`
-4. **Update `latest` symlink** — points to the new baseline
-
-This ensures every merge to main has a complete baseline that subsequent
-PRs can compare against.
+Both modes are handled automatically by the skill-litmus action.
 
 ## Baseline strategy
 
@@ -175,7 +167,7 @@ the baseline via the symlink — they don't need to know the exact commit
 hash.
 
 If main receives commits that don't touch skill or eval files, the
-`eval-baseline` workflow doesn't trigger, but `latest` still points to
+`eval.yml` workflow doesn't trigger, but `latest` still points to
 the most recent valid baseline.
 
 ## Adding evals for a new skill
@@ -220,7 +212,7 @@ network dependencies:
        └───────────────────────────────────────────────────────────┘
 ```
 
-1. **Run** — invoke `/sdlc-workflow:run-evals`
+1. **Run** — invoke `/skill-litmus:run-evals`
 2. **Grade** — the skill grades assertions and produces `grading.json`
 3. **Review** — inspect outputs and record feedback in `feedback.json`
 4. **Improve** — edit the skill's `SKILL.md` based on failures and feedback
diff --git a/evals/define-feature/README.md b/evals/define-feature/README.md
index a4e8b0a1..059edb4f 100644
--- a/evals/define-feature/README.md
+++ b/evals/define-feature/README.md
@@ -58,7 +58,7 @@ Mock CLAUDE.md files representing different project states:
 ## Running
 
 ```
-/sdlc-workflow:run-evals Run evals for define-feature.
+/skill-litmus:run-evals Run evals for define-feature.
 Evals path: evals/define-feature/evals.json
 Workspace: /tmp/define-feature-eval
 ```
diff --git a/evals/define-feature/evals.json b/evals/define-feature/evals.json
index 0409c9b6..c28ac40f 100644
--- a/evals/define-feature/evals.json
+++ b/evals/define-feature/evals.json
@@ -1,5 +1,6 @@
 {
   "skill_name": "define-feature",
+  "plugin": "sdlc-workflow",
   "evals": [
     {
       "id": 1,
diff --git a/evals/implement-task/README.md b/evals/implement-task/README.md
index fe49cbea..14bc4b97 100644
--- a/evals/implement-task/README.md
+++ b/evals/implement-task/README.md
@@ -62,7 +62,7 @@ during Step 0 validation.
 ## Running
 
 ```
-/sdlc-workflow:run-evals Run evals for implement-task.
+/skill-litmus:run-evals Run evals for implement-task.
 Evals path: evals/implement-task/evals.json
 Workspace: /tmp/implement-task-eval
 ```
diff --git a/evals/implement-task/evals.json b/evals/implement-task/evals.json
index f27eb80a..77c2b203 100644
--- a/evals/implement-task/evals.json
+++ b/evals/implement-task/evals.json
@@ -1,5 +1,6 @@
 {
   "skill_name": "implement-task",
+  "plugin": "sdlc-workflow",
   "evals": [
     {
       "id": 1,
diff --git a/evals/plan-feature/README.md b/evals/plan-feature/README.md
index c5aa5c5d..3eb01077 100644
--- a/evals/plan-feature/README.md
+++ b/evals/plan-feature/README.md
@@ -37,7 +37,7 @@ multi-repo test case.
 ## Running
 
 ```
-/sdlc-workflow:run-evals Run evals for plan-feature.
+/skill-litmus:run-evals Run evals for plan-feature.
 Evals path: evals/plan-feature/evals.json
 Workspace: /tmp/plan-feature-eval
 ```
diff --git a/evals/plan-feature/evals.json b/evals/plan-feature/evals.json
index e930f536..1a5a79e4 100644
--- a/evals/plan-feature/evals.json
+++ b/evals/plan-feature/evals.json
@@ -1,5 +1,6 @@
 {
   "skill_name": "plan-feature",
+  "plugin": "sdlc-workflow",
   "evals": [
     {
       "id": 1,
diff --git a/evals/setup/README.md b/evals/setup/README.md
index 7c32dd41..1965b1ee 100644
--- a/evals/setup/README.md
+++ b/evals/setup/README.md
@@ -60,7 +60,7 @@ Mock MCP tool listings simulating tool discovery output:
 ## Running
 
 ```
-/sdlc-workflow:run-evals Run evals for setup.
+/skill-litmus:run-evals Run evals for setup.
 Evals path: evals/setup/evals.json
 Workspace: /tmp/setup-eval
 ```
diff --git a/evals/setup/evals.json b/evals/setup/evals.json
index ac103674..a2400d85 100644
--- a/evals/setup/evals.json
+++ b/evals/setup/evals.json
@@ -1,5 +1,6 @@
 {
   "skill_name": "setup",
+  "plugin": "sdlc-workflow",
   "evals": [
     {
       "id": 1,
diff --git a/evals/verify-pr/README.md b/evals/verify-pr/README.md
index 4e664ede..8115e711 100644
--- a/evals/verify-pr/README.md
+++ b/evals/verify-pr/README.md
@@ -59,7 +59,7 @@ from `docs/constraints.md`:
 ## Running
 
 ```
-/sdlc-workflow:run-evals Run evals for verify-pr.
+/skill-litmus:run-evals Run evals for verify-pr.
 Evals path: evals/verify-pr/evals.json
 Workspace: /tmp/verify-pr-eval
 ```
diff --git a/evals/verify-pr/evals.json b/evals/verify-pr/evals.json
index 845f5c23..50d8f410 100644
--- a/evals/verify-pr/evals.json
+++ b/evals/verify-pr/evals.json
@@ -1,5 +1,6 @@
 {
   "skill_name": "verify-pr",
+  "plugin": "sdlc-workflow",
   "evals": [
     {
       "id": 1,
diff --git a/plugins/sdlc-workflow/skills/run-evals/SKILL.md b/plugins/sdlc-workflow/skills/run-evals/SKILL.md
deleted file mode 100644
index 6015f423..00000000
--- a/plugins/sdlc-workflow/skills/run-evals/SKILL.md
+++ /dev/null
@@ -1,198 +0,0 @@
----
-name: run-evals
-description: Run skill evals with deterministic output layout and CI-compatible results. Grades assertions, aggregates benchmark.json, and renders summary.md via Python scripts. NOT skill-creator — this skill produces fixed output paths for automated pipelines. Triggers on "run evals", "eval <skill-name>", "benchmark", or "grade evals".
----
-
-# Run Evals
-
-Run eval cases for a skill, grade each against assertions, and produce
-structured results in a deterministic directory layout.
-
-## Inputs
-
-The user provides:
-- **Skill name** — the slash-command name of the skill to test (e.g., `plan-feature`)
-- **Evals path** — path to the `evals.json` file (e.g., `evals/plan-feature/evals.json`)
-- **Workspace** — directory where results are written
-
-## Output Structure
-
-Every run produces this exact layout — no variation:
-
-```
-<workspace>/
-├── benchmark.json
-├── feedback.json
-├── summary.md
-├── eval-1/
-│   ├── grading.json
-│   ├── timing.json
-│   └── outputs/
-│       └── (skill outputs)
-├── eval-2/
-│   └── ...
-└── eval-N/
-    └── ...
-```
-
-## Process
-
-### Step 1 — Read evals.json
-
-Read the evals file and extract:
-- `skill_name` — the skill being evaluated
-- `evals[]` — array of test cases, each with `id`, `prompt`, `expected_output`,
-  `files` (optional), and `assertions`
-
-### Step 2 — Execute each eval case
-
-For each eval in `evals[]`, spawn a subagent with this prompt:
-
-```
-You are executing an eval for the /sdlc-workflow:<skill-name> skill.
-
-Task: <eval.prompt>
-
-<if eval.files>
-Input files (read these before starting):
-<for each file in eval.files>
-- <evals_dir>/<file>
-</for>
-</if>
-
-Write all outputs to: <workspace>/eval-<eval.id>/outputs/
-
-Important:
-- Invoke the /sdlc-workflow:<skill-name> skill via the Skill tool to process this task
-- Write every output file to the outputs/ directory
-- Do not interact with external services (Jira, Figma, etc.) — write to files instead
-```
-
-**Parallelism:** Spawn all eval subagents in a single turn so they run
-concurrently. Do not wait for one eval to complete before starting the
-next — the eval cases are independent.
-
-When each subagent completes, capture `total_tokens` and `duration_ms`
-from the task completion notification immediately. Write to
-`<workspace>/eval-<eval.id>/timing.json`:
-
-```json
-{
-  "total_tokens": <value>,
-  "duration_ms": <value>
-}
-```
-
-### Step 3 — Grade each eval case
-
-As each eval completes, spawn a grader subagent. Grading can overlap
-with execution — grade each eval as it finishes rather than waiting for
-all to complete.
-
-Grader prompt:
-
-```
-Grade the outputs of an eval run against these assertions.
-
-Assertions:
-<for each assertion in eval.assertions>
-- <assertion>
-</for>
-
-Outputs directory: <workspace>/eval-<eval.id>/outputs/
-
-## Grading rules
-
-1. Read every file in the outputs directory. Open and read file
-   contents — do not judge based on filenames alone.
-2. Burden of proof is on PASS. Default to FAIL. Only mark PASS when
-   you find specific, concrete evidence in the output files that
-   satisfies the assertion.
-3. No partial credit. Each assertion is binary: PASS or FAIL.
-   "Mostly correct" or "partially addressed" is FAIL.
-4. Cite specific evidence. The evidence field must quote or reference
-   exact content from output files — file paths, line excerpts, counts.
-   Never write vague evidence like "the output generally addresses this."
-5. Check structure AND content. Verify both that expected sections/files
-   exist AND that their content satisfies the assertion.
-6. Contradictory evidence means FAIL. If some outputs support the
-   assertion but others contradict it, the assertion fails.
-
-Write results to: <workspace>/eval-<eval.id>/grading.json
-
-Use this exact JSON structure:
-{
-  "assertion_results": [
-    {
-      "text": "<assertion text>",
-      "passed": true/false,
-      "evidence": "<specific evidence from output files>"
-    }
-  ],
-  "summary": {
-    "passed": <count>,
-    "failed": <count>,
-    "total": <count>,
-    "pass_rate": <0.0 to 1.0>
-  }
-}
-```
-
-### Step 4 — Aggregate into benchmark.json
-
-After all evals are graded, run the aggregation script:
-
-```bash
-python3 <skill-dir>/scripts/aggregate_benchmark.py --results <workspace>
-```
-
-The script reads all `eval-N/grading.json` and `eval-N/timing.json`
-files, computes mean and stddev for pass rate, duration, and token usage,
-and writes `<workspace>/benchmark.json`.
-
-### Step 5 — Create feedback.json placeholder
-
-Write `<workspace>/feedback.json` with empty strings for each eval:
-
-```json
-{
-  "eval-1": "",
-  "eval-2": "",
-  ...
-}
-```
-
-### Step 6 — Render summary
-
-Determine the baseline path: `evals/<skill-name>/baselines/latest/`.
-
-Run the render script, passing the skill name so the heading identifies
-which skill the results belong to:
-
-```bash
-python3 <skill-dir>/scripts/render_summary.py \
-  --results <workspace> \
-  --baseline <baseline-path> \
-  --skill <skill-name>
-```
-
-If the baseline path does not exist, omit `--baseline` — the script
-renders results without a comparison.
-
-The script writes `<workspace>/summary.md`. Display its contents to the
-user.
-
-## Rules
-
-- Write all outputs to the exact paths specified. No intermediate directories,
-  no configuration-named subdirectories.
-- Every eval case gets its own `eval-<id>/` directory using the `id` from evals.json.
-- `grading.json` uses `assertion_results` (not `expectations`) with fields
-  `text`, `passed`, `evidence`.
-- Timing data must be captured from the task completion notification — it cannot
-  be recovered after the fact.
-- Do not launch an eval viewer or browser.
-- Do not run baseline comparisons — only run the current skill version.
-  Baseline comparison is handled by the render script in Step 6.
-- `<skill-dir>` is the directory containing this SKILL.md — resolve it from
-  the absolute path shown in the skill invocation header.
diff --git a/plugins/sdlc-workflow/skills/run-evals/scripts/aggregate_benchmark.py b/plugins/sdlc-workflow/skills/run-evals/scripts/aggregate_benchmark.py
deleted file mode 100644
index cba668ca..00000000
--- a/plugins/sdlc-workflow/skills/run-evals/scripts/aggregate_benchmark.py
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/usr/bin/env python3
-"""Aggregate eval grading and timing results into benchmark.json."""
-
-import argparse
-import json
-import statistics
-import sys
-from pathlib import Path
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Aggregate eval results into benchmark.json"
-    )
-    parser.add_argument(
-        "--results", type=Path, required=True, help="Workspace with eval results"
-    )
-    args = parser.parse_args()
-
-    if not args.results.exists():
-        print(f"Results directory not found: {args.results}", file=sys.stderr)
-        sys.exit(1)
-
-    pass_rates = []
-    time_seconds = []
-    tokens = []
-
-    for eval_dir in sorted(args.results.glob("eval-*")):
-        grading_path = eval_dir / "grading.json"
-        if not grading_path.exists():
-            continue
-
-        grading = json.loads(grading_path.read_text())
-        rate = grading.get("summary", {}).get("pass_rate")
-        if rate is not None:
-            pass_rates.append(rate)
-
-        timing_path = eval_dir / "timing.json"
-        if timing_path.exists():
-            timing = json.loads(timing_path.read_text())
-            duration_ms = timing.get("duration_ms")
-            if duration_ms is not None:
-                time_seconds.append(duration_ms / 1000)
-            total_tokens = timing.get("total_tokens")
-            if total_tokens is not None:
-                tokens.append(total_tokens)
-
-    benchmark = {
-        "run_summary": {
-            "pass_rate": _stats(pass_rates),
-            "time_seconds": _stats(time_seconds),
-            "tokens": _stats(tokens),
-        }
-    }
-
-    output_path = args.results / "benchmark.json"
-    output_path.write_text(json.dumps(benchmark, indent=2) + "\n")
-    print(f"Written: {output_path}")
-
-
-def _stats(values: list[float]) -> dict:
-    if not values:
-        return {"mean": 0.0, "stddev": 0.0}
-    mean = statistics.mean(values)
-    stddev = statistics.pstdev(values) if len(values) > 1 else 0.0
-    return {"mean": round(mean, 2), "stddev": round(stddev, 2)}
-
-
-if __name__ == "__main__":
-    main()
diff --git a/plugins/sdlc-workflow/skills/run-evals/scripts/render_summary.py b/plugins/sdlc-workflow/skills/run-evals/scripts/render_summary.py
deleted file mode 100644
index a7e4368e..00000000
--- a/plugins/sdlc-workflow/skills/run-evals/scripts/render_summary.py
+++ /dev/null
@@ -1,140 +0,0 @@
-#!/usr/bin/env python3
-"""Render eval results and optional baseline comparison as Markdown."""
-
-import argparse
-import json
-import sys
-from pathlib import Path
-
-
-def load_grading_files(results_dir: Path) -> list[dict]:
-    """Load all eval-N/grading.json files, sorted by eval ID."""
-    gradings = []
-    for eval_dir in sorted(results_dir.glob("eval-*")):
-        grading_path = eval_dir / "grading.json"
-        if not grading_path.exists():
-            continue
-        grading = json.loads(grading_path.read_text())
-        grading["_eval_id"] = eval_dir.name
-        gradings.append(grading)
-    return gradings
-
-
-def render(results_dir: Path, baseline_dir: Path | None, skill: str | None = None) -> str:
-    gradings = load_grading_files(results_dir)
-
-    if not gradings:
-        return "> No eval results found.\n"
-
-    heading = f"## Eval Results: {skill}" if skill else "## Eval Results"
-    lines = [heading, ""]
-
-    # Per-eval table
-    lines.append("| Eval | Passed | Failed | Pass Rate |")
-    lines.append("|------|--------|--------|-----------|")
-
-    for g in gradings:
-        eval_id = g["_eval_id"]
-        s = g.get("summary", {})
-        passed = s.get("passed", 0)
-        failed = s.get("failed", 0)
-        total = s.get("total", 0)
-        rate = s.get("pass_rate", 0)
-        lines.append(
-            f"| {eval_id} | {passed}/{total} | {failed} | {rate * 100:.0f}% |"
-        )
-
-    lines.append("")
-
-    # Aggregate from benchmark.json if present
-    benchmark_path = results_dir / "benchmark.json"
-    if benchmark_path.exists():
-        benchmark = json.loads(benchmark_path.read_text())
-        rs = benchmark.get("run_summary", {})
-        pr_mean = rs.get("pass_rate", {}).get("mean")
-        tokens_mean = rs.get("tokens", {}).get("mean")
-        time_mean = rs.get("time_seconds", {}).get("mean")
-
-        parts = []
-        if pr_mean is not None:
-            parts.append(f"**Pass rate:** {pr_mean * 100:.0f}%")
-        if tokens_mean is not None:
-            parts.append(f"**Tokens:** {int(tokens_mean):,}")
-        if time_mean is not None:
-            parts.append(f"**Duration:** {time_mean:.0f}s")
-        if parts:
-            lines.append(" · ".join(parts))
-            lines.append("")
-
-    # Baseline comparison
-    if baseline_dir and baseline_dir.exists():
-        baseline_benchmark = baseline_dir / "benchmark.json"
-        if baseline_benchmark.exists():
-            baseline = json.loads(baseline_benchmark.read_text())
-            brs = baseline.get("run_summary", {})
-            b_pr = brs.get("pass_rate", {}).get("mean")
-            b_tokens = brs.get("tokens", {}).get("mean")
-            b_time = brs.get("time_seconds", {}).get("mean")
-
-            baseline_label = baseline_dir.name
-            if baseline_dir.is_symlink():
-                baseline_label = baseline_dir.resolve().name
-
-            parts = []
-            if b_pr is not None:
-                parts.append(f"{b_pr * 100:.0f}%")
-            if b_tokens is not None:
-                parts.append(f"{int(b_tokens):,} tokens")
-            if b_time is not None:
-                parts.append(f"{b_time:.0f}s")
-            if parts:
-                lines.append(
-                    f"**Baseline** (`{baseline_label}`): " + " · ".join(parts)
-                )
-                lines.append("")
-
-    return "\n".join(lines) + "\n"
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Render eval results as Markdown"
-    )
-    parser.add_argument(
-        "--results", type=Path, required=True, help="Workspace with eval results"
-    )
-    parser.add_argument(
-        "--baseline", type=Path, default=None, help="Baseline directory"
-    )
-    parser.add_argument(
-        "--skill",
-        type=str,
-        default=None,
-        help="Skill name to include in the heading",
-    )
-    parser.add_argument(
-        "--output",
-        type=Path,
-        default=None,
-        help="Output file (default: <results>/summary.md)",
-    )
-    args = parser.parse_args()
-
-    if not args.results.exists():
-        print(f"Results directory not found: {args.results}", file=sys.stderr)
-        sys.exit(1)
-
-    baseline = args.baseline
-    if baseline and not baseline.exists():
-        print(f"Warning: baseline not found: {baseline}", file=sys.stderr)
-        baseline = None
-
-    md = render(args.results, baseline, args.skill)
-
-    output_path = args.output or (args.results / "summary.md")
-    output_path.write_text(md)
-    print(md)
-
-
-if __name__ == "__main__":
-    main()