diff --git a/.github/workflows/eval-baseline.yml b/.github/workflows/eval-baseline.yml deleted file mode 100644 index 3cce4010..00000000 --- a/.github/workflows/eval-baseline.yml +++ /dev/null @@ -1,140 +0,0 @@ -# Runs all skill evals on push to main and commits baseline results. -# -# See docs/specs/2026-04-21-eval-skills-ci-workflow-design.md for full design. - -name: Eval Baseline - -on: - push: - branches: [main] - paths: - - 'plugins/sdlc-workflow/skills/**/*.md' - - 'evals/**/evals.json' - -jobs: - eval-baseline: - name: Create Eval Baselines - runs-on: ubuntu-latest - permissions: - contents: write - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Authenticate to Google Cloud - uses: google-github-actions/auth@v3 - with: - credentials_json: ${{ secrets.GCP_SA_KEY }} - - - name: Install Claude Code - run: curl -fsSL https://claude.ai/install.sh | bash - - - name: Discover all eval suites - id: discover - run: | - skills="" - for f in evals/*/evals.json; do - [ -f "$f" ] || continue - skill=$(basename "$(dirname "$f")") - skills="${skills:+${skills},}${skill}" - done - commit_hash=$(git rev-parse --short HEAD) - echo "skills=${skills}" >> "$GITHUB_OUTPUT" - echo "commit_hash=${commit_hash}" >> "$GITHUB_OUTPUT" - echo "Discovered eval suites: ${skills:-none}" - echo "Commit hash: ${commit_hash}" - - - name: Run evals and create baselines - if: steps.discover.outputs.skills != '' - env: - CLAUDE_CODE_USE_VERTEX: "1" - CLOUD_ML_REGION: ${{ secrets.GCP_CLOUD_ML_REGION }} - ANTHROPIC_DEFAULT_SONNET_MODEL: "claude-sonnet-4-6" - ANTHROPIC_DEFAULT_OPUS_MODEL: "claude-opus-4-6" - ANTHROPIC_MODEL: "claude-opus-4-6" - run: | - COMMIT_HASH="${{ steps.discover.outputs.commit_hash }}" - IFS=',' read -ra SKILLS <<< "${{ steps.discover.outputs.skills }}" - - for skill in "${SKILLS[@]}"; do - workspace="/tmp/${skill}-eval-baseline" - baseline_dir="evals/${skill}/baselines/${COMMIT_HASH}" - - echo "=== Running evals for ${skill} ===" - echo "Workspace: ${workspace}" - echo "Baseline: ${baseline_dir}" - - claude -p "$(cat <&1 || { - echo "::warning::Eval run failed for ${skill} (exit $?)" - continue - } - - echo "--- Workspace contents ---" - find "${workspace}" -type f 2>/dev/null | head -80 || true - - # Copy results to baseline directory - mkdir -p "${baseline_dir}" - for eval_dir in "${workspace}"/eval-*/; do - [ -d "${eval_dir}" ] || continue - cp -r "${eval_dir}" "${baseline_dir}/" - done - for f in benchmark.json feedback.json summary.md; do - [ -f "${workspace}/${f}" ] && cp "${workspace}/${f}" "${baseline_dir}/" - done - - # Update latest symlink - ln -sfn "${COMMIT_HASH}" "evals/${skill}/baselines/latest" - - echo "--- Baseline contents ---" - find "${baseline_dir}" -type f 2>/dev/null || true - done - - - name: Commit baselines - if: steps.discover.outputs.skills != '' - run: | - git config user.name "github-actions[bot]" - git config user.email "github-actions[bot]@users.noreply.github.com" - git add evals/*/baselines/ - if git diff --cached --quiet; then - echo "No baseline changes to commit" - exit 0 - fi - git commit -m "chore(evals): create baselines for ${{ steps.discover.outputs.commit_hash }}" - for attempt in 1 2 3; do - if git push; then - echo "Push succeeded on attempt ${attempt}" - exit 0 - fi - echo "Push failed (attempt ${attempt}/3), rebasing on latest main..." - if ! git pull --rebase -X theirs origin main; then - # In rebase, --theirs = the commit being replayed (our local baseline commit). - # Resolve 'latest' symlink conflicts in favor of our commit since it's newer. - while IFS= read -r conflict; do - case "$conflict" in - evals/*/baselines/latest) - git checkout --theirs "$conflict" - git add "$conflict" - ;; - *) - echo "::error::Unexpected conflict in ${conflict} on attempt ${attempt}" - git rebase --abort 2>/dev/null || true - exit 1 - ;; - esac - done < <(git diff --name-only --diff-filter=U 2>/dev/null) - if ! git rebase --continue --no-edit 2>/dev/null; then - echo "::error::Rebase --continue failed on attempt ${attempt}" - git rebase --abort 2>/dev/null || true - exit 1 - fi - fi - done - echo "::error::Push failed after 3 attempts" - exit 1 diff --git a/.github/workflows/eval-pr.yml b/.github/workflows/eval-pr.yml deleted file mode 100644 index 989941a2..00000000 --- a/.github/workflows/eval-pr.yml +++ /dev/null @@ -1,152 +0,0 @@ -# Runs skill evals on PRs that modify skill or eval files, compares results -# against the stored baseline, and posts a benchmark delta as a PR review. -# -# See docs/specs/2026-04-21-eval-skills-ci-workflow-design.md for full design. - -name: Eval PR - -on: - pull_request: - branches: [main] - paths: - - 'plugins/sdlc-workflow/skills/**/*.md' - - 'evals/**/evals.json' - - '.github/workflows/eval-pr.yml' - -jobs: - eval-pr: - name: Eval PR Changes - runs-on: ubuntu-latest - permissions: - contents: read - pull-requests: write - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Authenticate to Google Cloud - uses: google-github-actions/auth@v3 - with: - credentials_json: ${{ secrets.GCP_SA_KEY }} - - - name: Install Claude Code - run: curl -fsSL https://claude.ai/install.sh | bash - - - name: Discover changed skills - id: discover - env: - BASE_SHA: ${{ github.event.pull_request.base.sha }} - run: | - changed_files=$(git diff --name-only "$BASE_SHA"...HEAD) - echo "Changed files:" - echo "$changed_files" - - skills="" - for file in $changed_files; do - skill="" - if [[ "$file" =~ ^plugins/sdlc-workflow/skills/([^/]+)/ ]]; then - skill="${BASH_REMATCH[1]}" - elif [[ "$file" =~ ^evals/([^/]+)/ ]]; then - skill="${BASH_REMATCH[1]}" - fi - - if [ -n "$skill" ] && [ -f "evals/${skill}/evals.json" ]; then - if [[ ! ",$skills," == *",$skill,"* ]]; then - skills="${skills:+${skills},}${skill}" - fi - fi - done - - echo "skills=${skills}" >> "$GITHUB_OUTPUT" - echo "Discovered changed skills with evals: ${skills:-none}" - - - name: Run PR evals - if: steps.discover.outputs.skills != '' - env: - CLAUDE_CODE_USE_VERTEX: "1" - CLOUD_ML_REGION: ${{ secrets.GCP_CLOUD_ML_REGION }} - ANTHROPIC_DEFAULT_SONNET_MODEL: "claude-sonnet-4-6" - ANTHROPIC_DEFAULT_OPUS_MODEL: "claude-opus-4-6" - ANTHROPIC_MODEL: "claude-opus-4-6" - run: | - IFS=',' read -ra SKILLS <<< "${{ steps.discover.outputs.skills }}" - - for skill in "${SKILLS[@]}"; do - eval_count=$(jq '.evals | length' "evals/${skill}/evals.json") - workspace="/tmp/${skill}-eval-pr" - mkdir -p "${workspace}" - - echo "=== Running PR evals for ${skill} (${eval_count} cases) ===" - echo "Workspace: ${workspace}" - - claude -p "$(cat <&1 || { - echo "::warning::PR eval run failed for ${skill} (exit $?)" - continue - } - - echo "--- Workspace contents ---" - find "${workspace}" -type f 2>/dev/null | head -80 || true - done - - - name: Post eval results review - if: steps.discover.outputs.skills != '' - env: - SKILLS_CSV: ${{ steps.discover.outputs.skills }} - uses: actions/github-script@v7 - with: - script: | - const fs = require('fs'); - const skills = process.env.SKILLS_CSV.split(',').filter(Boolean); - - let body = ''; - for (const skill of skills) { - const summaryPath = `/tmp/${skill}-eval-pr/summary.md`; - if (fs.existsSync(summaryPath)) { - body += fs.readFileSync(summaryPath, 'utf8'); - } else { - body += `### ${skill}\n\n> No results produced. See workflow logs.\n\n`; - } - } - - const pluginJson = JSON.parse(fs.readFileSync('plugins/sdlc-workflow/.claude-plugin/plugin.json', 'utf8')); - const version = pluginJson.version; - - body += '---\n'; - body += `*Generated by [sdlc-workflow/run-evals](plugins/sdlc-workflow/skills/run-evals) v${version}*\n`; - - const { data: reviews } = await github.rest.pulls.listReviews({ - owner: context.repo.owner, - repo: context.repo.repo, - pull_number: context.issue.number - }); - const marker = '## Eval Results'; - const existing = reviews.find(r => - r.user?.login === 'github-actions[bot]' && r.body?.startsWith(marker) - ); - - if (existing) { - await github.rest.pulls.updateReview({ - owner: context.repo.owner, - repo: context.repo.repo, - pull_number: context.issue.number, - review_id: existing.id, - body - }); - } else { - await github.rest.pulls.createReview({ - owner: context.repo.owner, - repo: context.repo.repo, - pull_number: context.issue.number, - event: 'COMMENT', - body - }); - } diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml new file mode 100644 index 00000000..d7ea8f51 --- /dev/null +++ b/.github/workflows/eval.yml @@ -0,0 +1,35 @@ +name: Skill Evals + +on: + pull_request: + paths: + - 'plugins/sdlc-workflow/skills/**/*.md' + - 'evals/**/evals.json' + push: + branches: [main] + paths: + - 'plugins/sdlc-workflow/skills/**/*.md' + - 'evals/**/evals.json' + +jobs: + eval: + runs-on: ubuntu-latest + permissions: + contents: write + pull-requests: write + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - uses: google-github-actions/auth@v3 + with: + credentials_json: ${{ secrets.GCP_SA_KEY }} + + - uses: mrizzi/skill-litmus@v0.1.5 + env: + CLAUDE_CODE_USE_VERTEX: "1" + CLOUD_ML_REGION: ${{ secrets.GCP_CLOUD_ML_REGION }} + ANTHROPIC_DEFAULT_SONNET_MODEL: "claude-sonnet-4-6" + ANTHROPIC_DEFAULT_OPUS_MODEL: "claude-opus-4-6" + ANTHROPIC_MODEL: "claude-opus-4-6" diff --git a/evals/README.md b/evals/README.md index 51e030a6..2b5dc267 100644 --- a/evals/README.md +++ b/evals/README.md @@ -7,7 +7,7 @@ baselines. For architectural decisions and known limitations, see the [design spec](../docs/specs/2026-04-16-skill-eval-framework-design.md). -## Why a custom eval skill +## Why skill-litmus Anthropic's `skill-creator` includes eval capabilities, but it is designed for interactive skill development — running both with-skill and @@ -16,16 +16,17 @@ iterating through improvement cycles with human review. CI evaluation has different requirements: deterministic output paths, single-configuration runs, no browser, and a summary that can be posted -as a PR comment or displayed in a terminal. Rather than fighting -skill-creator's interactive assumptions, `run-evals` -(`/sdlc-workflow:run-evals`) is a purpose-built skill that: +as a PR comment or displayed in a terminal. +[skill-litmus](https://github.com/mrizzi/skill-litmus) is a standalone +plugin and GitHub Action (`/skill-litmus:run-evals`) that provides a +shell-driven eval engine: - Produces results in a fixed directory layout — no variation between runs - Runs only the current skill version (baselines are stored separately) - Grades assertions and aggregates metrics into `benchmark.json` -- Renders a Markdown summary via `render_summary.py`, comparing against - stored baselines when available +- Renders a Markdown summary comparing against stored baselines when available - Works identically in interactive and headless (`claude -p`) modes +- Provides a reusable GitHub Action for CI integration ## Directory structure @@ -59,6 +60,7 @@ Each skill defines its test cases in `evals.json`: ```json { "skill_name": "plan-feature", + "plugin": "sdlc-workflow", "evals": [ { "id": 1, @@ -77,6 +79,7 @@ Each skill defines its test cases in `evals.json`: | Field | Type | Description | |-------|------|-------------| | `skill_name` | string | Name of the skill being evaluated | +| `plugin` | string | Plugin that owns the skill (e.g., `sdlc-workflow`) | | `evals[].id` | number | Unique identifier for the test case | | `evals[].prompt` | string | The prompt sent to the skill agent | | `evals[].expected_output` | string | Natural language description of expected behavior | @@ -89,13 +92,13 @@ Start a Claude Code session in the repo root and invoke the `run-evals` skill: ``` -/sdlc-workflow:run-evals Run evals for plan-feature. +/skill-litmus:run-evals Run evals for plan-feature. Evals path: evals/plan-feature/evals.json Workspace: /tmp/plan-feature-eval ``` The skill: -1. Reads `evals.json` and spawns a subagent per test case +1. Reads `evals.json` and runs each test case via `claude -p` 2. Grades each run's outputs against the assertions 3. Aggregates results into `benchmark.json` 4. Compares against the stored baseline at `evals/plan-feature/baselines/latest/` @@ -137,35 +140,24 @@ Every run produces this exact layout: | `time_seconds` | Large increases may indicate the skill is doing unnecessary work. | | `tokens` | Token usage proxy for cost. Compare across runs. | -## CI workflows +## CI workflow -Two GitHub Actions workflows automate eval execution. Neither gates -merges — they report results only. +A single GitHub Actions workflow (`eval.yml`) automates eval execution +using the [`mrizzi/skill-litmus`](https://github.com/mrizzi/skill-litmus) +composite action. It does not gate merges — it reports results only. -### eval-pr.yml (pull request) +The workflow triggers when `plugins/sdlc-workflow/skills/**/*.md` or +`evals/**/evals.json` are modified, on both pull requests and pushes to +main. -Triggers when a PR modifies `plugins/sdlc-workflow/skills/**/*.md` or -`evals/**/evals.json`. +- **On pull request** — discovers changed skills, runs their evals, + compares against stored baselines, and posts a PR review with the + results. +- **On push to main** — runs all eval suites and commits baseline + results to `evals//baselines//`, updating the `latest` + symlink. -1. **Discover changed skills** — `git diff` maps changed files to eval suites -2. **Run evals** — invokes `run-evals` via `claude -p` for each changed skill -3. **Post PR comment** — reads `summary.md` from the workspace and posts it - -The skill's `render_summary.py` script handles baseline comparison. If -`evals//baselines/latest/` exists, the summary includes a delta -line. If not, it shows raw results only. - -### eval-baseline.yml (push to main) - -Triggers when skill or eval files are merged to main. - -1. **Discover all eval suites** — runs all skills, not just changed ones -2. **Run evals** — invokes `run-evals` via `claude -p` for each skill -3. **Store baselines** — commits results to `evals//baselines//` -4. **Update `latest` symlink** — points to the new baseline - -This ensures every merge to main has a complete baseline that subsequent -PRs can compare against. +Both modes are handled automatically by the skill-litmus action. ## Baseline strategy @@ -175,7 +167,7 @@ the baseline via the symlink — they don't need to know the exact commit hash. If main receives commits that don't touch skill or eval files, the -`eval-baseline` workflow doesn't trigger, but `latest` still points to +eval workflow doesn't trigger, but `latest` still points to the most recent valid baseline. ## Adding evals for a new skill @@ -220,7 +212,7 @@ network dependencies: └───────────────────────────────────────────────────────────┘ ``` -1. **Run** — invoke `/sdlc-workflow:run-evals` +1. **Run** — invoke `/skill-litmus:run-evals` 2. **Grade** — the skill grades assertions and produces `grading.json` 3. **Review** — inspect outputs and record feedback in `feedback.json` 4. **Improve** — edit the skill's `SKILL.md` based on failures and feedback diff --git a/evals/define-feature/README.md b/evals/define-feature/README.md index a4e8b0a1..059edb4f 100644 --- a/evals/define-feature/README.md +++ b/evals/define-feature/README.md @@ -58,7 +58,7 @@ Mock CLAUDE.md files representing different project states: ## Running ``` -/sdlc-workflow:run-evals Run evals for define-feature. +/skill-litmus:run-evals Run evals for define-feature. Evals path: evals/define-feature/evals.json Workspace: /tmp/define-feature-eval ``` diff --git a/evals/define-feature/evals.json b/evals/define-feature/evals.json index 0409c9b6..c28ac40f 100644 --- a/evals/define-feature/evals.json +++ b/evals/define-feature/evals.json @@ -1,5 +1,6 @@ { "skill_name": "define-feature", + "plugin": "sdlc-workflow", "evals": [ { "id": 1, diff --git a/evals/implement-task/README.md b/evals/implement-task/README.md index fe49cbea..14bc4b97 100644 --- a/evals/implement-task/README.md +++ b/evals/implement-task/README.md @@ -62,7 +62,7 @@ during Step 0 validation. ## Running ``` -/sdlc-workflow:run-evals Run evals for implement-task. +/skill-litmus:run-evals Run evals for implement-task. Evals path: evals/implement-task/evals.json Workspace: /tmp/implement-task-eval ``` diff --git a/evals/implement-task/evals.json b/evals/implement-task/evals.json index f27eb80a..77c2b203 100644 --- a/evals/implement-task/evals.json +++ b/evals/implement-task/evals.json @@ -1,5 +1,6 @@ { "skill_name": "implement-task", + "plugin": "sdlc-workflow", "evals": [ { "id": 1, diff --git a/evals/plan-feature/README.md b/evals/plan-feature/README.md index c5aa5c5d..3eb01077 100644 --- a/evals/plan-feature/README.md +++ b/evals/plan-feature/README.md @@ -37,7 +37,7 @@ multi-repo test case. ## Running ``` -/sdlc-workflow:run-evals Run evals for plan-feature. +/skill-litmus:run-evals Run evals for plan-feature. Evals path: evals/plan-feature/evals.json Workspace: /tmp/plan-feature-eval ``` diff --git a/evals/plan-feature/evals.json b/evals/plan-feature/evals.json index e930f536..1a5a79e4 100644 --- a/evals/plan-feature/evals.json +++ b/evals/plan-feature/evals.json @@ -1,5 +1,6 @@ { "skill_name": "plan-feature", + "plugin": "sdlc-workflow", "evals": [ { "id": 1, diff --git a/evals/setup/README.md b/evals/setup/README.md index 7c32dd41..1965b1ee 100644 --- a/evals/setup/README.md +++ b/evals/setup/README.md @@ -60,7 +60,7 @@ Mock MCP tool listings simulating tool discovery output: ## Running ``` -/sdlc-workflow:run-evals Run evals for setup. +/skill-litmus:run-evals Run evals for setup. Evals path: evals/setup/evals.json Workspace: /tmp/setup-eval ``` diff --git a/evals/setup/evals.json b/evals/setup/evals.json index ac103674..a2400d85 100644 --- a/evals/setup/evals.json +++ b/evals/setup/evals.json @@ -1,5 +1,6 @@ { "skill_name": "setup", + "plugin": "sdlc-workflow", "evals": [ { "id": 1, diff --git a/evals/verify-pr/README.md b/evals/verify-pr/README.md index 4e664ede..8115e711 100644 --- a/evals/verify-pr/README.md +++ b/evals/verify-pr/README.md @@ -59,7 +59,7 @@ from `docs/constraints.md`: ## Running ``` -/sdlc-workflow:run-evals Run evals for verify-pr. +/skill-litmus:run-evals Run evals for verify-pr. Evals path: evals/verify-pr/evals.json Workspace: /tmp/verify-pr-eval ``` diff --git a/evals/verify-pr/evals.json b/evals/verify-pr/evals.json index 845f5c23..50d8f410 100644 --- a/evals/verify-pr/evals.json +++ b/evals/verify-pr/evals.json @@ -1,5 +1,6 @@ { "skill_name": "verify-pr", + "plugin": "sdlc-workflow", "evals": [ { "id": 1, diff --git a/plugins/sdlc-workflow/skills/run-evals/SKILL.md b/plugins/sdlc-workflow/skills/run-evals/SKILL.md deleted file mode 100644 index 6015f423..00000000 --- a/plugins/sdlc-workflow/skills/run-evals/SKILL.md +++ /dev/null @@ -1,198 +0,0 @@ ---- -name: run-evals -description: Run skill evals with deterministic output layout and CI-compatible results. Grades assertions, aggregates benchmark.json, and renders summary.md via Python scripts. NOT skill-creator — this skill produces fixed output paths for automated pipelines. Triggers on "run evals", "eval ", "benchmark", or "grade evals". ---- - -# Run Evals - -Run eval cases for a skill, grade each against assertions, and produce -structured results in a deterministic directory layout. - -## Inputs - -The user provides: -- **Skill name** — the slash-command name of the skill to test (e.g., `plan-feature`) -- **Evals path** — path to the `evals.json` file (e.g., `evals/plan-feature/evals.json`) -- **Workspace** — directory where results are written - -## Output Structure - -Every run produces this exact layout — no variation: - -``` -/ -├── benchmark.json -├── feedback.json -├── summary.md -├── eval-1/ -│ ├── grading.json -│ ├── timing.json -│ └── outputs/ -│ └── (skill outputs) -├── eval-2/ -│ └── ... -└── eval-N/ - └── ... -``` - -## Process - -### Step 1 — Read evals.json - -Read the evals file and extract: -- `skill_name` — the skill being evaluated -- `evals[]` — array of test cases, each with `id`, `prompt`, `expected_output`, - `files` (optional), and `assertions` - -### Step 2 — Execute each eval case - -For each eval in `evals[]`, spawn a subagent with this prompt: - -``` -You are executing an eval for the /sdlc-workflow: skill. - -Task: - - -Input files (read these before starting): - -- / - - - -Write all outputs to: /eval-/outputs/ - -Important: -- Invoke the /sdlc-workflow: skill via the Skill tool to process this task -- Write every output file to the outputs/ directory -- Do not interact with external services (Jira, Figma, etc.) — write to files instead -``` - -**Parallelism:** Spawn all eval subagents in a single turn so they run -concurrently. Do not wait for one eval to complete before starting the -next — the eval cases are independent. - -When each subagent completes, capture `total_tokens` and `duration_ms` -from the task completion notification immediately. Write to -`/eval-/timing.json`: - -```json -{ - "total_tokens": , - "duration_ms": -} -``` - -### Step 3 — Grade each eval case - -As each eval completes, spawn a grader subagent. Grading can overlap -with execution — grade each eval as it finishes rather than waiting for -all to complete. - -Grader prompt: - -``` -Grade the outputs of an eval run against these assertions. - -Assertions: - -- - - -Outputs directory: /eval-/outputs/ - -## Grading rules - -1. Read every file in the outputs directory. Open and read file - contents — do not judge based on filenames alone. -2. Burden of proof is on PASS. Default to FAIL. Only mark PASS when - you find specific, concrete evidence in the output files that - satisfies the assertion. -3. No partial credit. Each assertion is binary: PASS or FAIL. - "Mostly correct" or "partially addressed" is FAIL. -4. Cite specific evidence. The evidence field must quote or reference - exact content from output files — file paths, line excerpts, counts. - Never write vague evidence like "the output generally addresses this." -5. Check structure AND content. Verify both that expected sections/files - exist AND that their content satisfies the assertion. -6. Contradictory evidence means FAIL. If some outputs support the - assertion but others contradict it, the assertion fails. - -Write results to: /eval-/grading.json - -Use this exact JSON structure: -{ - "assertion_results": [ - { - "text": "", - "passed": true/false, - "evidence": "" - } - ], - "summary": { - "passed": , - "failed": , - "total": , - "pass_rate": <0.0 to 1.0> - } -} -``` - -### Step 4 — Aggregate into benchmark.json - -After all evals are graded, run the aggregation script: - -```bash -python3 /scripts/aggregate_benchmark.py --results -``` - -The script reads all `eval-N/grading.json` and `eval-N/timing.json` -files, computes mean and stddev for pass rate, duration, and token usage, -and writes `/benchmark.json`. - -### Step 5 — Create feedback.json placeholder - -Write `/feedback.json` with empty strings for each eval: - -```json -{ - "eval-1": "", - "eval-2": "", - ... -} -``` - -### Step 6 — Render summary - -Determine the baseline path: `evals//baselines/latest/`. - -Run the render script, passing the skill name so the heading identifies -which skill the results belong to: - -```bash -python3 /scripts/render_summary.py \ - --results \ - --baseline \ - --skill -``` - -If the baseline path does not exist, omit `--baseline` — the script -renders results without a comparison. - -The script writes `/summary.md`. Display its contents to the -user. - -## Rules - -- Write all outputs to the exact paths specified. No intermediate directories, - no configuration-named subdirectories. -- Every eval case gets its own `eval-/` directory using the `id` from evals.json. -- `grading.json` uses `assertion_results` (not `expectations`) with fields - `text`, `passed`, `evidence`. -- Timing data must be captured from the task completion notification — it cannot - be recovered after the fact. -- Do not launch an eval viewer or browser. -- Do not run baseline comparisons — only run the current skill version. - Baseline comparison is handled by the render script in Step 6. -- `` is the directory containing this SKILL.md — resolve it from - the absolute path shown in the skill invocation header. diff --git a/plugins/sdlc-workflow/skills/run-evals/scripts/aggregate_benchmark.py b/plugins/sdlc-workflow/skills/run-evals/scripts/aggregate_benchmark.py deleted file mode 100644 index cba668ca..00000000 --- a/plugins/sdlc-workflow/skills/run-evals/scripts/aggregate_benchmark.py +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env python3 -"""Aggregate eval grading and timing results into benchmark.json.""" - -import argparse -import json -import statistics -import sys -from pathlib import Path - - -def main(): - parser = argparse.ArgumentParser( - description="Aggregate eval results into benchmark.json" - ) - parser.add_argument( - "--results", type=Path, required=True, help="Workspace with eval results" - ) - args = parser.parse_args() - - if not args.results.exists(): - print(f"Results directory not found: {args.results}", file=sys.stderr) - sys.exit(1) - - pass_rates = [] - time_seconds = [] - tokens = [] - - for eval_dir in sorted(args.results.glob("eval-*")): - grading_path = eval_dir / "grading.json" - if not grading_path.exists(): - continue - - grading = json.loads(grading_path.read_text()) - rate = grading.get("summary", {}).get("pass_rate") - if rate is not None: - pass_rates.append(rate) - - timing_path = eval_dir / "timing.json" - if timing_path.exists(): - timing = json.loads(timing_path.read_text()) - duration_ms = timing.get("duration_ms") - if duration_ms is not None: - time_seconds.append(duration_ms / 1000) - total_tokens = timing.get("total_tokens") - if total_tokens is not None: - tokens.append(total_tokens) - - benchmark = { - "run_summary": { - "pass_rate": _stats(pass_rates), - "time_seconds": _stats(time_seconds), - "tokens": _stats(tokens), - } - } - - output_path = args.results / "benchmark.json" - output_path.write_text(json.dumps(benchmark, indent=2) + "\n") - print(f"Written: {output_path}") - - -def _stats(values: list[float]) -> dict: - if not values: - return {"mean": 0.0, "stddev": 0.0} - mean = statistics.mean(values) - stddev = statistics.pstdev(values) if len(values) > 1 else 0.0 - return {"mean": round(mean, 2), "stddev": round(stddev, 2)} - - -if __name__ == "__main__": - main() diff --git a/plugins/sdlc-workflow/skills/run-evals/scripts/render_summary.py b/plugins/sdlc-workflow/skills/run-evals/scripts/render_summary.py deleted file mode 100644 index a7e4368e..00000000 --- a/plugins/sdlc-workflow/skills/run-evals/scripts/render_summary.py +++ /dev/null @@ -1,140 +0,0 @@ -#!/usr/bin/env python3 -"""Render eval results and optional baseline comparison as Markdown.""" - -import argparse -import json -import sys -from pathlib import Path - - -def load_grading_files(results_dir: Path) -> list[dict]: - """Load all eval-N/grading.json files, sorted by eval ID.""" - gradings = [] - for eval_dir in sorted(results_dir.glob("eval-*")): - grading_path = eval_dir / "grading.json" - if not grading_path.exists(): - continue - grading = json.loads(grading_path.read_text()) - grading["_eval_id"] = eval_dir.name - gradings.append(grading) - return gradings - - -def render(results_dir: Path, baseline_dir: Path | None, skill: str | None = None) -> str: - gradings = load_grading_files(results_dir) - - if not gradings: - return "> No eval results found.\n" - - heading = f"## Eval Results: {skill}" if skill else "## Eval Results" - lines = [heading, ""] - - # Per-eval table - lines.append("| Eval | Passed | Failed | Pass Rate |") - lines.append("|------|--------|--------|-----------|") - - for g in gradings: - eval_id = g["_eval_id"] - s = g.get("summary", {}) - passed = s.get("passed", 0) - failed = s.get("failed", 0) - total = s.get("total", 0) - rate = s.get("pass_rate", 0) - lines.append( - f"| {eval_id} | {passed}/{total} | {failed} | {rate * 100:.0f}% |" - ) - - lines.append("") - - # Aggregate from benchmark.json if present - benchmark_path = results_dir / "benchmark.json" - if benchmark_path.exists(): - benchmark = json.loads(benchmark_path.read_text()) - rs = benchmark.get("run_summary", {}) - pr_mean = rs.get("pass_rate", {}).get("mean") - tokens_mean = rs.get("tokens", {}).get("mean") - time_mean = rs.get("time_seconds", {}).get("mean") - - parts = [] - if pr_mean is not None: - parts.append(f"**Pass rate:** {pr_mean * 100:.0f}%") - if tokens_mean is not None: - parts.append(f"**Tokens:** {int(tokens_mean):,}") - if time_mean is not None: - parts.append(f"**Duration:** {time_mean:.0f}s") - if parts: - lines.append(" · ".join(parts)) - lines.append("") - - # Baseline comparison - if baseline_dir and baseline_dir.exists(): - baseline_benchmark = baseline_dir / "benchmark.json" - if baseline_benchmark.exists(): - baseline = json.loads(baseline_benchmark.read_text()) - brs = baseline.get("run_summary", {}) - b_pr = brs.get("pass_rate", {}).get("mean") - b_tokens = brs.get("tokens", {}).get("mean") - b_time = brs.get("time_seconds", {}).get("mean") - - baseline_label = baseline_dir.name - if baseline_dir.is_symlink(): - baseline_label = baseline_dir.resolve().name - - parts = [] - if b_pr is not None: - parts.append(f"{b_pr * 100:.0f}%") - if b_tokens is not None: - parts.append(f"{int(b_tokens):,} tokens") - if b_time is not None: - parts.append(f"{b_time:.0f}s") - if parts: - lines.append( - f"**Baseline** (`{baseline_label}`): " + " · ".join(parts) - ) - lines.append("") - - return "\n".join(lines) + "\n" - - -def main(): - parser = argparse.ArgumentParser( - description="Render eval results as Markdown" - ) - parser.add_argument( - "--results", type=Path, required=True, help="Workspace with eval results" - ) - parser.add_argument( - "--baseline", type=Path, default=None, help="Baseline directory" - ) - parser.add_argument( - "--skill", - type=str, - default=None, - help="Skill name to include in the heading", - ) - parser.add_argument( - "--output", - type=Path, - default=None, - help="Output file (default: /summary.md)", - ) - args = parser.parse_args() - - if not args.results.exists(): - print(f"Results directory not found: {args.results}", file=sys.stderr) - sys.exit(1) - - baseline = args.baseline - if baseline and not baseline.exists(): - print(f"Warning: baseline not found: {baseline}", file=sys.stderr) - baseline = None - - md = render(args.results, baseline, args.skill) - - output_path = args.output or (args.results / "summary.md") - output_path.write_text(md) - print(md) - - -if __name__ == "__main__": - main()