Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
296 changes: 296 additions & 0 deletions .github/workflows/eval-skills.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,296 @@
name: Skill Evals

# Diff-gated evaluation runner for the public-facing skills under skills/.
#
# Triggers:
# - pull_request: comment score diff vs main; do not commit anything.
# - schedule: nightly run on changed skills, commit refreshed
# eval-scores.json + per-skill README badges back to main.
# - workflow_dispatch: manual full or partial re-run; like the nightly run,
# commits refreshed scores and badges back to the dispatched ref.
#
# Cost shape: only suites whose source has actually changed (per
# evals/scripts/diff-changed-skills.js) get re-evaluated, so a typical PR
# touching one skill costs roughly one suite's worth of API tokens.

on:
  # Pull requests are evaluated only when they touch the skills, the eval
  # harness, or this workflow file itself.
  pull_request:
    paths:
      - "skills/**"
      - "evals/**"
      - ".github/workflows/eval-skills.yml"
  schedule:
    # Nightly at 09:17 UTC; the odd minute keeps the run off the top of the
    # hour so it does not line up with API rate limits.
    - cron: "17 9 * * *"
  workflow_dispatch:
    inputs:
      # Manual override: evaluate every suite instead of only changed ones.
      run_all:
        description: "Re-run every suite regardless of diff"
        type: boolean
        default: false

# One live run per ref: a newer run on the same branch / PR ref cancels the
# run that is still in progress.
concurrency:
  group: skill-evals-${{ github.ref }}
  cancel-in-progress: true

permissions:
  # Write access to repository contents exists solely for `schedule` and
  # `workflow_dispatch` runs, where the aggregate job pushes the refreshed
  # eval-scores.json and per-skill README badges. Pull requests share this
  # workflow, but the commit step is gated on event_name, so in practice a
  # PR run only reads the repository.
  contents: write
  # Lets the aggregate job create or update its score-diff comment on PRs.
  pull-requests: write

jobs:
  # Pipeline: unit-test + diff -> evaluate (one matrix leg per changed suite)
  # -> aggregate (scores, badges, PR comment / commit) and eval-gate (single
  # pass/fail status summarising the matrix).
  #
  # Every job declares its own token permissions so that only `aggregate`
  # holds write access; job-level `permissions` replace the workflow-level
  # defaults for that job.

  # Runs the eval tooling's own unit tests. `evaluate` depends on this job,
  # so no eval suite runs unless these tests pass.
  unit-test:
    name: "Unit tests"
    runs-on: ubuntu-latest
    permissions:
      contents: read
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "22"
          cache: "npm"
          cache-dependency-path: evals/package-lock.json

      - name: Install eval dependencies
        run: npm ci --legacy-peer-deps
        working-directory: evals

      - name: Run unit tests
        run: npm test
        working-directory: evals

  # Decides which suites need to run and publishes them as a JSON array
  # (`slugs`) plus a `has_changes` flag for downstream `if:` conditions.
  diff:
    name: "Compute changed suites"
    runs-on: ubuntu-latest
    permissions:
      contents: read
    outputs:
      slugs: ${{ steps.compute.outputs.slugs }}
      has_changes: ${{ steps.compute.outputs.has_changes }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          # Full history. NOTE(review): presumably required by
          # diff-changed-skills.js to compare against earlier commits.
          fetch-depth: 0

      - name: Compute changed suites
        id: compute
        env:
          # Passed through the environment instead of being interpolated
          # into the script text. Empty on non-dispatch events.
          RUN_ALL: ${{ inputs.run_all }}
        run: |
          if [[ "${RUN_ALL}" == "true" ]]; then
            # run_all overrides: list every suite
            slugs="$(node -e 'const m=require("./evals/scripts/_manifest");process.stdout.write(JSON.stringify(m.SUITES.map(s=>s.suite)))')"
          else
            slugs="$(node evals/scripts/diff-changed-skills.js --json --verbose)"
          fi
          echo "slugs=${slugs}" >> "$GITHUB_OUTPUT"
          if [[ "${slugs}" == "[]" ]]; then
            echo "has_changes=false" >> "$GITHUB_OUTPUT"
          else
            echo "has_changes=true" >> "$GITHUB_OUTPUT"
          fi
          echo "Changed suites: ${slugs}"

  # One matrix leg per changed suite. Each leg runs the suite and uploads its
  # results.json as an artifact named results-<suite>.
  evaluate:
    name: "Evaluate ${{ matrix.suite }}"
    needs: [unit-test, diff]
    if: needs.diff.outputs.has_changes == 'true'
    runs-on: ubuntu-latest
    permissions:
      contents: read
    strategy:
      # Let the remaining suites finish when one fails; cap parallel legs.
      fail-fast: false
      max-parallel: 3
      matrix:
        suite: ${{ fromJson(needs.diff.outputs.slugs) }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "22"
          cache: "npm"
          cache-dependency-path: evals/package-lock.json

      - name: Install eval dependencies
        run: npm ci --legacy-peer-deps
        working-directory: evals

      - name: Run eval suite
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          AGENT_MODEL: ${{ vars.AGENT_MODEL || 'claude-sonnet-4-6' }}
          RUBRIC_MODEL: ${{ vars.RUBRIC_MODEL || 'anthropic:messages:claude-haiku-4-5-20251001' }}
          # The suite name originates from repository content, so it is
          # handed to the shell as data rather than spliced into the command.
          EVAL_SUITE: ${{ matrix.suite }}
        run: node scripts/aggregate.js --run --only="${EVAL_SUITE}"
        working-directory: evals

      - name: Upload suite results
        # Upload even when the suite step failed so partial results survive.
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: results-${{ matrix.suite }}
          path: evals/${{ matrix.suite }}/results.json
          retention-days: 14

  # Collects the per-suite artifacts, recomputes eval-scores.json, renders
  # README badges, then either comments on the PR (pull_request) or commits
  # the refreshed files (schedule / workflow_dispatch).
  aggregate:
    name: "Aggregate scores"
    needs: [diff, evaluate]
    # always() keeps this job alive when some evaluate legs failed; it is
    # still skipped when nothing changed or the matrix was cancelled.
    if: needs.diff.outputs.has_changes == 'true' && always() && needs.evaluate.result != 'cancelled'
    runs-on: ubuntu-latest
    permissions:
      # Push of refreshed scores/badges on schedule and workflow_dispatch.
      contents: write
      # Create or update the score-diff comment on pull requests.
      pull-requests: write
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "22"
          cache: "npm"
          cache-dependency-path: evals/package-lock.json

      - name: Install eval dependencies
        run: npm ci --legacy-peer-deps
        working-directory: evals

      - name: Download all suite results
        uses: actions/download-artifact@v4
        with:
          # merge-multiple: false keeps one sub-directory per artifact
          # (artifact-results/results-<suite>/), which the next step relies on.
          path: artifact-results
          pattern: results-*
          merge-multiple: false

      - name: Stage suite results into evals/<suite>/results.json
        run: |
          set -e
          # nullglob: with no downloaded artifacts the loop body never runs.
          shopt -s nullglob
          for d in artifact-results/results-*; do
            name=$(basename "$d" | sed 's/^results-//')
            mkdir -p "evals/$name"
            if [[ -f "$d/results.json" ]]; then
              cp "$d/results.json" "evals/$name/results.json"
              echo "Staged evals/$name/results.json"
            fi
          done

      - name: Save previous eval-scores.json for diff
        run: |
          # Snapshot the checked-in scores before they are rewritten; fall
          # back to an empty score set when the file does not exist yet.
          if [[ -f eval-scores.json ]]; then
            cp eval-scores.json /tmp/eval-scores-before.json
          else
            echo '{"schemaVersion":1,"updatedAt":null,"skills":{}}' > /tmp/eval-scores-before.json
          fi

      - name: Aggregate
        env:
          SUITES_JSON: ${{ needs.diff.outputs.slugs }}
        run: |
          # Convert the JSON array of suites into the comma-separated list
          # passed to --only.
          slugs=$(echo "$SUITES_JSON" | node -e 'let s="";process.stdin.on("data",c=>s+=c);process.stdin.on("end",()=>{const a=JSON.parse(s);process.stdout.write(a.join(","))})')
          if [[ -z "$slugs" ]]; then
            echo "No suites to aggregate"
            exit 0
          fi
          node scripts/aggregate.js --only="$slugs"
        working-directory: evals

      - name: Render README badges
        if: always()
        run: node evals/scripts/render-badges.js

      - name: PR comment with score diff
        if: always() && github.event_name == 'pull_request'
        uses: actions/github-script@v7
        env:
          BEFORE_PATH: /tmp/eval-scores-before.json
          AFTER_PATH: ${{ github.workspace }}/eval-scores.json
        with:
          script: |
            const fs = require('node:fs');

            // This step runs under always(), so an earlier step may have
            // failed before either file was written. A missing file is
            // treated as an empty score set instead of throwing.
            const readScores = (file) =>
              fs.existsSync(file) ? JSON.parse(fs.readFileSync(file, 'utf-8')) : { skills: {} };
            // True when the entry exists and carries a usable score
            // (neither null nor undefined).
            const hasScore = (entry) => Boolean(entry) && entry.score != null;
            const describe = (entry) => `${entry.score}/100 (${entry.passed}/${entry.total})`;

            const before = readScores(process.env.BEFORE_PATH);
            const after = readScores(process.env.AFTER_PATH);
            const beforeSkills = before.skills || {};
            const afterSkills = after.skills || {};

            // The first line is a hidden marker used below to find and
            // update this workflow's earlier comment.
            const lines = [
              '<!-- skill-evals-comment -->',
              '## Skill eval results',
              '',
              '| Skill | Before | After | Δ |',
              '|-------|-------:|------:|----:|',
            ];
            const keys = new Set([...Object.keys(beforeSkills), ...Object.keys(afterSkills)]);
            for (const key of [...keys].sort()) {
              const b = beforeSkills[key];
              const a = afterSkills[key];
              // Skills absent from the new scores are not listed.
              if (!a) continue;
              const beforeStr = hasScore(b) ? describe(b) : '-';
              const afterStr = hasScore(a) ? describe(a) : 'errored';
              let delta = 'new';
              if (hasScore(b) && hasScore(a)) {
                const change = a.score - b.score;
                delta = change === 0 ? 'no change' : (change > 0 ? `+${change}` : `${change}`);
              }
              lines.push(`| \`${key}\` | ${beforeStr} | ${afterStr} | ${delta} |`);
            }
            lines.push('');
            lines.push('_Only suites whose source actually changed since their last recorded score were re-run. Soft-failing while we stabilise the baseline._');
            const body = lines.join('\n');

            // Walk every page of comments: a single listComments call only
            // returns the first page, which would miss the marker comment on
            // busy PRs and post a duplicate.
            const comments = await github.paginate(github.rest.issues.listComments, {
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              per_page: 100,
            });
            const existing = comments.find((c) => c.body && c.body.startsWith('<!-- skill-evals-comment -->'));
            if (existing) {
              await github.rest.issues.updateComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                comment_id: existing.id,
                body,
              });
            } else {
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: context.issue.number,
                body,
              });
            }

      - name: Commit refreshed scores and badges
        if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
        run: |
          # Stage first and inspect the index: `git diff` on the work tree
          # ignores untracked files, so a newly created eval-scores.json
          # would otherwise never be committed.
          if [[ -f eval-scores.json ]]; then
            git add -- eval-scores.json
          fi
          # Quoted so git, not bash, matches the pattern: without globstar
          # bash treats ** like *, which only reaches one directory level.
          git add -- 'skills/**/README.md'
          if git diff --cached --quiet; then
            echo "No score or badge changes to commit"
            exit 0
          fi
          git config user.name 'github-actions[bot]'
          git config user.email '41898282+github-actions[bot]@users.noreply.github.com'
          git commit -m "chore(evals): refresh eval-scores.json and README badges"
          # GITHUB_REF_NAME is the runner-provided ref name; reading it from
          # the environment keeps the branch name out of the script text.
          git push origin "HEAD:${GITHUB_REF_NAME}"

  # Single pass/fail status summarising the evaluate matrix. Runs under
  # always() so it reports a result even when upstream jobs failed or were
  # skipped.
  eval-gate:
    name: "Evaluate gate"
    needs: [diff, evaluate]
    if: always()
    runs-on: ubuntu-latest
    # Only inspects upstream job results; needs no token scopes.
    permissions: {}
    steps:
      - name: Check evaluate results
        env:
          DIFF_RESULT: ${{ needs.diff.result }}
          HAS_CHANGES: ${{ needs.diff.outputs.has_changes }}
          EVALUATE_RESULT: ${{ needs.evaluate.result }}
        run: |
          # A failed or cancelled diff job leaves has_changes empty, which
          # must not be mistaken for "nothing changed".
          if [[ "${DIFF_RESULT}" != "success" ]]; then
            echo "Could not determine changed suites (diff result: ${DIFF_RESULT})"
            exit 1
          fi
          if [[ "${HAS_CHANGES}" != "true" ]]; then
            echo "No suites changed — gate passes"
            exit 0
          fi
          if [[ "${EVALUATE_RESULT}" == "success" ]]; then
            echo "All evaluate jobs passed"
            exit 0
          fi
          echo "One or more evaluate jobs failed (result: ${EVALUATE_RESULT})"
          exit 1
9 changes: 2 additions & 7 deletions .mcp.json
Original file line number Diff line number Diff line change
@@ -1,13 +1,8 @@
{
"mcpServers": {
"LaunchDarkly Feature Management": {
"LaunchDarkly": {
"type": "http",
"url": "https://mcp.launchdarkly.com/mcp/fm",
"headers": {}
},
"LaunchDarkly AI Configs": {
"type": "http",
"url": "https://mcp.launchdarkly.com/mcp/aiconfigs",
"url": "https://mcp.launchdarkly.com/mcp/launchdarkly",
"headers": {}
}
}
Expand Down
5 changes: 3 additions & 2 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,11 @@ The metrics skills have a Promptfoo suite under `tests/`. Run it before merging

```bash
export ANTHROPIC_API_KEY=sk-ant-...
npm run test:llm-evals
npm run eval:all
```

Run from the repository root, or from `tests/` with `npm run test:llm-evals` (same as `npm run eval`). For a single skill, use `npm run eval:create`, `eval:choose`, or `eval:instrument` inside `tests/`.
Run from the repository root. To run a single suite, `cd evals` and use `npm run eval:<suite-name>` (e.g., `eval:aiconfig-create`). View results with `npm run eval:view`.


## Documentation

Expand Down
Loading
Loading