From f07625ddb066ff320bd76f1952b60b827c49dc94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Gosta=C5=84ski?= Date: Thu, 18 Jun 2026 10:19:08 +0200 Subject: [PATCH 1/2] chore: root promptfoo eval tooling Single shared root node_modules + scripts to run every plugin's eval suite. - package.json: devDeps (@anthropic-ai/claude-agent-sdk, promptfoo) + eval scripts - scripts/run-evals.sh: discovers and runs plugins/*/evals/promptfooconfig.yaml - pnpm-workspace.yaml: allowBuilds=false (avoids ERR_PNPM_IGNORED_BUILDS on pnpm 11) - .gitignore: node_modules, pnpm-lock.yaml, eval outputs --- .gitignore | 10 ++++++++++ package.json | 14 ++++++++++++++ pnpm-workspace.yaml | 11 +++++++++++ scripts/run-evals.sh | 41 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 76 insertions(+) create mode 100644 .gitignore create mode 100644 package.json create mode 100644 pnpm-workspace.yaml create mode 100755 scripts/run-evals.sh diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e63fd40 --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +# Node dev tooling (shared at the repo root; used by plugins/*/evals) +node_modules/ +pnpm-lock.yaml +# pnpm-workspace.yaml is committed — it carries the allowBuilds setting + +# promptfoo eval outputs +eval-*.json +*.eval.json +output.json +.promptfoo/ diff --git a/package.json b/package.json new file mode 100644 index 0000000..1cfb608 --- /dev/null +++ b/package.json @@ -0,0 +1,14 @@ +{ + "name": "cc-toolkit", + "version": "0.0.0", + "private": true, + "description": "Claude Code plugin marketplace — root dev tooling (promptfoo eval suites live in plugins/<name>/evals/)", + "scripts": { + "eval": "bash scripts/run-evals.sh", + "eval:comment-review": "promptfoo eval -c plugins/comment-review/evals/promptfooconfig.yaml --no-cache --no-share -o /tmp/eval-comment-review.json" + }, + "devDependencies": { + "@anthropic-ai/claude-agent-sdk": "latest", + "promptfoo": "latest" + } +} diff --git a/pnpm-workspace.yaml 
b/pnpm-workspace.yaml new file mode 100644 index 0000000..a8e0538 --- /dev/null +++ b/pnpm-workspace.yaml @@ -0,0 +1,11 @@ +# promptfoo's transitive deps ship build scripts (esbuild, sharp, onnxruntime, …) that +# we don't need to run for evals. Decline each one explicitly so `pnpm install` exits 0 +# instead of failing with ERR_PNPM_IGNORED_BUILDS (which also breaks pnpm's pre-run +# dependency check used by `pnpm eval`). Flip one to `true` only if a feature needs it. +allowBuilds: + '@playwright/browser-chromium': false + '@swc/core': false + esbuild: false + onnxruntime-node: false + protobufjs: false + sharp: false diff --git a/scripts/run-evals.sh b/scripts/run-evals.sh new file mode 100755 index 0000000..61c90d4 --- /dev/null +++ b/scripts/run-evals.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash +# Run every plugin's promptfoo eval suite found at plugins/<name>/evals/promptfooconfig.yaml. +# Extra args are forwarded to `promptfoo eval` (e.g. --filter-pattern eval-3). +# +# pnpm eval # run all suites +# pnpm eval -- --filter-pattern e3 # forward args to promptfoo +# +# Deps (promptfoo + @anthropic-ai/claude-agent-sdk) are resolved from the root +# node_modules, so this must run from the repo root (it cd's there itself). +set -euo pipefail + +# `pnpm eval -- <args>` forwards a literal "--" into the script; drop it so it isn't +# passed on to promptfoo (where "--" would end option parsing). +[ "${1:-}" = "--" ] && shift + +cd "$(dirname "$0")/.." +shopt -s nullglob +configs=(plugins/*/evals/promptfooconfig.yaml) + +if [ ${#configs[@]} -eq 0 ]; then + echo "No eval suites found (plugins/*/evals/promptfooconfig.yaml)." + exit 0 +fi + +bin="./node_modules/.bin/promptfoo" +if [ ! -x "$bin" ]; then + echo "promptfoo not installed at root. Run: pnpm install" >&2 + exit 1 +fi + +status=0 +for cfg in "${configs[@]}"; do + name="$(basename "$(dirname "$(dirname "$cfg")")")" + echo "==> ${name} (${cfg})" + if ! 
"$bin" eval -c "$cfg" --no-cache --no-share -o "/tmp/eval-${name}.json" "$@"; then + echo "!! ${name} eval reported failures" >&2 + status=1 + fi +done + +exit $status From ca61a5b0d30ca7285ba3702d729843610c417bae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Gosta=C5=84ski?= Date: Thu, 18 Jun 2026 10:19:22 +0200 Subject: [PATCH 2/2] test(comment-review): promptfoo eval suite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 5 evals (R1-R12 comment-quality verdicts) running the real skill via the native anthropic:claude-agent-sdk provider (local plugin, subscription auth). - fixtures: 2 ported from skill-creator + 4 reconstructed (datadog, scheduler, dlq pair) - assertions: llm-rubric per criterion + regex 'skill ran' proxy - baseline: sonnet 41/42, opus 42/42; eval-3 §4.1 accepts REWRITE-or-REMOVE, eval-4 token-verification kept strict as a model-capability marker --- plugins/comment-review/evals/README.md | 100 ++++++++ .../evals/fixtures/datadog-integration.tf | 37 +++ .../evals/fixtures/dlq-codes.ts | 13 + .../evals/fixtures/dlq.handler.ts | 25 ++ .../evals/fixtures/host-allowlist.ts | 28 ++ .../evals/fixtures/payment-validator.ts | 33 +++ .../evals/fixtures/scheduler.ts | 38 +++ .../comment-review/evals/promptfooconfig.yaml | 240 ++++++++++++++++++ .../comment-review/evals/prompts/review.txt | 6 + 9 files changed, 520 insertions(+) create mode 100644 plugins/comment-review/evals/README.md create mode 100644 plugins/comment-review/evals/fixtures/datadog-integration.tf create mode 100644 plugins/comment-review/evals/fixtures/dlq-codes.ts create mode 100644 plugins/comment-review/evals/fixtures/dlq.handler.ts create mode 100644 plugins/comment-review/evals/fixtures/host-allowlist.ts create mode 100644 plugins/comment-review/evals/fixtures/payment-validator.ts create mode 100644 plugins/comment-review/evals/fixtures/scheduler.ts create mode 100644 plugins/comment-review/evals/promptfooconfig.yaml create mode 100644 
plugins/comment-review/evals/prompts/review.txt diff --git a/plugins/comment-review/evals/README.md b/plugins/comment-review/evals/README.md new file mode 100644 index 0000000..d05dbbe --- /dev/null +++ b/plugins/comment-review/evals/README.md @@ -0,0 +1,100 @@ +# comment-review — promptfoo eval suite + +Repeatable [promptfoo](https://www.promptfoo.dev/) evals for the `comment-review` +skill. Each test runs the **real skill** (loaded as a local plugin through the +Claude Agent SDK) against a fixture and grades the comment-quality verdicts +(KEEP / REMOVE / REWRITE / MOVE, rules R1–R12). + +This is dev tooling — it is **not** shipped as part of the plugin runtime. + +## Layout + +``` +evals/ + promptfooconfig.yaml # provider + tests + assertions + prompts/review.txt # natural-language trigger for the skill + fixtures/ # inputs (2 ported from skill-creator + 4 reconstructed) +``` + +Node dev deps (`@anthropic-ai/claude-agent-sdk` + `promptfoo`) and the run +scripts live at the **repo root** (`package.json`, single shared `node_modules`), +not per-plugin. + +| Test | Fixture(s) | Focus | +|------|-----------|-------| +| eval-0 | `datadog-integration.tf` | R5 banners + R4 internal-doc refs vs external RFC | +| eval-1 | `scheduler.ts` | R4 file/doc refs, R5 banners, R1 narration, kept diagram/CVE | +| eval-2 | `dlq-codes.ts` + `dlq.handler.ts` | R12 misplaced/duplicated rationale (REMOVE vs MOVE) | +| eval-3 | `payment-validator.ts` | R4 spec-id pointers (REMOVE/REWRITE), token-stripping | +| eval-4 | `host-allowlist.ts` | R4 spec-ids embedded mid-sentence → REWRITE, keep the WHY | + +## Prerequisites + +- `pnpm install` at the **repo root** (installs `@anthropic-ai/claude-agent-sdk` + + `promptfoo` into the shared root `node_modules`). +- A logged-in Claude Code CLI. The target provider runs on your **subscription** + (`apiKeyRequired: false`) — no `ANTHROPIC_API_KEY` needed. 
+ +## Run + +All commands run from the **repo root** (so the Agent SDK resolves from the root +`node_modules`): + +```bash +pnpm install + +# this suite (full) +pnpm eval:comment-review + +# every plugin's eval suite (plugins/*/evals/*) +pnpm eval + +# forward args to promptfoo, e.g. focus one test while iterating +pnpm eval -- --filter-pattern eval-3 + +# structural check (free) +./node_modules/.bin/promptfoo validate config -c plugins/comment-review/evals/promptfooconfig.yaml + +# browse results +./node_modules/.bin/promptfoo view +``` + +`scripts/run-evals.sh` (wired to `pnpm eval`) discovers every +`plugins/<name>/evals/promptfooconfig.yaml`, runs each, and writes +`/tmp/eval-<name>.json`. + +## Notes & knobs + +- **Model.** Default target/grader is `claude-sonnet-4-6` (cheap iteration); set a + stronger target in `promptfooconfig.yaml` (`providers[0].config.model`) for a + regression gate. Baseline: **sonnet 41/42, opus 42/42.** Two assertions are + instructive: + - *eval-3 `§4.1`* is a genuine judgment boundary — after the spec-id is stripped + the comment borders on R1, so REWRITE and REMOVE are both defensible (opus and + sonnet often pick REMOVE). The assertion accepts **either**; only "kept as-is" fails. + - *eval-4 token-verification* is a real capability discriminator: it asks the + report to verify each token against the code before stripping. Sonnet takes a + "letter+number ⇒ spec-id" shortcut and **fails** it; opus shows the check and + **passes**. Kept strict on purpose as a model marker. +- **Flaky verdicts.** A borderline verdict can flip run-to-run; add `repeat: 2`/`3` + (or `defaultTest.options`) if you want a stable gate. +- **Grader.** llm-rubric grades on the subscription via a single-turn agent + (slow-ish). For a faster/cheaper grader, set `ANTHROPIC_API_KEY` and change + `defaultTest.options.provider` to `anthropic:messages:claude-opus-4-8`. 
+- **No `skill-used` assertion.** Plugin skills load via Agent-Skills *injection*, + not a `Skill()` tool call, so `metadata.skillCalls` stays empty. Each test + instead asserts (via `regex`) that the report uses the skill's R1–R12 taxonomy — + output of that shape requires the skill to have loaded (you can confirm in the + trace: the agent reads `skills/comment-review/references/rules.md`). +- `setting_sources: []` keeps this repo's `CLAUDE.md`/hooks out of the run, so + results reflect the skill, not the surrounding harness. + +## Provenance + +Fixtures `payment-validator.ts` and `host-allowlist.ts` and their assertions come +from the skill-creator workspace +(`../skills/comment-review-workspace/`, evals 3 & 4). Fixtures +`datadog-integration.tf`, `scheduler.ts`, `dlq-codes.ts`, `dlq.handler.ts` were +reconstructed from the assertion lists in that workspace's +`skill-snapshot/evals/evals.json` (evals 0/1/2), whose original fixtures were no +longer present. diff --git a/plugins/comment-review/evals/fixtures/datadog-integration.tf b/plugins/comment-review/evals/fixtures/datadog-integration.tf new file mode 100644 index 0000000..a853298 --- /dev/null +++ b/plugins/comment-review/evals/fixtures/datadog-integration.tf @@ -0,0 +1,37 @@ +# ===== Datadog ===== + +# ----- Provider credentials ----- +provider "datadog" { + api_key = var.datadog_api_key + app_key = var.datadog_app_key +} + +# ----- DD GCP integration — per-env SA + STS registration ----- +resource "datadog_integration_gcp_sts" "main" { + client_email = google_service_account.datadog.email + + # account scoping lives in DD_ARCH.md §3.2 + host_filters = "env:prod" + + # the per-env hard-stop on host_filters is intentional (DD_PLAN.md T4.1) + automute = true +} + +# ----- Metric namespace allowlist ----- +resource "datadog_metric_metadata" "billed" { + # only gcp.* and custom.app.* are billed under our contract — everything else is + # dropped here so an accidental high-cardinality metric can't blow the bill + 
metric = "gcp.gce.instance" +} + +resource "datadog_synthetics_test" "refund_probe" { + request_definition { + method = "POST" + url = "https://api.example.com/refund" + + # Retry-After handling per RFC 9110 §10.2.4 + retry { + count = 2 + } + } +} diff --git a/plugins/comment-review/evals/fixtures/dlq-codes.ts b/plugins/comment-review/evals/fixtures/dlq-codes.ts new file mode 100644 index 0000000..797ee3b --- /dev/null +++ b/plugins/comment-review/evals/fixtures/dlq-codes.ts @@ -0,0 +1,13 @@ +export const DLQ_MAX_RETRIES = 0; // 0 means unbounded, not disabled + +export enum AuditDlqInfoCode { + DLQ_MESSAGE_RECEIVED = "audit.dlq.message.received", + + // the conditional UPDATE can't distinguish a missing row from an ineligible status + DLQ_STATUS_NOT_ELIGIBLE = "audit.dlq.no_op.status_not_eligible", + + // we swallow the duplicate so a redelivered message doesn't double-count the audit metric + DLQ_DUPLICATE_DROPPED = "audit.dlq.no_op.duplicate_dropped", + + DLQ_REQUEUED = "audit.dlq.requeued", +} diff --git a/plugins/comment-review/evals/fixtures/dlq.handler.ts b/plugins/comment-review/evals/fixtures/dlq.handler.ts new file mode 100644 index 0000000..46770dc --- /dev/null +++ b/plugins/comment-review/evals/fixtures/dlq.handler.ts @@ -0,0 +1,25 @@ +import { AuditDlqInfoCode } from "./dlq-codes"; + +export class DlqHandler { + private seen = new Set(); + + constructor(private repo: AuditRepo, private bus: EventBus) {} + + async handle(msg: DlqMessage) { + if (this.seen.has(msg.id)) { + this.bus.emit(AuditDlqInfoCode.DLQ_DUPLICATE_DROPPED); + return; + } + this.seen.add(msg.id); + + // the conditional UPDATE can't distinguish a missing row from an ineligible status, + // so a zero-row result is ambiguous — we treat both as not-eligible rather than retry + const updated = await this.repo.updateIfEligible(msg.auditId, msg.status); + if (updated === 0) { + this.bus.emit(AuditDlqInfoCode.DLQ_STATUS_NOT_ELIGIBLE); + return; + } + + 
this.bus.emit(AuditDlqInfoCode.DLQ_REQUEUED); + } +} diff --git a/plugins/comment-review/evals/fixtures/host-allowlist.ts b/plugins/comment-review/evals/fixtures/host-allowlist.ts new file mode 100644 index 0000000..a52786e --- /dev/null +++ b/plugins/comment-review/evals/fixtures/host-allowlist.ts @@ -0,0 +1,28 @@ +export class HostAllowlist { + private ttlSeconds: number; + + // We lowercase the host and strip the port before the R2 membership check, because the + // upstream registry (F3) stores bare lowercase hostnames — a mismatch here silently lets + // a blocked host through. + normalize(host: string): string { + return host.toLowerCase().replace(/:\d+$/, ""); + } + + // Per Q4 the cache TTL must never exceed the token lifetime; otherwise a revoked token + // keeps passing the check until the stale entry expires on its own. + setTtl(seconds: number) { + this.ttlSeconds = Math.min(seconds, this.tokenLifetime()); + } + + // Validate in two passes (see §2.3 then §2.4): structural shape first, semantics second, + // so a malformed payload never reaches the semantic validator and blows up mid-pass. 
+ validate(payload: unknown): Result { + const shaped = this.checkShape(payload); + return this.checkSemantics(shaped); + } + + // hosts are matched case-insensitively + isAllowed(host: string): boolean { + return this.set.has(this.normalize(host)); + } +} diff --git a/plugins/comment-review/evals/fixtures/payment-validator.ts b/plugins/comment-review/evals/fixtures/payment-validator.ts new file mode 100644 index 0000000..eab87a5 --- /dev/null +++ b/plugins/comment-review/evals/fixtures/payment-validator.ts @@ -0,0 +1,33 @@ +import { dedupe, getOrCreateIdempotencyKey } from "./util"; + +const MAX_CHARGE_MINOR_UNITS = 500_000; + +// region R2 is EU-only; never route US tenants here +const REGION = "R2"; + +// F1: users can submit a payment with a saved card +export function submitPayment(req: PaymentRequest): PaymentResult { + // see Q1 + if (req.amount <= 0) throw new InvalidAmountError(); + + // §4.1 caps a single charge at 500_000 minor units + if (req.amount > MAX_CHARGE_MINOR_UNITS) throw new ChargeTooLargeError(); + + // R7: charge in source order — the ledger rejects out-of-order sequence numbers + const charges = dedupe(req.charges).sort((a, b) => a.seq - b.seq); + + const key = getOrCreateIdempotencyKey(req.userId, 24 * 60 * 60 * 1000); + + // sequential, not parallel — the gateway rate-limits per merchant IP + const results = []; + for (const c of charges) { + results.push(postCharge(c, key, REGION)); + } + return collect(results); +} + +/** Implements AC-3 and AC-4. 
*/ +export function refund(charge: Charge): RefundResult { + // Retry-After handling per RFC 9110 §10.2.4 + return withRetryAfter(() => gateway.refund(charge)); +} diff --git a/plugins/comment-review/evals/fixtures/scheduler.ts b/plugins/comment-review/evals/fixtures/scheduler.ts new file mode 100644 index 0000000..1095032 --- /dev/null +++ b/plugins/comment-review/evals/fixtures/scheduler.ts @@ -0,0 +1,38 @@ +// ===== Scheduler ===== + +// Job lifecycle: +// IDLE ──submit──▶ QUEUED ──claim──▶ RUNNING ──ok───▶ DONE +// └───err──▶ BACKOFF ──retry──▶ QUEUED +import { nextDelay } from "./utils/backoff"; + +const MAX_DELAY_MS = 30_000; + +export class Scheduler { + constructor(private broker: Broker) {} + + // backoff curve lives in utils/backoff.ts; see docs/scheduling.md for the tuning rationale + async run(jobs: Job[]) { + // Sequential, not parallel — the broker rate-limits per worker token + for (const job of jobs) { + let attempt = 0; + while (!job.done && attempt < job.maxAttempts) { + // increment the attempt counter + attempt++; + try { + await this.claimAndRun(job); + } catch (err) { + // clamp the delay so a hostile job can't request an unbounded sleep (CVE-2024-1234) + const delay = Math.min(nextDelay(attempt), MAX_DELAY_MS); + await sleep(delay); + } + } + } + } + + // ----- helpers ----- + + private async claimAndRun(job: Job) { + const token = await this.broker.claim(job.id); + return this.execute(job, token); + } +} diff --git a/plugins/comment-review/evals/promptfooconfig.yaml b/plugins/comment-review/evals/promptfooconfig.yaml new file mode 100644 index 0000000..14908fc --- /dev/null +++ b/plugins/comment-review/evals/promptfooconfig.yaml @@ -0,0 +1,240 @@ +# yaml-language-server: $schema=https://promptfoo.dev/config-schema.json +description: comment-review skill — comment-quality verdicts (R1–R12) on fixtures + +prompts: + # Natural-language trigger (not a literal "/comment-review" slash — the Agent SDK reads a + # leading "/" as a command and 
rejects it; the skill fires on description match instead). + - file://prompts/review.txt + +providers: + - id: anthropic:claude-agent-sdk # alias: anthropic:claude-code + label: comment-review + config: + apiKeyRequired: false # use the local Claude Code subscription, no API key for the target + model: claude-sonnet-4-6 # override per run; a capable model is needed for evals 3/4 to pass + permission_mode: bypassPermissions # the skill's Read/Grep/Bash run unattended + allow_dangerously_skip_permissions: true # required alongside bypassPermissions + working_dir: ../../.. # repo root, so the plugin path and {{file}} paths resolve + max_budget_usd: 1 # per-run cost guard + plugins: + - type: local + path: plugins/comment-review # loads the skill under test + skills: all + setting_sources: [] # keep this repo's CLAUDE.md / hooks OUT of the eval + ask_user_question: + behavior: first_option # file-path mode shouldn't prompt, but never block if it does + +defaultTest: + options: + # Grader for llm-rubric. No ANTHROPIC_API_KEY in this environment, so grade on the + # subscription via the agent SDK (single-turn, no tools). For a faster/cheaper grader, + # set ANTHROPIC_API_KEY and replace this with: provider: anthropic:messages:claude-opus-4-8 + provider: + id: anthropic:claude-agent-sdk + config: + apiKeyRequired: false + model: claude-sonnet-4-6 + max_turns: 1 + +tests: + # ---------------------------------------------------------------------------- + - description: 'eval-0 banners-and-doc-refs (datadog-integration.tf)' + vars: + file: plugins/comment-review/evals/fixtures/datadog-integration.tf + assert: + # "skill actually ran" proxy: plugin skills load via Agent-Skills injection, not a + # Skill() tool call, so `skill-used` / metadata.skillCalls stay empty. Assert the + # report uses the skill's own R1–R12 rule taxonomy instead. 
+ - type: regex + value: '\bR(1[0-2]|[1-9])\b' + - type: llm-rubric + value: >- + Flags the top `# ===== Datadog =====` banner as R5 with verdict REMOVE. + - type: llm-rubric + value: >- + Flags `# ----- Provider credentials -----` as R5 REMOVE. + - type: llm-rubric + value: >- + Flags the titled banner `# ----- DD GCP integration — per-env SA + STS + registration -----` as R5 REMOVE (a descriptive title does not redeem the fence). + - type: llm-rubric + value: >- + Flags `# ----- Metric namespace allowlist -----` as R5 REMOVE. + - type: llm-rubric + value: >- + Flags the `account scoping lives in DD_ARCH.md §3.2` comment under R4 + (internal-doc reference). + - type: llm-rubric + value: >- + Flags the `DD_PLAN.md T4.1` comment under R4 — keeps the "hard-stop is + intentional" rationale but drops the internal-doc reference. + - type: llm-rubric + value: >- + Keeps `# Retry-After handling per RFC 9110 §10.2.4` (R4 external-pinned-spec + exception) — it is not flagged for removal. + - type: llm-rubric + value: >- + Keeps the metric-allowlist WHY rationale ("only gcp.* and custom.app.* are + billed … so an accidental high-cardinality metric can't blow the bill"). + + # ---------------------------------------------------------------------------- + - description: 'eval-1 file-refs-and-diagram (scheduler.ts)' + vars: + file: plugins/comment-review/evals/fixtures/scheduler.ts + assert: + # "skill actually ran" proxy: plugin skills load via Agent-Skills injection, not a + # Skill() tool call, so `skill-used` / metadata.skillCalls stay empty. Assert the + # report uses the skill's own R1–R12 rule taxonomy instead. + - type: regex + value: '\bR(1[0-2]|[1-9])\b' + - type: llm-rubric + value: >- + Flags the comment that references `utils/backoff.ts` and `docs/scheduling.md` + under R4 (cross-file / internal-doc references). + - type: llm-rubric + value: >- + Flags `// ----- helpers -----` as R5 REMOVE. 
+ - type: llm-rubric + value: >- + Flags the top `// ===== Scheduler =====` banner as R5 REMOVE. + - type: llm-rubric + value: >- + Flags `// increment the attempt counter` as R1 REMOVE. + - type: llm-rubric + value: >- + Keeps the ASCII state-machine / job-lifecycle diagram (IDLE → QUEUED → RUNNING …) + — the R5 exception for a diagram that encodes information; it is NOT flagged. + - type: llm-rubric + value: >- + Keeps `// Sequential, not parallel — the broker rate-limits per worker token` + as a genuine decision/WHY (R2); it is NOT flagged. + - type: llm-rubric + value: >- + Keeps the `CVE-2024-1234` reference (R4 external exception); it is NOT flagged + for removal. + + # ---------------------------------------------------------------------------- + - description: 'eval-2 misplaced-and-duplicated-rationale (dlq-codes.ts + dlq.handler.ts)' + vars: + file: plugins/comment-review/evals/fixtures/dlq-codes.ts plugins/comment-review/evals/fixtures/dlq.handler.ts + assert: + # "skill actually ran" proxy: plugin skills load via Agent-Skills injection, not a + # Skill() tool call, so `skill-used` / metadata.skillCalls stay empty. Assert the + # report uses the skill's own R1–R12 rule taxonomy instead. + - type: regex + value: '\bR(1[0-2]|[1-9])\b' + - type: llm-rubric + value: >- + Flags `// the conditional UPDATE can't distinguish a missing row from an + ineligible status` sitting on the enum member `DLQ_STATUS_NOT_ELIGIBLE` as R12 + (rationale parked on a declaration). + - type: llm-rubric + value: >- + For that enum-member comment the verdict is REMOVE — the same rationale already + lives at the handler's `updateIfEligible` call, so the declaration copy is the + redundant/duplicated one. + - type: llm-rubric + value: >- + For the `// we swallow the duplicate …` comment on `DLQ_DUPLICATE_DROPPED` the + verdict is MOVE — the rationale exists only at the declaration, and the fix names + the handler method (`handle` / its duplicate branch) as the destination. 
+ - type: llm-rubric + value: >- + Keeps `// 0 means unbounded, not disabled` on the `DLQ_MAX_RETRIES` constant — + a value-meaning note every reader needs (R12 contrast, not a MOVE/REMOVE). + - type: llm-rubric + value: >- + Keeps the genuine WHY already located at the handler's `updateIfEligible` + (R2); it is NOT flagged. + - type: llm-rubric + value: >- + Does not flag the bare enum members that carry no comment + (`DLQ_MESSAGE_RECEIVED`, `DLQ_REQUEUED`). + + # ---------------------------------------------------------------------------- + - description: 'eval-3 spec-id-references (payment-validator.ts)' + vars: + file: plugins/comment-review/evals/fixtures/payment-validator.ts + assert: + # "skill actually ran" proxy: plugin skills load via Agent-Skills injection, not a + # Skill() tool call, so `skill-used` / metadata.skillCalls stay empty. Assert the + # report uses the skill's own R1–R12 rule taxonomy instead. + - type: regex + value: '\bR(1[0-2]|[1-9])\b' + - type: llm-rubric + value: >- + Flags `// F1: users can submit a payment with a saved card` as R4 with verdict + REMOVE. + - type: llm-rubric + value: >- + Flags `// see Q1` as R4 REMOVE (a pure spec-id pointer; nothing survives stripping it). + - type: llm-rubric + value: >- + Flags `// §4.1 caps a single charge at 500_000 minor units` under R4 with a + verdict of REWRITE *or* REMOVE — both are defensible: once the `§4.1` spec-id + is stripped, the remainder ("caps a single charge at 500_000 minor units") + borders on R1 narration of `MAX_CHARGE_MINOR_UNITS = 500_000` + its guard, so a + strong model may delete it rather than rewrite. It must NOT be kept as-is. + - type: llm-rubric + value: >- + The handling of the `§4.1` comment never retains the token `§4.1` or `4.1`: a + REWRITE drops it from the replacement text, and a REMOVE (delete the line) + satisfies this trivially. 
+ - type: llm-rubric + value: >- + Flags `// R7: charge in source order — the ledger rejects out-of-order sequence + numbers` as R4 REWRITE, keeping the ordering rationale. + - type: llm-rubric + value: >- + The suggested fix for the `R7` comment does NOT contain the token `R7` but keeps + the ordering meaning ("source order" / "out-of-order"). + - type: llm-rubric + value: >- + Flags `/** Implements AC-3 and AC-4. */` as R4 REMOVE (a pure spec-id pointer doc). + - type: llm-rubric + value: >- + Keeps `// Retry-After handling per RFC 9110 §10.2.4` (R4 external-pin exception); + it is NOT flagged for removal. + - type: llm-rubric + value: >- + Keeps `// region R2 is EU-only; never route US tenants here` — `R2` is a real + string-literal value (`const REGION = "R2"`), not a spec-id; it is NOT flagged. + - type: llm-rubric + value: >- + Keeps `// sequential, not parallel — the gateway rate-limits per merchant IP` + as a genuine WHY (R2); it is NOT flagged. + - type: llm-rubric + value: >- + No kept or rewritten comment text in the report retains any spec-id fragment + (`F1`, `Q1`, `AC-3`, `AC-4`, `§4.1`, `R7`). Quoting the ORIGINAL comment in a + finding header is fine; the requirement is about the kept/rewritten text. + + # ---------------------------------------------------------------------------- + - description: 'eval-4 spec-id-embedded-midsentence (host-allowlist.ts)' + vars: + file: plugins/comment-review/evals/fixtures/host-allowlist.ts + assert: + # "skill actually ran" proxy: plugin skills load via Agent-Skills injection, not a + # Skill() tool call, so `skill-used` / metadata.skillCalls stay empty. Assert the + # report uses the skill's own R1–R12 rule taxonomy instead. + - type: regex + value: '\bR(1[0-2]|[1-9])\b' + - type: llm-rubric + value: >- + All three genuine-WHY comments (the R2/F3 normalize note, the Q4 cache-TTL note, + and the §2.3/§2.4 two-pass-validation note) are REWRITE — not a blanket REMOVE, + and not KEEP-as-is. 
+ - type: llm-rubric + value: >- + No rewritten comment retains the tokens `R2`, `F3`, `Q4`, `§2.3` or `§2.4`. + - type: llm-rubric + value: >- + The ordering / failure-mode rationale of each comment is preserved in its rewrite + (the WHY survives; only the spec-id token is stripped). + - type: llm-rubric + value: >- + The bare R1 narration line `// hosts are matched case-insensitively` is REMOVE. + - type: llm-rubric + value: >- + The reasoning checks whether each token is a real code value before stripping it, + rather than rationalizing the spec-id `R2` as a code value. diff --git a/plugins/comment-review/evals/prompts/review.txt b/plugins/comment-review/evals/prompts/review.txt new file mode 100644 index 0000000..b68b947 --- /dev/null +++ b/plugins/comment-review/evals/prompts/review.txt @@ -0,0 +1,6 @@ +comment review {{file}} + +Review only the comments in the file path(s) above, judging each against the +comment-quality rules (R1–R12). Return the per-comment verdict report +(KEEP / REMOVE / REWRITE / MOVE) with a one-line reason and a concrete suggested +fix for every finding, then the tally. Do not edit any files.