From f07625ddb066ff320bd76f1952b60b827c49dc94 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20Gosta=C5=84ski?= <mateusz.gostanski@gmail.com>
Date: Thu, 18 Jun 2026 10:19:08 +0200
Subject: [PATCH 1/2] chore: root promptfoo eval tooling

Single shared root node_modules + scripts to run every plugin's eval suite.

- package.json: devDeps (@anthropic-ai/claude-agent-sdk, promptfoo) + eval scripts
- scripts/run-evals.sh: discovers and runs plugins/*/evals/promptfooconfig.yaml
- pnpm-workspace.yaml: allowBuilds=false (avoids ERR_PNPM_IGNORED_BUILDS on pnpm 11)
- .gitignore: node_modules, pnpm-lock.yaml, eval outputs
---
 .gitignore           | 10 ++++++++++
 package.json         | 14 ++++++++++++++
 pnpm-workspace.yaml  | 11 +++++++++++
 scripts/run-evals.sh | 41 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 76 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 package.json
 create mode 100644 pnpm-workspace.yaml
 create mode 100755 scripts/run-evals.sh
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e63fd40
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+# Node dev tooling (shared at the repo root; used by plugins/*/evals)
+node_modules/
+pnpm-lock.yaml
+# pnpm-workspace.yaml is committed — it carries the allowBuilds setting
+
+# promptfoo eval outputs
+eval-*.json
+*.eval.json
+output.json
+.promptfoo/
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..1cfb608
--- /dev/null
+++ b/package.json
@@ -0,0 +1,14 @@
+{
+  "name": "cc-toolkit",
+  "version": "0.0.0",
+  "private": true,
+  "description": "Claude Code plugin marketplace — root dev tooling (promptfoo eval suites live in plugins/<name>/evals/)",
+  "scripts": {
+    "eval": "bash scripts/run-evals.sh",
+    "eval:comment-review": "promptfoo eval -c plugins/comment-review/evals/promptfooconfig.yaml --no-cache --no-share -o /tmp/eval-comment-review.json"
+  },
+  "devDependencies": {
+    "@anthropic-ai/claude-agent-sdk": "latest",
+    "promptfoo": "latest"
+  }
+}
diff --git a/pnpm-workspace.yaml b/pnpm-workspace.yaml
new file mode 100644
index 0000000..a8e0538
--- /dev/null
+++ b/pnpm-workspace.yaml
@@ -0,0 +1,11 @@
+# promptfoo's transitive deps ship build scripts (esbuild, sharp, onnxruntime, …) that
+# we don't need to run for evals. Decline each one explicitly so `pnpm install` exits 0
+# instead of failing with ERR_PNPM_IGNORED_BUILDS (which also breaks pnpm's pre-run
+# dependency check used by `pnpm eval`). Flip one to `true` only if a feature needs it.
+allowBuilds:
+  '@playwright/browser-chromium': false
+  '@swc/core': false
+  esbuild: false
+  onnxruntime-node: false
+  protobufjs: false
+  sharp: false
diff --git a/scripts/run-evals.sh b/scripts/run-evals.sh
new file mode 100755
index 0000000..61c90d4
--- /dev/null
+++ b/scripts/run-evals.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+# Run every plugin's promptfoo eval suite found at plugins/<name>/evals/promptfooconfig.yaml.
+# Extra args are forwarded to `promptfoo eval` (e.g. --filter-pattern eval-3).
+#
+#   pnpm eval                          # run all suites
+#   pnpm eval -- --filter-pattern e3   # forward args to promptfoo
+#
+# Deps (promptfoo + @anthropic-ai/claude-agent-sdk) resolve from the root node_modules, so the
+# script cd's to the repo root itself. Exit: 0 = all suites passed or none found, 1 = any failed.
+set -euo pipefail
+
+# `pnpm eval -- <args>` forwards a literal "--" into the script; drop it so it isn't
+# passed on to promptfoo (where "--" would end option parsing).
+if [[ "${1:-}" == "--" ]]; then shift; fi
+
+CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." # CDPATH blanked: relative path must not resolve elsewhere
+shopt -s nullglob # an unmatched glob yields an empty array rather than the literal pattern
+configs=(plugins/*/evals/promptfooconfig.yaml)
+
+if (( ${#configs[@]} == 0 )); then
+  echo "No eval suites found (plugins/*/evals/promptfooconfig.yaml)."
+  exit 0
+fi
+
+bin="./node_modules/.bin/promptfoo"
+if [[ ! -x "$bin" ]]; then
+  echo "promptfoo not installed at root. Run: pnpm install" >&2
+  exit 1
+fi
+
+status=0 # sticky failure flag: keep running the remaining suites, fail at the end
+for cfg in "${configs[@]}"; do
+  name="${cfg#plugins/}"; name="${name%%/*}" # plugins/<name>/evals/... -> <name>
+  echo "==> ${name}  (${cfg})"
+  if ! "$bin" eval -c "$cfg" --no-cache --no-share -o "/tmp/eval-${name}.json" "$@"; then
+    echo "!! ${name} eval reported failures" >&2
+    status=1
+  fi
+done
+
+exit "$status"

From ca61a5b0d30ca7285ba3702d729843610c417bae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20Gosta=C5=84ski?= <mateusz.gostanski@gmail.com>
Date: Thu, 18 Jun 2026 10:19:22 +0200
Subject: [PATCH 2/2] test(comment-review): promptfoo eval suite
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

5 evals (R1-R12 comment-quality verdicts) running the real skill via the native
anthropic:claude-agent-sdk provider (local plugin, subscription auth).

- fixtures: 2 ported from skill-creator + 4 reconstructed (datadog, scheduler, dlq pair)
- assertions: llm-rubric per criterion + regex 'skill ran' proxy
- baseline: sonnet 41/42, opus 42/42; eval-3 §4.1 accepts REWRITE-or-REMOVE,
  eval-4 token-verification kept strict as a model-capability marker
---
 plugins/comment-review/evals/README.md        | 100 ++++++++
 .../evals/fixtures/datadog-integration.tf     |  37 +++
 .../evals/fixtures/dlq-codes.ts               |  13 +
 .../evals/fixtures/dlq.handler.ts             |  25 ++
 .../evals/fixtures/host-allowlist.ts          |  28 ++
 .../evals/fixtures/payment-validator.ts       |  33 +++
 .../evals/fixtures/scheduler.ts               |  38 +++
 .../comment-review/evals/promptfooconfig.yaml | 240 ++++++++++++++++++
 .../comment-review/evals/prompts/review.txt   |   6 +
 9 files changed, 520 insertions(+)
 create mode 100644 plugins/comment-review/evals/README.md
 create mode 100644 plugins/comment-review/evals/fixtures/datadog-integration.tf
 create mode 100644 plugins/comment-review/evals/fixtures/dlq-codes.ts
 create mode 100644 plugins/comment-review/evals/fixtures/dlq.handler.ts
 create mode 100644 plugins/comment-review/evals/fixtures/host-allowlist.ts
 create mode 100644 plugins/comment-review/evals/fixtures/payment-validator.ts
 create mode 100644 plugins/comment-review/evals/fixtures/scheduler.ts
 create mode 100644 plugins/comment-review/evals/promptfooconfig.yaml
 create mode 100644 plugins/comment-review/evals/prompts/review.txt

diff --git a/plugins/comment-review/evals/README.md b/plugins/comment-review/evals/README.md
new file mode 100644
index 0000000..d05dbbe
--- /dev/null
+++ b/plugins/comment-review/evals/README.md
@@ -0,0 +1,100 @@
+# comment-review — promptfoo eval suite
+
+Repeatable [promptfoo](https://www.promptfoo.dev/) evals for the `comment-review`
+skill. Each test runs the **real skill** (loaded as a local plugin through the
+Claude Agent SDK) against a fixture and grades the comment-quality verdicts
+(KEEP / REMOVE / REWRITE / MOVE, rules R1–R12).
+
+This is dev tooling — it is **not** shipped as part of the plugin runtime.
+
+## Layout
+
+```
+evals/
+  promptfooconfig.yaml   # provider + tests + assertions
+  prompts/review.txt     # natural-language trigger for the skill
+  fixtures/              # inputs (2 ported from skill-creator + 4 reconstructed)
+```
+
+Node dev deps (`@anthropic-ai/claude-agent-sdk` + `promptfoo`) and the run
+scripts live at the **repo root** (`package.json`, single shared `node_modules`),
+not per-plugin.
+
+| Test | Fixture(s) | Focus |
+|------|-----------|-------|
+| eval-0 | `datadog-integration.tf` | R5 banners + R4 internal-doc refs vs external RFC |
+| eval-1 | `scheduler.ts` | R4 file/doc refs, R5 banners, R1 narration, kept diagram/CVE |
+| eval-2 | `dlq-codes.ts` + `dlq.handler.ts` | R12 misplaced/duplicated rationale (REMOVE vs MOVE) |
+| eval-3 | `payment-validator.ts` | R4 spec-id pointers (REMOVE/REWRITE), token-stripping |
+| eval-4 | `host-allowlist.ts` | R4 spec-ids embedded mid-sentence → REWRITE, keep the WHY |
+
+## Prerequisites
+
+- `pnpm install` at the **repo root** (installs `@anthropic-ai/claude-agent-sdk` +
+  `promptfoo` into the shared root `node_modules`).
+- A logged-in Claude Code CLI. The target provider runs on your **subscription**
+  (`apiKeyRequired: false`) — no `ANTHROPIC_API_KEY` needed.
+
+## Run
+
+All commands run from the **repo root** (so the Agent SDK resolves from the root
+`node_modules`):
+
+```bash
+pnpm install
+
+# this suite (full)
+pnpm eval:comment-review
+
+# every plugin's eval suite (plugins/*/evals/promptfooconfig.yaml)
+pnpm eval
+
+# forward args to promptfoo, e.g. focus one test while iterating
+pnpm eval -- --filter-pattern eval-3
+
+# structural check (free)
+./node_modules/.bin/promptfoo validate config -c plugins/comment-review/evals/promptfooconfig.yaml
+
+# browse results
+./node_modules/.bin/promptfoo view
+```
+
+`scripts/run-evals.sh` (wired to `pnpm eval`) discovers every
+`plugins/<name>/evals/promptfooconfig.yaml`, runs each, and writes
+`/tmp/eval-<name>.json`.
+
+## Notes & knobs
+
+- **Model.** Default target/grader is `claude-sonnet-4-6` (cheap iteration); set a
+  stronger target in `promptfooconfig.yaml` (`providers[0].config.model`) for a
+  regression gate. Baseline: **sonnet 41/42, opus 42/42.** Two assertions are
+  instructive:
+  - *eval-3 `§4.1`* is a genuine judgment boundary — after the spec-id is stripped
+    the comment borders on R1, so REWRITE and REMOVE are both defensible (opus and
+    sonnet often pick REMOVE). The assertion accepts **either**; only "kept as-is" fails.
+  - *eval-4 token-verification* is a real capability discriminator: it asks the
+    report to verify each token against the code before stripping. Sonnet takes a
+    "letter+number ⇒ spec-id" shortcut and **fails** it; opus shows the check and
+    **passes**. Kept strict on purpose as a model marker.
+- **Flaky verdicts.** A borderline verdict can flip run-to-run; pass `--repeat 3` (for example
+  `pnpm eval -- --repeat 3`) or set `evaluateOptions.repeat` in the config for a stable gate.
+- **Grader.** llm-rubric grades on the subscription via a single-turn agent
+  (slow-ish). For a faster/cheaper grader, set `ANTHROPIC_API_KEY` and change
+  `defaultTest.options.provider` to `anthropic:messages:claude-opus-4-8`.
+- **No `skill-used` assertion.** Plugin skills load via Agent-Skills *injection*,
+  not a `Skill()` tool call, so `metadata.skillCalls` stays empty. Each test
+  instead asserts (via `regex`) that the report uses the skill's R1–R12 taxonomy —
+  output of that shape requires the skill to have loaded (you can confirm in the
+  trace: the agent reads `skills/comment-review/references/rules.md`).
+- `setting_sources: []` keeps this repo's `CLAUDE.md`/hooks out of the run, so
+  results reflect the skill, not the surrounding harness.
+
+## Provenance
+
+Fixtures `payment-validator.ts` and `host-allowlist.ts` and their assertions come
+from the skill-creator workspace
+(`../skills/comment-review-workspace/`, evals 3 & 4). Fixtures
+`datadog-integration.tf`, `scheduler.ts`, `dlq-codes.ts`, `dlq.handler.ts` were
+reconstructed from the assertion lists in that workspace's
+`skill-snapshot/evals/evals.json` (evals 0/1/2), whose original fixtures were no
+longer present.
diff --git a/plugins/comment-review/evals/fixtures/datadog-integration.tf b/plugins/comment-review/evals/fixtures/datadog-integration.tf
new file mode 100644
index 0000000..a853298
--- /dev/null
+++ b/plugins/comment-review/evals/fixtures/datadog-integration.tf
@@ -0,0 +1,37 @@
+# ===== Datadog =====
+
+# ----- Provider credentials -----
+provider "datadog" {
+  api_key = var.datadog_api_key
+  app_key = var.datadog_app_key
+}
+
+# ----- DD GCP integration — per-env SA + STS registration -----
+resource "datadog_integration_gcp_sts" "main" {
+  client_email = google_service_account.datadog.email
+
+  # account scoping lives in DD_ARCH.md §3.2
+  host_filters = "env:prod"
+
+  # the per-env hard-stop on host_filters is intentional (DD_PLAN.md T4.1)
+  automute = true
+}
+
+# ----- Metric namespace allowlist -----
+resource "datadog_metric_metadata" "billed" {
+  # only gcp.* and custom.app.* are billed under our contract — everything else is
+  # dropped here so an accidental high-cardinality metric can't blow the bill
+  metric = "gcp.gce.instance"
+}
+
+resource "datadog_synthetics_test" "refund_probe" {
+  request_definition {
+    method = "POST"
+    url    = "https://api.example.com/refund"
+
+    # Retry-After handling per RFC 9110 §10.2.4
+    retry {
+      count = 2
+    }
+  }
+}
diff --git a/plugins/comment-review/evals/fixtures/dlq-codes.ts b/plugins/comment-review/evals/fixtures/dlq-codes.ts
new file mode 100644
index 0000000..797ee3b
--- /dev/null
+++ b/plugins/comment-review/evals/fixtures/dlq-codes.ts
@@ -0,0 +1,13 @@
+export const DLQ_MAX_RETRIES = 0; // 0 means unbounded, not disabled
+
+export enum AuditDlqInfoCode {
+  DLQ_MESSAGE_RECEIVED = "audit.dlq.message.received",
+
+  // the conditional UPDATE can't distinguish a missing row from an ineligible status
+  DLQ_STATUS_NOT_ELIGIBLE = "audit.dlq.no_op.status_not_eligible",
+
+  // we swallow the duplicate so a redelivered message doesn't double-count the audit metric
+  DLQ_DUPLICATE_DROPPED = "audit.dlq.no_op.duplicate_dropped",
+
+  DLQ_REQUEUED = "audit.dlq.requeued",
+}
diff --git a/plugins/comment-review/evals/fixtures/dlq.handler.ts b/plugins/comment-review/evals/fixtures/dlq.handler.ts
new file mode 100644
index 0000000..46770dc
--- /dev/null
+++ b/plugins/comment-review/evals/fixtures/dlq.handler.ts
@@ -0,0 +1,25 @@
+import { AuditDlqInfoCode } from "./dlq-codes";
+
+export class DlqHandler {
+  private seen = new Set<string>();
+
+  constructor(private repo: AuditRepo, private bus: EventBus) {}
+
+  async handle(msg: DlqMessage) {
+    if (this.seen.has(msg.id)) {
+      this.bus.emit(AuditDlqInfoCode.DLQ_DUPLICATE_DROPPED);
+      return;
+    }
+    this.seen.add(msg.id);
+
+    // the conditional UPDATE can't distinguish a missing row from an ineligible status,
+    // so a zero-row result is ambiguous — we treat both as not-eligible rather than retry
+    const updated = await this.repo.updateIfEligible(msg.auditId, msg.status);
+    if (updated === 0) {
+      this.bus.emit(AuditDlqInfoCode.DLQ_STATUS_NOT_ELIGIBLE);
+      return;
+    }
+
+    this.bus.emit(AuditDlqInfoCode.DLQ_REQUEUED);
+  }
+}
diff --git a/plugins/comment-review/evals/fixtures/host-allowlist.ts b/plugins/comment-review/evals/fixtures/host-allowlist.ts
new file mode 100644
index 0000000..a52786e
--- /dev/null
+++ b/plugins/comment-review/evals/fixtures/host-allowlist.ts
@@ -0,0 +1,28 @@
+export class HostAllowlist {
+  private ttlSeconds: number;
+
+  // We lowercase the host and strip the port before the R2 membership check, because the
+  // upstream registry (F3) stores bare lowercase hostnames — a mismatch here silently lets
+  // a blocked host through.
+  normalize(host: string): string {
+    return host.toLowerCase().replace(/:\d+$/, "");
+  }
+
+  // Per Q4 the cache TTL must never exceed the token lifetime; otherwise a revoked token
+  // keeps passing the check until the stale entry expires on its own.
+  setTtl(seconds: number) {
+    this.ttlSeconds = Math.min(seconds, this.tokenLifetime());
+  }
+
+  // Validate in two passes (see §2.3 then §2.4): structural shape first, semantics second,
+  // so a malformed payload never reaches the semantic validator and blows up mid-pass.
+  validate(payload: unknown): Result {
+    const shaped = this.checkShape(payload);
+    return this.checkSemantics(shaped);
+  }
+
+  // hosts are matched case-insensitively
+  isAllowed(host: string): boolean {
+    return this.set.has(this.normalize(host));
+  }
+}
diff --git a/plugins/comment-review/evals/fixtures/payment-validator.ts b/plugins/comment-review/evals/fixtures/payment-validator.ts
new file mode 100644
index 0000000..eab87a5
--- /dev/null
+++ b/plugins/comment-review/evals/fixtures/payment-validator.ts
@@ -0,0 +1,33 @@
+import { dedupe, getOrCreateIdempotencyKey } from "./util";
+
+const MAX_CHARGE_MINOR_UNITS = 500_000;
+
+// region R2 is EU-only; never route US tenants here
+const REGION = "R2";
+
+// F1: users can submit a payment with a saved card
+export function submitPayment(req: PaymentRequest): PaymentResult {
+  // see Q1
+  if (req.amount <= 0) throw new InvalidAmountError();
+
+  // §4.1 caps a single charge at 500_000 minor units
+  if (req.amount > MAX_CHARGE_MINOR_UNITS) throw new ChargeTooLargeError();
+
+  // R7: charge in source order — the ledger rejects out-of-order sequence numbers
+  const charges = dedupe(req.charges).sort((a, b) => a.seq - b.seq);
+
+  const key = getOrCreateIdempotencyKey(req.userId, 24 * 60 * 60 * 1000);
+
+  // sequential, not parallel — the gateway rate-limits per merchant IP
+  const results = [];
+  for (const c of charges) {
+    results.push(postCharge(c, key, REGION));
+  }
+  return collect(results);
+}
+
+/** Implements AC-3 and AC-4. */
+export function refund(charge: Charge): RefundResult {
+  // Retry-After handling per RFC 9110 §10.2.4
+  return withRetryAfter(() => gateway.refund(charge));
+}
diff --git a/plugins/comment-review/evals/fixtures/scheduler.ts b/plugins/comment-review/evals/fixtures/scheduler.ts
new file mode 100644
index 0000000..1095032
--- /dev/null
+++ b/plugins/comment-review/evals/fixtures/scheduler.ts
@@ -0,0 +1,38 @@
+// ===== Scheduler =====
+
+// Job lifecycle:
+//   IDLE ──submit──▶ QUEUED ──claim──▶ RUNNING ──ok───▶ DONE
+//                                  └───err──▶ BACKOFF ──retry──▶ QUEUED
+import { nextDelay } from "./utils/backoff";
+
+const MAX_DELAY_MS = 30_000;
+
+export class Scheduler {
+  constructor(private broker: Broker) {}
+
+  // backoff curve lives in utils/backoff.ts; see docs/scheduling.md for the tuning rationale
+  async run(jobs: Job[]) {
+    // Sequential, not parallel — the broker rate-limits per worker token
+    for (const job of jobs) {
+      let attempt = 0;
+      while (!job.done && attempt < job.maxAttempts) {
+        // increment the attempt counter
+        attempt++;
+        try {
+          await this.claimAndRun(job);
+        } catch (err) {
+          // clamp the delay so a hostile job can't request an unbounded sleep (CVE-2024-1234)
+          const delay = Math.min(nextDelay(attempt), MAX_DELAY_MS);
+          await sleep(delay);
+        }
+      }
+    }
+  }
+
+  // ----- helpers -----
+
+  private async claimAndRun(job: Job) {
+    const token = await this.broker.claim(job.id);
+    return this.execute(job, token);
+  }
+}
diff --git a/plugins/comment-review/evals/promptfooconfig.yaml b/plugins/comment-review/evals/promptfooconfig.yaml
new file mode 100644
index 0000000..14908fc
--- /dev/null
+++ b/plugins/comment-review/evals/promptfooconfig.yaml
@@ -0,0 +1,240 @@
+# yaml-language-server: $schema=https://promptfoo.dev/config-schema.json
+description: comment-review skill — comment-quality verdicts (R1–R12) on fixtures
+
+prompts:
+  # Natural-language trigger (not a literal "/comment-review" slash — the Agent SDK reads a
+  # leading "/" as a command and rejects it; the skill fires on description match instead).
+  - file://prompts/review.txt
+
+providers:
+  - id: anthropic:claude-agent-sdk # alias: anthropic:claude-code
+    label: comment-review
+    config:
+      apiKeyRequired: false # use the local Claude Code subscription, no API key for the target
+      model: claude-sonnet-4-6 # override per run; a capable model is needed for evals 3/4 to pass
+      permission_mode: bypassPermissions # the skill's Read/Grep/Bash run unattended
+      allow_dangerously_skip_permissions: true # required alongside bypassPermissions
+      working_dir: ../../.. # repo root, so the plugin path and {{file}} paths resolve
+      max_budget_usd: 1 # per-run cost guard
+      plugins:
+        - type: local
+          path: plugins/comment-review # loads the skill under test
+      skills: all
+      setting_sources: [] # keep this repo's CLAUDE.md / hooks OUT of the eval
+      ask_user_question:
+        behavior: first_option # file-path mode shouldn't prompt, but never block if it does
+
+defaultTest:
+  options:
+    # Grader for llm-rubric. No ANTHROPIC_API_KEY in this environment, so grade on the
+    # subscription via the agent SDK (single-turn, no tools). For a faster/cheaper grader,
+    # set ANTHROPIC_API_KEY and replace this with:  provider: anthropic:messages:claude-opus-4-8
+    provider:
+      id: anthropic:claude-agent-sdk
+      config:
+        apiKeyRequired: false
+        model: claude-sonnet-4-6
+        max_turns: 1
+
+tests:
+  # ----------------------------------------------------------------------------
+  - description: 'eval-0 banners-and-doc-refs (datadog-integration.tf)'
+    vars:
+      file: plugins/comment-review/evals/fixtures/datadog-integration.tf
+    assert:
+      # "skill actually ran" proxy: plugin skills load via Agent-Skills injection, not a
+      # Skill() tool call, so `skill-used` / metadata.skillCalls stay empty. Assert the
+      # report uses the skill's own R1–R12 rule taxonomy instead.
+      - type: regex
+        value: '\bR(1[0-2]|[1-9])\b'
+      - type: llm-rubric
+        value: >-
+          Flags the top `# ===== Datadog =====` banner as R5 with verdict REMOVE.
+      - type: llm-rubric
+        value: >-
+          Flags `# ----- Provider credentials -----` as R5 REMOVE.
+      - type: llm-rubric
+        value: >-
+          Flags the titled banner `# ----- DD GCP integration — per-env SA + STS
+          registration -----` as R5 REMOVE (a descriptive title does not redeem the fence).
+      - type: llm-rubric
+        value: >-
+          Flags `# ----- Metric namespace allowlist -----` as R5 REMOVE.
+      - type: llm-rubric
+        value: >-
+          Flags the `account scoping lives in DD_ARCH.md §3.2` comment under R4
+          (internal-doc reference).
+      - type: llm-rubric
+        value: >-
+          Flags the `DD_PLAN.md T4.1` comment under R4 — keeps the "hard-stop is
+          intentional" rationale but drops the internal-doc reference.
+      - type: llm-rubric
+        value: >-
+          Keeps `# Retry-After handling per RFC 9110 §10.2.4` (R4 external-pinned-spec
+          exception) — it is not flagged for removal.
+      - type: llm-rubric
+        value: >-
+          Keeps the metric-allowlist WHY rationale ("only gcp.* and custom.app.* are
+          billed … so an accidental high-cardinality metric can't blow the bill").
+
+  # ----------------------------------------------------------------------------
+  - description: 'eval-1 file-refs-and-diagram (scheduler.ts)'
+    vars:
+      file: plugins/comment-review/evals/fixtures/scheduler.ts
+    assert:
+      # "skill actually ran" proxy: plugin skills load via Agent-Skills injection, not a
+      # Skill() tool call, so `skill-used` / metadata.skillCalls stay empty. Assert the
+      # report uses the skill's own R1–R12 rule taxonomy instead.
+      - type: regex
+        value: '\bR(1[0-2]|[1-9])\b'
+      - type: llm-rubric
+        value: >-
+          Flags the comment that references `utils/backoff.ts` and `docs/scheduling.md`
+          under R4 (cross-file / internal-doc references).
+      - type: llm-rubric
+        value: >-
+          Flags `// ----- helpers -----` as R5 REMOVE.
+      - type: llm-rubric
+        value: >-
+          Flags the top `// ===== Scheduler =====` banner as R5 REMOVE.
+      - type: llm-rubric
+        value: >-
+          Flags `// increment the attempt counter` as R1 REMOVE.
+      - type: llm-rubric
+        value: >-
+          Keeps the ASCII state-machine / job-lifecycle diagram (IDLE → QUEUED → RUNNING …)
+          — the R5 exception for a diagram that encodes information; it is NOT flagged.
+      - type: llm-rubric
+        value: >-
+          Keeps `// Sequential, not parallel — the broker rate-limits per worker token`
+          as a genuine decision/WHY (R2); it is NOT flagged.
+      - type: llm-rubric
+        value: >-
+          Keeps the `CVE-2024-1234` reference (R4 external exception); it is NOT flagged
+          for removal.
+
+  # ----------------------------------------------------------------------------
+  - description: 'eval-2 misplaced-and-duplicated-rationale (dlq-codes.ts + dlq.handler.ts)'
+    vars:
+      file: plugins/comment-review/evals/fixtures/dlq-codes.ts plugins/comment-review/evals/fixtures/dlq.handler.ts
+    assert:
+      # "skill actually ran" proxy: plugin skills load via Agent-Skills injection, not a
+      # Skill() tool call, so `skill-used` / metadata.skillCalls stay empty. Assert the
+      # report uses the skill's own R1–R12 rule taxonomy instead.
+      - type: regex
+        value: '\bR(1[0-2]|[1-9])\b'
+      - type: llm-rubric
+        value: >-
+          Flags `// the conditional UPDATE can't distinguish a missing row from an
+          ineligible status` sitting on the enum member `DLQ_STATUS_NOT_ELIGIBLE` as R12
+          (rationale parked on a declaration).
+      - type: llm-rubric
+        value: >-
+          For that enum-member comment the verdict is REMOVE — the same rationale already
+          lives at the handler's `updateIfEligible` call, so the declaration copy is the
+          redundant/duplicated one.
+      - type: llm-rubric
+        value: >-
+          For the `// we swallow the duplicate …` comment on `DLQ_DUPLICATE_DROPPED` the
+          verdict is MOVE — the rationale exists only at the declaration, and the fix names
+          the handler method (`handle` / its duplicate branch) as the destination.
+      - type: llm-rubric
+        value: >-
+          Keeps `// 0 means unbounded, not disabled` on the `DLQ_MAX_RETRIES` constant —
+          a value-meaning note every reader needs (R12 contrast, not a MOVE/REMOVE).
+      - type: llm-rubric
+        value: >-
+          Keeps the genuine WHY already located at the handler's `updateIfEligible`
+          (R2); it is NOT flagged.
+      - type: llm-rubric
+        value: >-
+          Does not flag the bare enum members that carry no comment
+          (`DLQ_MESSAGE_RECEIVED`, `DLQ_REQUEUED`).
+
+  # ----------------------------------------------------------------------------
+  - description: 'eval-3 spec-id-references (payment-validator.ts)'
+    vars:
+      file: plugins/comment-review/evals/fixtures/payment-validator.ts
+    assert:
+      # "skill actually ran" proxy: plugin skills load via Agent-Skills injection, not a
+      # Skill() tool call, so `skill-used` / metadata.skillCalls stay empty. Assert the
+      # report uses the skill's own R1–R12 rule taxonomy instead.
+      - type: regex
+        value: '\bR(1[0-2]|[1-9])\b'
+      - type: llm-rubric
+        value: >-
+          Flags `// F1: users can submit a payment with a saved card` as R4 with verdict
+          REMOVE.
+      - type: llm-rubric
+        value: >-
+          Flags `// see Q1` as R4 REMOVE (a pure spec-id pointer; nothing survives stripping it).
+      - type: llm-rubric
+        value: >-
+          Flags `// §4.1 caps a single charge at 500_000 minor units` under R4 with a
+          verdict of REWRITE *or* REMOVE — both are defensible: once the `§4.1` spec-id
+          is stripped, the remainder ("caps a single charge at 500_000 minor units")
+          borders on R1 narration of `MAX_CHARGE_MINOR_UNITS = 500_000` + its guard, so a
+          strong model may delete it rather than rewrite. It must NOT be kept as-is.
+      - type: llm-rubric
+        value: >-
+          The handling of the `§4.1` comment never retains the token `§4.1` or `4.1`: a
+          REWRITE drops it from the replacement text, and a REMOVE (delete the line)
+          satisfies this trivially.
+      - type: llm-rubric
+        value: >-
+          Flags `// R7: charge in source order — the ledger rejects out-of-order sequence
+          numbers` as R4 REWRITE, keeping the ordering rationale.
+      - type: llm-rubric
+        value: >-
+          The suggested fix for the `R7` comment does NOT contain the token `R7` but keeps
+          the ordering meaning ("source order" / "out-of-order").
+      - type: llm-rubric
+        value: >-
+          Flags `/** Implements AC-3 and AC-4. */` as R4 REMOVE (a pure spec-id pointer doc).
+      - type: llm-rubric
+        value: >-
+          Keeps `// Retry-After handling per RFC 9110 §10.2.4` (R4 external-pin exception);
+          it is NOT flagged for removal.
+      - type: llm-rubric
+        value: >-
+          Keeps `// region R2 is EU-only; never route US tenants here` — `R2` is a real
+          string-literal value (`const REGION = "R2"`), not a spec-id; it is NOT flagged.
+      - type: llm-rubric
+        value: >-
+          Keeps `// sequential, not parallel — the gateway rate-limits per merchant IP`
+          as a genuine WHY (R2); it is NOT flagged.
+      - type: llm-rubric
+        value: >-
+          No kept or rewritten comment text in the report retains any spec-id fragment
+          (`F1`, `Q1`, `AC-3`, `AC-4`, `§4.1`, `R7`). Quoting the ORIGINAL comment in a
+          finding header is fine; the requirement is about the kept/rewritten text.
+
+  # ----------------------------------------------------------------------------
+  - description: 'eval-4 spec-id-embedded-midsentence (host-allowlist.ts)'
+    vars:
+      file: plugins/comment-review/evals/fixtures/host-allowlist.ts
+    assert:
+      # "skill actually ran" proxy: plugin skills load via Agent-Skills injection, not a
+      # Skill() tool call, so `skill-used` / metadata.skillCalls stay empty. Assert the
+      # report uses the skill's own R1–R12 rule taxonomy instead.
+      - type: regex
+        value: '\bR(1[0-2]|[1-9])\b'
+      - type: llm-rubric # the fixture's three WHY comments: R2+F3 (one comment), Q4, §2.3/§2.4
+        value: >-
+          All three genuine-WHY comments (the R2/F3 normalize note, the Q4 cache-TTL note,
+          and the §2.3/§2.4 two-pass-validation note) are REWRITE — not a blanket REMOVE,
+          and not KEEP-as-is.
+      - type: llm-rubric
+        value: >-
+          No rewritten comment retains the tokens `R2`, `F3`, `Q4`, `§2.3` or `§2.4`.
+      - type: llm-rubric
+        value: >-
+          The ordering / failure-mode rationale of each comment is preserved in its rewrite
+          (the WHY survives; only the spec-id token is stripped).
+      - type: llm-rubric
+        value: >-
+          The bare R1 narration line `// hosts are matched case-insensitively` is REMOVE.
+      - type: llm-rubric
+        value: >-
+          The reasoning checks whether each token is a real code value before stripping it,
+          rather than rationalizing the spec-id `R2` as a code value.
diff --git a/plugins/comment-review/evals/prompts/review.txt b/plugins/comment-review/evals/prompts/review.txt
new file mode 100644
index 0000000..b68b947
--- /dev/null
+++ b/plugins/comment-review/evals/prompts/review.txt
@@ -0,0 +1,6 @@
+comment review {{file}}
+
+Review only the comments in the file path(s) above, judging each against the
+comment-quality rules (R1–R12). Return the per-comment verdict report
+(KEEP / REMOVE / REWRITE / MOVE) with a one-line reason and a concrete suggested
+fix for every finding, then the tally. Do not edit any files.