grixu · grixu · Jun 18, 2026 · Jun 18, 2026 · Jun 18, 2026
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,10 @@
+# Node dev tooling (shared at the repo root; used by plugins/*/evals)
+node_modules/
+pnpm-lock.yaml
+# pnpm-workspace.yaml is committed — it carries the onlyBuiltDependencies setting
+
+# promptfoo eval outputs
+eval-*.json
+*.eval.json
+output.json
+.promptfoo/
diff --git a/package.json b/package.json
@@ -0,0 +1,14 @@
+{
+  "name": "cc-toolkit",
+  "version": "0.0.0",
+  "private": true,
+  "description": "Claude Code plugin marketplace — root dev tooling (promptfoo eval suites live in plugins/<name>/evals/)",
+  "scripts": {
+    "eval": "bash scripts/run-evals.sh",
+    "eval:comment-review": "promptfoo eval -c plugins/comment-review/evals/promptfooconfig.yaml --no-cache --no-share -o /tmp/eval-comment-review.json"
+  },
+  "devDependencies": {
+    "@anthropic-ai/claude-agent-sdk": "latest",
+    "promptfoo": "latest"
+  }
+}
diff --git a/plugins/comment-review/evals/README.md b/plugins/comment-review/evals/README.md
@@ -0,0 +1,100 @@
+# comment-review — promptfoo eval suite
+
+Repeatable [promptfoo](https://www.promptfoo.dev/) evals for the `comment-review`
+skill. Each test runs the **real skill** (loaded as a local plugin through the
+Claude Agent SDK) against a fixture and grades the comment-quality verdicts
+(KEEP / REMOVE / REWRITE / MOVE, rules R1–R12).
+
+This is dev tooling — it is **not** shipped as part of the plugin runtime.
+
+## Layout
+
+```
+evals/
+  promptfooconfig.yaml   # provider + tests + assertions
+  prompts/review.txt     # natural-language trigger for the skill
+  fixtures/              # inputs (2 ported from skill-creator + 4 reconstructed)
+```
+
+Node dev deps (`@anthropic-ai/claude-agent-sdk` + `promptfoo`) and the run
+scripts live at the **repo root** (`package.json`, single shared `node_modules`),
+not per-plugin.
+
+| Test | Fixture(s) | Focus |
+|------|-----------|-------|
+| eval-0 | `datadog-integration.tf` | R5 banners + R4 internal-doc refs vs external RFC |
+| eval-1 | `scheduler.ts` | R4 file/doc refs, R5 banners, R1 narration, kept diagram/CVE |
+| eval-2 | `dlq-codes.ts` + `dlq.handler.ts` | R12 misplaced/duplicated rationale (REMOVE vs MOVE) |
+| eval-3 | `payment-validator.ts` | R4 spec-id pointers (REMOVE/REWRITE), token-stripping |
+| eval-4 | `host-allowlist.ts` | R4 spec-ids embedded mid-sentence → REWRITE, keep the WHY |
+
+## Prerequisites
+
+- `pnpm install` at the **repo root** (installs `@anthropic-ai/claude-agent-sdk` +
+  `promptfoo` into the shared root `node_modules`).
+- A logged-in Claude Code CLI. The target provider runs on your **subscription**
+  (`apiKeyRequired: false`) — no `ANTHROPIC_API_KEY` needed.
+
+## Run
+
+All commands run from the **repo root** (so the Agent SDK resolves from the root
+`node_modules`):
+
+```bash
+pnpm install
+
+# this suite (full)
+pnpm eval:comment-review
+
+# every plugin's eval suite (each plugins/<name>/evals/promptfooconfig.yaml)
+pnpm eval
+
+# forward args to promptfoo, e.g. focus one test while iterating
+pnpm eval -- --filter-pattern eval-3
+
+# structural check of the config only (free: makes no model calls)
+./node_modules/.bin/promptfoo validate config -c plugins/comment-review/evals/promptfooconfig.yaml
+
+# browse results
+./node_modules/.bin/promptfoo view
+```
+
+`scripts/run-evals.sh` (wired to `pnpm eval`) discovers every
+`plugins/<name>/evals/promptfooconfig.yaml`, runs each, and writes
+`/tmp/eval-<name>.json`.
+
+## Notes & knobs
+
+- **Model.** Default target/grader is `claude-sonnet-4-6` (cheap iteration); set a
+  stronger target in `promptfooconfig.yaml` (`providers[0].config.model`) for a
+  regression gate. Baseline: **sonnet 41/42, opus 42/42.** Two assertions are
+  instructive:
+  - *eval-3 `§4.1`* is a genuine judgment boundary — after the spec-id is stripped
+    the comment borders on R1, so REWRITE and REMOVE are both defensible (opus and
+    sonnet often pick REMOVE). The assertion accepts **either**; only "kept as-is" fails.
+  - *eval-4 token-verification* is a real capability discriminator: it asks the
+    report to verify each token against the code before stripping. Sonnet takes a
+    "letter+number ⇒ spec-id" shortcut and **fails** it; opus shows the check and
+    **passes**. Kept strict on purpose as a model marker.
+- **Flaky verdicts.** A borderline verdict can flip run-to-run; add `repeat: 2`/`3`
+  (or `defaultTest.options`) if you want a stable gate.
+- **Grader.** `llm-rubric` assertions are graded on the subscription by a single-turn
+  agent, which is slow. For a faster/cheaper grader, set `ANTHROPIC_API_KEY` and change
+  `defaultTest.options.provider` to `anthropic:messages:claude-opus-4-8`.
+- **No `skill-used` assertion.** Plugin skills load via Agent-Skills *injection*,
+  not a `Skill()` tool call, so `metadata.skillCalls` stays empty. Each test
+  instead asserts (via `regex`) that the report uses the skill's R1–R12 taxonomy —
+  output of that shape requires the skill to have loaded (you can confirm in the
+  trace: the agent reads `skills/comment-review/references/rules.md`).
+- `setting_sources: []` keeps this repo's `CLAUDE.md`/hooks out of the run, so
+  results reflect the skill, not the surrounding harness.
+
+## Provenance
+
+Fixtures `payment-validator.ts` and `host-allowlist.ts` and their assertions come
+from the skill-creator workspace
+(`../skills/comment-review-workspace/`, evals 3 & 4). Fixtures
+`datadog-integration.tf`, `scheduler.ts`, `dlq-codes.ts`, `dlq.handler.ts` were
+reconstructed from the assertion lists in that workspace's
+`skill-snapshot/evals/evals.json` (evals 0/1/2), whose original fixtures were no
+longer present.
diff --git a/plugins/comment-review/evals/fixtures/datadog-integration.tf b/plugins/comment-review/evals/fixtures/datadog-integration.tf
@@ -0,0 +1,37 @@
+# ===== Datadog =====
+
+# ----- Provider credentials -----
+provider "datadog" {
+  api_key = var.datadog_api_key
+  app_key = var.datadog_app_key
+}
+
+# ----- DD GCP integration — per-env SA + STS registration -----
+resource "datadog_integration_gcp_sts" "main" {
+  client_email = google_service_account.datadog.email
+
+  # account scoping lives in DD_ARCH.md §3.2
+  host_filters = "env:prod"
+
+  # the per-env hard-stop on host_filters is intentional (DD_PLAN.md T4.1)
+  automute = true
+}
+
+# ----- Metric namespace allowlist -----
+resource "datadog_metric_metadata" "billed" {
+  # only gcp.* and custom.app.* are billed under our contract — everything else is
+  # dropped here so an accidental high-cardinality metric can't blow the bill
+  metric = "gcp.gce.instance"
+}
+
+resource "datadog_synthetics_test" "refund_probe" {
+  request_definition {
+    method = "POST"
+    url    = "https://api.example.com/refund"
+
+    # Retry-After handling per RFC 9110 §10.2.4
+    retry {
+      count = 2
+    }
+  }
+}
diff --git a/plugins/comment-review/evals/fixtures/dlq-codes.ts b/plugins/comment-review/evals/fixtures/dlq-codes.ts
@@ -0,0 +1,13 @@
+export const DLQ_MAX_RETRIES = 0; // 0 means unbounded, not disabled
+
+export enum AuditDlqInfoCode {
+  DLQ_MESSAGE_RECEIVED = "audit.dlq.message.received",
+
+  // the conditional UPDATE can't distinguish a missing row from an ineligible status
+  DLQ_STATUS_NOT_ELIGIBLE = "audit.dlq.no_op.status_not_eligible",
+
+  // we swallow the duplicate so a redelivered message doesn't double-count the audit metric
+  DLQ_DUPLICATE_DROPPED = "audit.dlq.no_op.duplicate_dropped",
+
+  DLQ_REQUEUED = "audit.dlq.requeued",
+}
diff --git a/plugins/comment-review/evals/fixtures/dlq.handler.ts b/plugins/comment-review/evals/fixtures/dlq.handler.ts
@@ -0,0 +1,25 @@
+import { AuditDlqInfoCode } from "./dlq-codes";
+
+export class DlqHandler {
+  private seen = new Set<string>();
+
+  constructor(private repo: AuditRepo, private bus: EventBus) {}
+
+  async handle(msg: DlqMessage) {
+    if (this.seen.has(msg.id)) {
+      this.bus.emit(AuditDlqInfoCode.DLQ_DUPLICATE_DROPPED);
+      return;
+    }
+    this.seen.add(msg.id);
+
+    // the conditional UPDATE can't distinguish a missing row from an ineligible status,
+    // so a zero-row result is ambiguous — we treat both as not-eligible rather than retry
+    const updated = await this.repo.updateIfEligible(msg.auditId, msg.status);
+    if (updated === 0) {
+      this.bus.emit(AuditDlqInfoCode.DLQ_STATUS_NOT_ELIGIBLE);
+      return;
+    }
+
+    this.bus.emit(AuditDlqInfoCode.DLQ_REQUEUED);
+  }
+}
diff --git a/plugins/comment-review/evals/fixtures/host-allowlist.ts b/plugins/comment-review/evals/fixtures/host-allowlist.ts
@@ -0,0 +1,28 @@
+export class HostAllowlist {
+  private ttlSeconds: number;
+
+  // We lowercase the host and strip the port before the R2 membership check, because the
+  // upstream registry (F3) stores bare lowercase hostnames — a mismatch here silently lets
+  // a blocked host through.
+  normalize(host: string): string {
+    return host.toLowerCase().replace(/:\d+$/, "");
+  }
+
+  // Per Q4 the cache TTL must never exceed the token lifetime; otherwise a revoked token
+  // keeps passing the check until the stale entry expires on its own.
+  setTtl(seconds: number) {
+    this.ttlSeconds = Math.min(seconds, this.tokenLifetime());
+  }
+
+  // Validate in two passes (see §2.3 then §2.4): structural shape first, semantics second,
+  // so a malformed payload never reaches the semantic validator and blows up mid-pass.
+  validate(payload: unknown): Result {
+    const shaped = this.checkShape(payload);
+    return this.checkSemantics(shaped);
+  }
+
+  // hosts are matched case-insensitively
+  isAllowed(host: string): boolean {
+    return this.set.has(this.normalize(host));
+  }
+}
diff --git a/plugins/comment-review/evals/fixtures/payment-validator.ts b/plugins/comment-review/evals/fixtures/payment-validator.ts
@@ -0,0 +1,33 @@
+import { dedupe, getOrCreateIdempotencyKey } from "./util";
+
+const MAX_CHARGE_MINOR_UNITS = 500_000;
+
+// region R2 is EU-only; never route US tenants here
+const REGION = "R2";
+
+// F1: users can submit a payment with a saved card
+export function submitPayment(req: PaymentRequest): PaymentResult {
+  // see Q1
+  if (req.amount <= 0) throw new InvalidAmountError();
+
+  // §4.1 caps a single charge at 500_000 minor units
+  if (req.amount > MAX_CHARGE_MINOR_UNITS) throw new ChargeTooLargeError();
+
+  // R7: charge in source order — the ledger rejects out-of-order sequence numbers
+  const charges = dedupe(req.charges).sort((a, b) => a.seq - b.seq);
+
+  const key = getOrCreateIdempotencyKey(req.userId, 24 * 60 * 60 * 1000);
+
+  // sequential, not parallel — the gateway rate-limits per merchant IP
+  const results = [];
+  for (const c of charges) {
+    results.push(postCharge(c, key, REGION));
+  }
+  return collect(results);
+}
+
+/** Implements AC-3 and AC-4. */
+export function refund(charge: Charge): RefundResult {
+  // Retry-After handling per RFC 9110 §10.2.4
+  return withRetryAfter(() => gateway.refund(charge));
+}
diff --git a/plugins/comment-review/evals/fixtures/scheduler.ts b/plugins/comment-review/evals/fixtures/scheduler.ts
@@ -0,0 +1,38 @@
+// ===== Scheduler =====
+
+// Job lifecycle:
+//   IDLE ──submit──▶ QUEUED ──claim──▶ RUNNING ──ok───▶ DONE
+//                                  └───err──▶ BACKOFF ──retry──▶ QUEUED
+import { nextDelay } from "./utils/backoff";
+
+const MAX_DELAY_MS = 30_000;
+
+export class Scheduler {
+  constructor(private broker: Broker) {}
+
+  // backoff curve lives in utils/backoff.ts; see docs/scheduling.md for the tuning rationale
+  async run(jobs: Job[]) {
+    // Sequential, not parallel — the broker rate-limits per worker token
+    for (const job of jobs) {
+      let attempt = 0;
+      while (!job.done && attempt < job.maxAttempts) {
+        // increment the attempt counter
+        attempt++;
+        try {
+          await this.claimAndRun(job);
+        } catch (err) {
+          // clamp the delay so a hostile job can't request an unbounded sleep (CVE-2024-1234)
+          const delay = Math.min(nextDelay(attempt), MAX_DELAY_MS);
+          await sleep(delay);
+        }
+      }
+    }
+  }
+
+  // ----- helpers -----
+
+  private async claimAndRun(job: Job) {
+    const token = await this.broker.claim(job.id);
+    return this.execute(job, token);
+  }
+}