diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 249fca8..a4e828c 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -151,6 +151,24 @@ "clean-code", "refactoring" ] + }, + { + "name": "researcher", + "source": "./plugins/researcher", + "description": "Source-grounded research assistant: an interactive skill gathers a research brief, then launches a bundled Dynamic Workflow that fans out firecrawl retrieval (WebSearch fallback) into cited findings, gates rounds on coverage and contradictions, and renders a self-contained HTML report where every claim traces to a numbered source. The report evolves across follow-up runs.", + "version": "0.0.0", + "author": { + "name": "Mateusz Gostański (grixu)", + "email": "mateusz.gostanski@gmail.com" + }, + "category": "productivity", + "tags": [ + "research", + "web", + "firecrawl", + "citations", + "workflow" + ] } ] } diff --git a/plugins/researcher/.claude-plugin/plugin.json b/plugins/researcher/.claude-plugin/plugin.json new file mode 100644 index 0000000..cd38aab --- /dev/null +++ b/plugins/researcher/.claude-plugin/plugin.json @@ -0,0 +1,17 @@ +{ + "name": "researcher", + "version": "0.0.0", + "description": "Source-grounded research assistant: an interactive skill gathers a research brief, then launches a bundled Dynamic Workflow that fans out firecrawl retrieval (WebSearch fallback) into cited findings, gates rounds on coverage and contradictions, and renders a self-contained HTML report where every claim traces to a numbered source. 
The report evolves across follow-up runs.", + "author": { + "name": "Mateusz Gostański (grixu)", + "email": "mateusz.gostanski@gmail.com" + }, + "repository": "https://github.com/grixu/cc-toolkit", + "keywords": [ + "research", + "web", + "firecrawl", + "citations", + "workflow" + ] +} diff --git a/plugins/researcher/CHANGELOG.md b/plugins/researcher/CHANGELOG.md new file mode 100644 index 0000000..b149aa7 --- /dev/null +++ b/plugins/researcher/CHANGELOG.md @@ -0,0 +1,53 @@ +# Changelog + +All notable changes to the **researcher** plugin will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +### Added + +- `research` orchestrator skill — gathers a brief (infer-first; one consolidated prompt over only the dimensions it + can't infer: depth, recency, sources, audience), resolves the report folder (slugified goal; no index file — the + per-report `state.json` files are the registry), launches the bundled Dynamic Workflow, presents the manifest + + path, and runs the follow-up checkpoint as a structured **`AskUserQuestion`** (multiSelect — each proposed follow-up + is a selectable option, full text in the description) that extends the evolving report. +- `workflows/research.js` — the bundled Dynamic Workflow: + - **Setup** — creates the report dir and, when extending, loads only the prior report's `state.json` HEAD (with a + `schemaVersion` guard) — no read-back swarm; the prior findings stay on disk and the Synthesizer reads them later. + Tool availability (`mmdc`) is detected by the skill up front and passed in args, not preflighted here. 
+ - **Plan** — derives distinct sub-query angles so parallel retrievers don't converge on the same hit; when the goal + hinges on a named product/vendor/standard's capability, it dedicates a primary/official-docs angle so "does X + support Y" is confirmed at the source rather than inferred from third-party threads. + - **Assessor-gated round loop** — parallel firecrawl retrievers (WebSearch fallback) emit findings with typed, + verbatim evidence spans and per-source trust tiers; deterministic dedup-by-URL assigns append-only source ids; a + **Conflict-scout** (every round) and, on a deep brief, an adversarial **Verifier** feed the **Assessor**, the + loop's single gate. Bounded by a per-depth round cap (quick=1 / standard=2 / deep=3), not a token budget. The + Assessor treats a central capability claim that rests only on secondary/community sources or inference as a + **material, resolvable gap** — it sends the next round to that actor's own primary docs instead of green-lighting + on inference, and a deep brief holds a high bar — so a single round can't settle "does X support Y" by guessing. + - **Synthesizer** — runs once on green light; reconciles findings, surfaces residual contradictions with + attribution, composes an audience-neutral cited draft. On a follow-up it reads the prior findings shards + directly (`Read`, batched in one turn) and rebuilds the whole answer from all findings holistically — never + seeded by the prior prose (`answer.md` is a write-only export). + - **Editor** — the sole audience-aware stage; re-cuts for concision and the brief's audience tier and marks + earn-their-place visuals. + - **Persist** — runs before the render: snapshots the prior `output.html`, then writes state append-only as a + small `state.json` HEAD plus bounded `findings/NNN.json` shards (≤20 findings each) and `answer.md`, each in its + own `Write` so no single agent turn ever emits the whole corpus — the failure mode that aborted large/deep runs. 
+ On extend, this run's new findings are appended as **new** shards at indices continuing from the prior + `shardCount` (prior shards are never cleared or rewritten; the HEAD counts accumulate); `findings/` is cleared + only on a fresh run. + - **Composer** — renders an HTML report (semantic HTML against a fixed class vocabulary), copies the shipped + assets, compiles diagrams with a **global `mmdc`** (it never reads `report.css`, explores the dir, or downloads a + renderer — going straight to the render so it can't look hung), and renders Chart.js charts (state is already + persisted). Returns only the path + a compact manifest. +- Shipped assets — a version-pinned `chart.umd.js` (Chart.js 4.5.0) and a `report.css` (system fonts, light/dark, + ~70ch column, sticky ToC sidebar, print styles), copied into each report so it stays offline and self-contained. +- Single **evolving** HTML report per topic with sidecar `diagrams/`, `assets/`, `snapshots/`, and `findings/` (sharded + state) folders; source ids, findings shards, and content artifacts are all written append-only — follow-up runs add + new shards rather than rewriting prior ones, so older snapshots keep resolving and existing citations never break. + +[Unreleased]: https://github.com/grixu/cc-toolkit diff --git a/plugins/researcher/CONTEXT.md b/plugins/researcher/CONTEXT.md new file mode 100644 index 0000000..b679676 --- /dev/null +++ b/plugins/researcher/CONTEXT.md @@ -0,0 +1,193 @@ +# Researcher + +A Claude Code plugin that answers a research question with a **source-grounded, cited report**. +An interactive front-end skill gathers the brief, then launches a bundled Dynamic Workflow that +fans out firecrawl-backed retrieval subagents (WebSearch fallback) into findings, gates rounds on +coverage and contradictions, then synthesises a cited answer where every claim is traceable to a +numbered source — rendered as an HTML report. 
+ +## Language + +**Research brief**: +The structured spec the front-end skill produces from the user's question and answers — the goal +plus scope, depth, recency, source, and **audience** constraints. It is what drives a workflow run. +The **audience** is one of four coarse expertise tiers — `lay`, `informed`, `practitioner`, or +`expert` (`practitioner` = in the field but junior: knows the basics, not advanced terms or +abbreviations) — optionally refined by a one-line free-form descriptor (e.g. "a PM evaluating vendors"). It calibrates the +**Editor** *only* (no other stage reads it); the skill infers the tier from the question and available +context (e.g. a global CLAUDE.md), and only asks the user outright when it cannot. +_Avoid_: query, prompt + +**Source**: +A web resource (URL) discovered during retrieval, deduplicated by URL and assigned a stable +**numeric id** used for citation, plus a coarse **trust tier** — `primary/official` > +`reputable-secondary` > `community/unverified` — set by the **Retriever** at fetch. The tier lets the +**Assessor** dismiss noise conflicts (official docs vs a stale blog) without a round, the +**Synthesizer** weigh sources when reconciling, and the **Report's** Sources list show the reader the +basis. Corroboration (how many independent sources back a finding) is a further, emergent signal — not +a substitute for the tier. Ids are append-only across the evolving **Report**: follow-up runs add new +sources with new ids and never renumber existing ones, so inline citations stay valid. +_Avoid_: link, reference + +**Finding**: +A discrete claim extracted from one or more sources, tagged with the source ids it came from *and* a +typed **evidence span**. 
Each span has a `kind`: a `quote` (verbatim source text — the default, and the +only kind a cheap deterministic string-check against the scrape can confirm), an `image_region` (url + +alt/caption, for charts and infographics), or a `locator` (page/timestamp + the retriever's paraphrase, +for paywalled or non-text sources). Non-text kinds are explicitly **non-verbatim**, so the fidelity +guarantee degrades *visibly* rather than silently — a `quote` is audited by construction; anything else +announces that it isn't. +_Avoid_: result, fact + +**Report**: +The deliverable — an HTML document backed by two sidecar folders, `diagrams/` (`.mmd` sources + +compiled SVGs) and `assets/` (the vendored chart library, the shipped `report.css`, and any +irreplaceable images), carrying the stated **goal**, a numbered list of **sources**, and an **answer** whose claims cite source ids inline +(e.g. "… happened in 2024 [2][5]"). Its body is kept readable by humans *and* agents — it is routinely +consumed as documentation, so heavy artifacts go to those sidecars and the body keeps only their +references and semantic pointers. It evolves across follow-up runs rather than spawning a new file each time: every +run merges its new **Findings** and the **Editor** re-cuts the whole **answer**, so the document stays +concise as it grows. The prior file is snapshotted before each overwrite. +_Avoid_: summary, output + +**Research round**: +One pass of plan → parallel retrieval → extract → dedup (by URL, assign append-only ids) → +**conflict-scout** (a *deep* brief inserts a **Verifier** pass next). Merge is *not* a per-round step — +it is deferred to the **Synthesizer**, which runs once after the **Assessor** green-lights coverage. +Rounds run in two tiers: *within* a workflow run the +**assessor** gates them autonomously; *between* runs the user steers via follow-up questions. Bounded +by a hard round cap per depth — *not* a token budget (Claude Code does not reliably expose one). 
Both a +**Conflict-scout** conflict and a **Verifier** refutation that needs fresh evidence become gaps filled +by the *same* round retrieval — there is one information-pulling mechanism, not several. +_Avoid_: iteration + +**Retriever**: +A workflow subagent that fetches from an external source. Today: **firecrawl** (search/scrape), +with **WebSearch** as fallback. Designed to admit more retrievers later (Perplexity, Gemini deep +research) without changing the report contract. Alongside text, a retriever records **candidate +image URLs** from each page (so the Composer can later fetch the ones worth including) and assigns each +**Source** its coarse **trust tier** at fetch. +_Avoid_: scraper, crawler, fetcher + +**Assessor** (coverage assessor): +The loop's single gate. A subagent that, after a round, judges whether more research is needed — +weighing subject complexity, accumulated context size, explicit user intent (a "deep research" brief +biases toward more rounds), the **Conflict-scout's** `conflicts[]`, and — on a deep brief — the +**Verifier's** unresolved refutations. It judges each conflict's/refutation's **materiality** against +the brief's goal and planned sub-questions (a material, resolvable one is itself a gap). It green-lights +only when coverage is sufficient *and* no material, resolvable conflict or refutation remains; +otherwise it emits the gaps plus proposed follow-up questions. +_Avoid_: evaluator, critic + +**Conflict-scout**: +A subagent that runs each round after dedup, *before* the **Assessor**: it diffs the accumulated +**Findings** for contradictions and emits `conflicts[]`, each tagging the clashing finding/source ids +with a *hint* at whether it is resolvable by more retrieval. **Materiality is not the scout's call** — +the **Assessor** judges it against the brief's goal and planned sub-questions (no drafted answer exists +yet). 
The scout only detects — it neither gates the loop (the **Assessor** does) nor writes prose (the +**Synthesizer** does), and it compares claims against *each other*, not ground truth, so it is no +fact-checker. +_Avoid_: verifier, fact-checker, referee + +**Verifier**: +A depth-gated adversarial subagent — runs only on a *deep* brief, each round, after the +**Conflict-scout**. It tries to *refute* the material **Findings**, reasoning over the already-gathered +corpus (other findings, source **trust tiers**, internal logic) — it does **not** fetch itself. A +finding it can refute outright is dropped; one it cannot settle without fresh counter-evidence becomes +a gap the **Assessor** acts on (filled by an ordinary round, the same mechanism conflicts use). It +judges *truth/reliability*, where the **Conflict-scout** judges only mutual *consistency*. Quick and +standard briefs skip it. +_Avoid_: fact-checker, skeptic, critic + +**Synthesizer**: +The reasoning core. A subagent that runs **once**, only after the **Assessor** green-lights coverage: +it reads the full deduplicated **Findings** and **Sources**, reconciles findings where they can be +reconciled, surfaces with attribution the residual contradictions the **Conflict-scout** flagged as +irreducible (it never hides them), and composes the structured draft **answer** with inline `[id]` +citations. It composes **audience-neutral** — the full, faithful argument with all its nuance and +caveats, never pre-trimmed for a reader; adapting it to the brief's audience is the **Editor's** job. +Turning gathered claims into a coherent, cited argument is its work — distinct from the **Editor**, +which adapts that draft to the reader and cuts and clarifies it afterward. On a follow-up run it +re-synthesizes the *whole* answer from the accumulated findings (holistic, per ADR-0005), not just the +new material. 
+_Avoid_: writer, merger, aggregator + +**Editor**: +A subagent that adapts the draft **answer** to its reader before it is rendered — the **sole +audience-aware stage**. Guided by the brief's audience tier (`lay` / `informed` / `practitioner` / `expert`) it sets how +much jargon to define, how much prior knowledge to assume, and how dense to write: for `lay` it defines +terms inline, leads with intuition, and cuts expert-only nuance; for `informed` it assumes general +literacy but defines field-specific terms; for `practitioner` it assumes the basics yet still defines +advanced terms and expands abbreviations on first use; for `expert` it assumes the terminology (jargon +and abbreviations), trims background, and foregrounds caveats and edge cases. Throughout it adversarially cuts +redundancy and filler — without flattening the **Findings'** accuracy. It also marks where a visual (a +Mermaid diagram, table, or chart) would carry an idea better than prose — leaning on more of them for a +`lay` reader, fewer for an `expert` — and specifies what it should show, so the **Composer** renders +only visuals that earn their place. Independent of +whoever drafted the answer, so the cutting is a second pair of eyes, not self-grading. +_Avoid_: proofreader, summarizer + +**Composer**: +The final stage of a workflow run — a subagent that renders the editor-approved **answer** as the +HTML **Report**: it builds the linear document (table of contents + anchors) and keeps the HTML body +**readable by humans and agents alike** — heavy artifacts live in the sidecar `diagrams/` and `assets/` +folders and the body carries only their references, since the Report is itself read as documentation. +It does not design: the body is semantic HTML against a fixed class vocabulary, styled by a shipped +`report.css` (copied into `assets/`, linked relatively), so every Report and snapshot shares one +identity and the head carries no bespoke `

Dynamic Workflow — subagenty (do wątku głównego wraca tylko ścieżka + manifest)

2. Dobieranie kontekstu — front-end skill (wątek główny)

Research round — równolegle (×N)

kolejna runda
(cap rund + budżet)

pokrycie OK

user wybiera/dodaje pytania →
nowy run rozszerza TEN SAM
ewoluujący Report

koniec

1. Input: pytanie researchowe

Subagent Explore:
codebase

Subagent Explore:
dokumentacja

CLAUDE.md
(inferencja audience)

Zadanie:
Jira / Linear MCP

Merge →
Research brief
(+ audience)

3. Plan: podział na pod-pytania
+ ustalenie zależności

Retriever: firecrawl
(WebSearch fallback)

Retriever: firecrawl

Retriever: firecrawl

Findings + kandydaci
na obrazki [src ids]

Findings [src ids]

Findings [src ids]

Merge + cross-check →
answer (findings)

4. Assessor:
czy brakuje pokrycia?
5. Editor: tnij lanie wody,
de-jargon pod audience,
oznacz wizualizacje
6. Composer: HTML report
ToC + [n] + mmdc→SVG
+ assets, snapshot, state
7. Prezentacja: manifest inline
+ ścieżka + oferta otwarcia

Follow-up?

Koniec

\ No newline at end of file diff --git a/plugins/researcher/skills/research/SKILL.md b/plugins/researcher/skills/research/SKILL.md new file mode 100644 index 0000000..73a7c2e --- /dev/null +++ b/plugins/researcher/skills/research/SKILL.md @@ -0,0 +1,144 @@ +--- +name: research +description: "Produce a source-grounded, cited HTML research report on a question. Gathers a brief (depth, recency, source mix, audience) then launches a bundled Dynamic Workflow that fans out firecrawl retrieval into findings, gates rounds on coverage + contradictions, synthesizes a cited answer, and renders an evolving HTML report. Use for substantive research requests: 'research X', 'zbadaj/zresearchuj X', 'do a deep dive on', 'find out everything about', 'write me a report on', 'compare A vs B with sources', 'what does the evidence say about'. Invoke explicitly as /researcher:research \"<question>\". Requires Dynamic Workflows enabled + the firecrawl MCP." +argument-hint: "<question>" +disable-model-invocation: true +allowed-tools: Workflow AskUserQuestion Read Bash(echo:*) Bash(ls:*) Bash(test:*) Bash(cat:*) Bash(find:*) Bash(command:*) Bash(open:*) Bash(xdg-open:*) +--- + +# research — source-grounded research report + +Front-end orchestrator for the `researcher` workflow. Gather a brief, resolve where the report lives, launch the +bundled Dynamic Workflow, present the result, then run the follow-up checkpoint that extends the **evolving** report. + +The workflow does all the heavy lifting (retrieval, the gated round loop, synthesis, editing, HTML rendering) and +returns only a compact manifest + path — the verbose HTML never enters this conversation. + +## Context + +- Plugin root: !`echo "$CLAUDE_PLUGIN_ROOT"` +- Default output base: `./research` (one folder per report: `./research/<slug>/`) + +## 0. 
Prerequisites (check once, fail clearly) + +This skill needs **Dynamic Workflows** enabled (Claude Code v2.1.154+, paid plan; on Pro they must be enabled +per-session) and the **firecrawl MCP** installed with `mcp__firecrawl__firecrawl_search` / `firecrawl_scrape` +allow-listed. If the `Workflow` tool is unavailable, tell the user to enable Dynamic Workflows and stop — do not +attempt an inline `Agent` fan-out (that defeats the whole design; see the plugin README). + +## 1. Get the question + +- Take the question from `$ARGUMENTS`. If empty, ask the user what they want researched, then continue. + +## 2. Resolve the brief (infer first, ask once) + +Infer every dimension from the question and context; **only ask about what you genuinely cannot infer**, in a +**single** consolidated `AskUserQuestion` call (the tool allows ≤4 sub-questions). Often you will ask 1 question or +none. Never spend a sub-question on language — it is deterministically detectable (see below). + +Dimensions and their option sets (list the inferred/sensible default **first**, label it `(Recommended)`): + +| Dimension | Options (recommended first varies by inference) | Effect | +|---|---|---| +| **Depth** | Standard (balanced, 2 rounds) · Quick (1 round) · Deep (3 rounds + Verifier) | sets `maxRounds`; Deep also runs the adversarial Verifier | +| **Recency** | Recent (~2 years) · Any (incl. 
foundational) · Latest (fast-moving, newest first) | biases retriever date filters + query terms | +| **Sources** | Broad (all types, trust-weighted) · Authoritative (primary + reputable only) · Technical-academic (docs/standards/papers) | biases sub-query planning + inclusion | +| **Audience** | Informed · Lay · Practitioner · Expert | calibrates the Editor only | + +**Audience tiers** (`lay` / `informed` / `practitioner` / `expert`): `lay` = general public; `informed` = generally +literate, not a specialist; `practitioner` = in the field but junior (knows the basics, not advanced terms or +abbreviations); `expert` = fluent in the jargon. Infer from the question's framing and any context (e.g. a project +CLAUDE.md). You may also pass a one-line free-form `descriptor` (e.g. "a PM evaluating vendors") to sharpen it. + +**Language** (decision: never ask): default = **the language of the question**. Detect it from the question text. +An explicit in-question directive overrides it (e.g. "…napisz raport po angielsku" / "…in English" → English even +if the question is Polish). Pass a clear language name (e.g. `Polish`, `English`). + +Smart defaults when you choose not to ask: depth `standard`, recency `any` (or `recent`/`latest` if the topic is +clearly time-sensitive), sources `broad`, audience `informed`. + +## 3. Resolve the report folder (no index file — the per-report `state.json` files are the registry) + +Reports live under `<outputBase>/<slug>/`. There is **no global index** — discover existing reports by scanning the base +directory; each `state.json` is self-describing (`goal`, `brief`). + +1. **Within this session** — if you just finished a run and the user is following up, **reuse the folder path you + already hold** (set `extending: true`, same `slug`). No scan, no prompt. +2. **Fresh / new session** — derive `slug` = a slugified `goal` (lowercase, words joined by `-`). Then: + - `test -f <outputBase>/<slug>/state.json`? 
If it exists, read its `goal`: + - **same topic** → tell the user a report already exists and offer **Extend** (continue it) vs **Fresh** (new report). Extend → `extending: true`. Fresh on a same-slug-different-angle → pick the next free `<slug>-2`, `<slug>-3`, … + - **different goal** sharing the slug (collision) → use `<slug>-2` (etc.) for the new report. + - To extend a **different** existing report, scan `<outputBase>/*/state.json`, list their goals, and let the user pick which to continue. +3. New report → `extending: false`. + +(`mkdir` for the folder is handled by the workflow's Setup step — you only resolve the path + the extend flag.) + +## 4. Launch the workflow + +**First, detect tool availability here (cheap in this session) and pass it forward** — the workflow assumes a +**global `mmdc`** and never downloads a renderer at compose time. Run `command -v mmdc`; set `diagramsAvailable: true` +if it resolves, else `false`. Do **not** install it — if it's missing, the report simply renders without Mermaid +diagrams (reconstructing that data as tables/charts) and you may mention `pnpm add -g @mermaid-js/mermaid-cli` as an +optional one-time install for diagram support. + +Resolve the plugin root from the Context block above (call it `PLUGIN_ROOT`). Call the **Workflow** tool with the +bundled script by path and the brief as a real JSON object (not a stringified one): + +``` +Workflow({ + scriptPath: "<PLUGIN_ROOT>/workflows/research.js", + args: { + goal: "<the research question>", + depth: "quick|standard|deep", + recency: "recent|any|latest", + sources: "broad|authoritative|technical-academic", + audience: { tier: "lay|informed|practitioner|expert", descriptor: "<optional one-line descriptor>" }, + language: "<language name>", + outputBase: "./research", + slug: "<slug>", + extending: <true|false>, + diagramsAvailable: <true|false>, + pluginRoot: "<PLUGIN_ROOT>" + } +}) +``` + +`pluginRoot` must be the resolved absolute path (the Composer copies `report.css` + `chart.umd.js` from +`<PLUGIN_ROOT>/assets`). 
If the tool rejects `scriptPath`, fall back to `Read`-ing the file and passing its contents +as `script:` with the same `args`. + +The workflow may return an `error` field instead of a report (`no-goal`, `schema-mismatch`, `no-findings`, +`synthesis-failed`, `compose-failed`). Relay its `message` plainly and stop — do not retry blindly. + +## 5. Present the result + +On success the workflow returns `{ artifactPath, manifest: { title, sections[], sourceCount, roundCount }, gaps[], followups[], warnings[] }`. + +- Print the manifest **inline**: the title, the section list, and the source/round counts. +- Print the `artifactPath`. +- Surface any `warnings` (e.g. an image that failed to fetch, diagrams skipped because `mmdc` was unavailable). +- **Offer** to open it — do not auto-open: suggest `open "<artifactPath>"` on macOS (`xdg-open` on Linux). + +## 6. Follow-up checkpoint (extend the evolving report) + +When `followups[]` is non-empty you **MUST** hand the decision back through a single blocking `AskUserQuestion` +(`multiSelect: true`). **Do not** print the follow-ups as a markdown list and ask about them in prose — that is the +wrong behavior, and it is the failure mode this step exists to prevent. Build the call mechanically: + +- **One** question (e.g. *"Which follow-up(s) should I research next? (each extends this report)"*), `header: "Follow-up"`. +- **One option per `followups[]` entry.** They arrive as full-sentence questions, so **cap at 4** (AskUserQuestion + allows ≤4 options): set the option's `label` to a **short 3–6 word summary you synthesize** from the entry, and put + the **full follow-up sentence in the option's `description`**. The auto-added "Other" lets the user write their own. 
+ +Then: + +- If the user selects one or more options (and/or writes their own) → **combine the chosen follow-up texts into the + next `goal`** (join multiple picks into one coherent question), and **inherit the same `slug`, `extending: true`, and + the prior `brief` defaults** (depth, recency, sources, audience, language) so the report stays one coherent, + single-language document — then relaunch the workflow (step 4). The report is re-synthesized holistically and + snapshotted before the overwrite — this is the same evolving `output.html`, not a new file. +- If the user picks nothing / says they're done → stop. + +The tone stays calm and decision-handing-back, like the other orchestrator skills — but here "handing the decision +back" **means the `AskUserQuestion` call**, not a prose offer. (If `followups[]` is empty, skip the question and simply +note the report is complete.) diff --git a/plugins/researcher/workflows/research.js b/plugins/researcher/workflows/research.js new file mode 100644 index 0000000..4559cf0 --- /dev/null +++ b/plugins/researcher/workflows/research.js @@ -0,0 +1,917 @@ +export const meta = { + name: 'researcher', + description: + 'Source-grounded research: fan out firecrawl retrievers (WebSearch fallback) into cited findings, gate rounds on coverage + contradictions (Conflict-scout + deep-only Verifier feeding a single Assessor), synthesize once, edit for the audience, and render an evolving HTML report. 
Returns only the artifact path + a compact manifest.', + phases: [ + { title: 'Setup', detail: 'create the report dir; load the prior state.json HEAD only (no read-back), schemaVersion check' }, + { title: 'Plan', detail: 'derive distinct sub-query angles from the brief (diversity)' }, + { title: 'Research', detail: 'assessor-gated rounds: parallel retrievers → Conflict-scout → (deep) Verifier → Assessor' }, + { title: 'Synthesize', detail: 'one terminal Synthesizer reconciles findings (reading the prior shards directly on extend) into a cited, audience-neutral draft' }, + { title: 'Edit', detail: 'Editor re-cuts for concision + the brief audience tier; marks earn-their-place visuals' }, + { title: 'Compose', detail: 'Persist appends new findings shards (prior shards untouched) + the HEAD; Composer snapshots, copies assets, renders semantic HTML + diagrams/charts' }, + ], +} + +// ───────────────────────────── args / brief ────────────────────────────────── +// AGENTS-NOTE: args sometimes arrives JSON-encoded as a string instead of an object; a naive typeof +// check then drops the whole payload and silently runs against defaults. Mirror multi-skill-review.js. +let a = {} +if (args && typeof args === 'object') a = args +else if (typeof args === 'string') { try { a = JSON.parse(args) } catch { a = {} } } + +const SCHEMA_VERSION = 2 // bump when state.json shape changes; setup refuses a mismatched prior state + +// slugify is a script-side fallback only — the skill normally resolves and passes `slug` (decision J). +const slugify = (s) => String(s || 'research') + .toLowerCase().normalize('NFKD').replace(/[^\w\s-]/g, '') + .trim().replace(/[\s_-]+/g, '-').replace(/^-+|-+$/g, '').slice(0, 60) || 'research' + +const brief = { + goal: a.goal || (a.brief && a.brief.goal) || '', + depth: ['quick', 'standard', 'deep'].includes(a.depth) ? a.depth : 'standard', + recency: ['recent', 'any', 'latest'].includes(a.recency) ? 
a.recency : 'any', + sources: ['broad', 'authoritative', 'technical-academic'].includes(a.sources) ? a.sources : 'broad', + audience: { + tier: ['lay', 'informed', 'practitioner', 'expert'].includes(a.audience && a.audience.tier) + ? a.audience.tier : 'informed', + descriptor: (a.audience && a.audience.descriptor) || '', + }, + language: a.language || 'en', // the SKILL detects the question's language and passes it (decision K) +} + +const OUTPUT_BASE = a.outputBase || './research' +const SLUG = a.slug || slugify(brief.goal) +const REPORT_DIR = `${OUTPUT_BASE}/${SLUG}` +const EXTENDING = !!a.extending +const PLUGIN_ROOT = a.pluginRoot || '' // ${CLAUDE_PLUGIN_ROOT}; the Composer copies assets/ from here +// The SKILL detects tool availability up front (it runs in the main session where a quick `command -v` is cheap) +// and passes it in — the Composer assumes a GLOBAL `mmdc` (no pnpm-dlx download at compose time, ADR: fail/degrade, +// don't fetch). Default true so a direct/legacy launch still attempts diagrams. +const DIAGRAMS_AVAILABLE = a.diagramsAvailable !== false +const MMDC_CMD = 'mmdc' + +// Round budget per depth (ADR-0003: bounded by rounds, NOT a token budget). Fan-out cap is TUNABLE +// (PLAN §8 — settle during build); the probe ran 5 parallel retrievers comfortably. +const MAX_ROUNDS = ({ quick: 1, standard: 2, deep: 3 })[brief.depth] +const FANOUT = Number.isFinite(a.fanout) ? a.fanout : ({ quick: 3, standard: 5, deep: 6 })[brief.depth] +const RUN_VERIFIER = brief.depth === 'deep' // Verifier is depth-gated (ADR-0006) + +if (!brief.goal) return { error: 'no-goal', message: 'args.goal (the research question) is required.' } + +// ───────────────────────────── shared contracts (schemas) ──────────────────── +// These are the INTERNAL structured contract every stage passes (PLAN §4). Heavy HTML never travels +// through them — only structured findings/answer. Trust tiers + typed evidence spans are first-class. 
const TRUST_TIERS = ['primary', 'reputable-secondary', 'community'] // set by the Retriever at fetch
const EVIDENCE_KINDS = ['quote', 'image_region', 'locator'] // only `quote` is verbatim/string-checkable

// One retriever's raw output. The dedup step (script-side, deterministic) assigns the global
// append-only `source_id` and flattens findings — so retrievers do NOT invent global ids.
const RETRIEVER_SCHEMA = {
  type: 'object',
  required: ['sources'],
  properties: {
    sources: {
      type: 'array',
      items: {
        type: 'object',
        required: ['url', 'title', 'access_date', 'trust_tier', 'findings'],
        properties: {
          url: { type: 'string' },
          title: { type: 'string' },
          access_date: { type: 'string', description: 'UTC date the source was fetched, YYYY-MM-DD (from `date -u`)' },
          trust_tier: { type: 'string', enum: TRUST_TIERS },
          candidate_image_urls: { type: 'array', items: { type: 'string' }, description: 'chart/infographic URLs worth considering for the report' },
          findings: {
            type: 'array',
            items: {
              type: 'object',
              required: ['claim', 'evidence'],
              properties: {
                claim: { type: 'string', description: 'one discrete factual claim' },
                evidence: {
                  type: 'array',
                  items: {
                    type: 'object',
                    required: ['kind', 'value'],
                    properties: {
                      kind: { type: 'string', enum: EVIDENCE_KINDS },
                      value: { type: 'string', description: 'a `quote` is VERBATIM source text; `image_region` = url + alt/caption; `locator` = page/timestamp + paraphrase' },
                    },
                  },
                },
              },
            },
          },
        },
      },
    },
    notes: { type: 'string', description: 'optional: retrieval issues, e.g. "firecrawl unsupported on X, used WebSearch"' },
  },
}

// A source row (flat) — global append-only source_id assigned by mergeRound.
// NOTE: `access_date` and `candidate_image_urls` are NOT required, so a row loaded from a prior state.json HEAD
// may legitimately lack them — consumers must not assume they are present.
const SOURCE_ITEM_SCHEMA = {
  type: 'object',
  required: ['source_id', 'url', 'title', 'trust_tier'],
  properties: {
    source_id: { type: 'integer' },
    url: { type: 'string' },
    title: { type: 'string' },
    access_date: { type: 'string' },
    trust_tier: { type: 'string', enum: TRUST_TIERS },
    candidate_image_urls: { type: 'array', items: { type: 'string' } },
  },
}

// A finding (FLAT form) is persisted as { claim, source_ids[] (global, append-only), evidence[] {kind, value} } in
// sharded findings/NNN.json files (schema v2): the Persist write never forces an agent to emit the whole corpus in
// one turn, and on extend the Synthesizer reads the prior shards directly (no read-back round-trip through the
// orchestrator — ADR-0010). No StructuredOutput schema is bound to it — shards are written/read verbatim as JSON.

// state.json HEAD (schema v2): everything EXCEPT findings (which live in shards). findingCount/shardCount let the
// extend Synthesizer read the prior shards (findings/000..shardCount-1.json) deterministically, and let Persist
// append new shards at indices continuing from shardCount. The SKILL still reads goal/brief from here.
const HEAD_SCHEMA = {
  type: 'object',
  required: ['schemaVersion', 'sources', 'roundCount', 'shardCount', 'findingCount'],
  properties: {
    schemaVersion: { type: 'integer' },
    brief: { type: 'object' },
    goal: { type: 'string' },
    sources: { type: 'array', items: SOURCE_ITEM_SCHEMA },
    findingCount: { type: 'integer' },
    shardCount: { type: 'integer' },
    roundCount: { type: 'integer' },
  },
}

// Setup agent return: prior HEAD (when extending) + directory prep. Findings are NOT returned here — on extend the
// Synthesizer reads the prior shards itself (ADR-0010), so they never round-trip through the orchestrator. Tool
// availability (mmdc) is detected by the SKILL and passed in args, NOT preflighted here (no agent round-trip, no pnpm-dlx download).
const SETUP_SCHEMA = {
  type: 'object',
  required: ['schemaOk', 'extending'],
  properties: {
    schemaOk: { type: 'boolean', description: 'false iff a prior state.json exists with a schemaVersion this workflow cannot read' },
    extending: { type: 'boolean', description: 'true iff a usable prior state.json HEAD was loaded' },
    priorSchemaVersion: { type: ['integer', 'null'] },
    // Same shape as the HEAD, but nullable (no prior report, or an unreadable one).
    state: { ...HEAD_SCHEMA, type: ['object', 'null'] },
  },
}

const PLAN_SCHEMA = {
  type: 'object',
  required: ['subQueries'],
  properties: {
    interpretation: { type: 'string', description: 'one line: how you read the goal' },
    subQueries: {
      type: 'array',
      description: 'DISTINCT angles so parallel retrievers do not converge on the same hit',
      items: {
        type: 'object',
        required: ['angle', 'query'],
        properties: {
          angle: { type: 'string', description: 'what facet this covers (e.g. "official spec", "criticism", "benchmarks 2024+")' },
          query: { type: 'string', description: 'the search query string' },
        },
      },
    },
  },
}

const CONFLICT_SCHEMA = {
  type: 'object',
  required: ['conflicts'],
  properties: {
    conflicts: {
      type: 'array',
      items: {
        type: 'object',
        required: ['description', 'finding_ids', 'resolvable_hint'],
        properties: {
          description: { type: 'string', description: 'what contradicts what' },
          finding_ids: { type: 'array', items: { type: 'integer' } },
          source_ids: { type: 'array', items: { type: 'integer' } },
          resolvable_hint: { type: 'string', enum: ['likely', 'unlikely', 'unknown'], description: 'HINT only — materiality is the Assessor\'s call' },
        },
      },
    },
  },
}

const VERIFIER_SCHEMA = {
  type: 'object',
  required: ['refutations'],
  properties: {
    refutations: {
      type: 'array',
      items: {
        type: 'object',
        required: ['finding_id', 'verdict', 'basis'],
        properties: {
          finding_id: { type: 'integer' },
          verdict: { type: 'string', enum: ['stands', 'refuted', 'needs-evidence'], description: 'refuted ⇒ drop; needs-evidence ⇒ becomes a gap; stands ⇒ keep' },
          basis: { type: 'string', description: 'reasoning over the gathered corpus — the Verifier does NOT fetch' },
        },
      },
    },
  },
}

const ASSESSOR_SCHEMA = {
  type: 'object',
  required: ['sufficient', 'gaps', 'followups'],
  properties: {
    sufficient: { type: 'boolean' },
    reasoning: { type: 'string' },
    gaps: { type: 'array', items: { type: 'string' }, description: 'unmet coverage + material/resolvable conflicts + unresolved refutations — drive the next round' },
    followups: { type: 'array', items: { type: 'string' }, description: 'ready-to-use follow-up questions for the human checkpoint' },
  },
}

// ───────────────────────────── helpers (deterministic, no agent) ─────────────
/**
 * Merge one round of retriever outputs into the accumulating research state (mutates and returns `state`).
 *
 * Sources are deduplicated by exact URL. A URL not seen before gets the next GLOBAL append-only source id,
 * continuing from the highest id already in `state.sources` (never renumber — ADR-0005/0006). Every finding is
 * flattened into `state.findings` as { claim, source_ids: [id], evidence[] }.
 *
 * @param {{sources: object[], findings: object[]}} state accumulating state; both arrays are appended to in place
 * @param {Array<object|null|undefined>} retrieverOutputs RETRIEVER_SCHEMA-shaped outputs; malformed entries are skipped
 * @returns {object} the same `state` object
 */
function mergeRound(state, retrieverOutputs) {
  const byUrl = new Map(state.sources.map((s) => [s.url, s]))
  // Only integer ids take part in the max: a single malformed prior row must not turn every new id into NaN.
  let nextId = state.sources.reduce((m, s) => (Number.isInteger(s.source_id) ? Math.max(m, s.source_id) : m), 0) + 1
  for (const out of retrieverOutputs || []) {
    if (!out || !Array.isArray(out.sources)) continue
    for (const src of out.sources) {
      if (!src || !src.url) continue
      const images = Array.isArray(src.candidate_image_urls) ? src.candidate_image_urls : []
      let existing = byUrl.get(src.url)
      if (!existing) {
        existing = {
          source_id: nextId++,
          url: src.url,
          title: src.title || src.url,
          access_date: src.access_date || '',
          trust_tier: src.trust_tier || 'community',
          candidate_image_urls: images.slice(),
        }
        byUrl.set(src.url, existing)
        state.sources.push(existing)
      } else if (images.length) {
        // A row loaded from a prior HEAD may have no candidate_image_urls (the field is optional in
        // SOURCE_ITEM_SCHEMA) — create the list before merging instead of throwing on `.includes`.
        if (!Array.isArray(existing.candidate_image_urls)) existing.candidate_image_urls = []
        for (const u of images) if (!existing.candidate_image_urls.includes(u)) existing.candidate_image_urls.push(u)
      }
      // Tolerate a missing or non-array `findings` rather than failing the whole round on one bad source.
      const findings = Array.isArray(src.findings) ? src.findings : []
      for (const f of findings) {
        if (!f || !f.claim) continue
        state.findings.push({ claim: f.claim, source_ids: [existing.source_id], evidence: Array.isArray(f.evidence) ? f.evidence : [] })
      }
    }
  }
  return state
}

// Split findings into bounded shards so each Persist Write stays small — and each shard stays small enough for the
// extend Synthesizer to Read it back cheaply (the whole-corpus-in-one-turn emit is what aborts large/deep runs).
// 20/shard keeps a shard well under ~20KB.
const FINDINGS_PER_SHARD = 20
// Relative path of shard #i, zero-padded to three digits (findings/000.json, findings/001.json, …).
const shardName = (i) => `findings/${String(i).padStart(3, '0')}.json`
/**
 * Split `arr` into consecutive slices of at most `n` items (the last slice may be shorter; [] yields []).
 * @throws {RangeError} when `n` is not a positive integer (a zero step would otherwise never terminate)
 */
function chunk(arr, n) {
  if (!Number.isInteger(n) || n < 1) throw new RangeError(`chunk size must be a positive integer, got ${n}`)
  const out = []
  for (let i = 0; i < arr.length; i += n) out.push(arr.slice(i, i + n))
  return out
}

// Compact text view of accumulated findings/sources for prompts that reason over the corpus.
// Findings carry a #index so the Conflict-scout / Verifier / Assessor can reference them.
function corpusText(state) {
  const srcLine = (s) => ` [${s.source_id}] (${s.trust_tier}) ${s.title} — ${s.url}`
  const findLine = (f, i) => ` #${i} cites [${(f.source_ids || []).join('][')}]: ${f.claim}`
  return `SOURCES (${state.sources.length}):\n${state.sources.map(srcLine).join('\n')}\n\nFINDINGS (${state.findings.length}):\n${state.findings.map(findLine).join('\n')}`
}

// Synthesizer-only view: NO #index on findings (so the model never mistakes a finding's position for a
// citation) and an explicit list of the ONLY valid citation tokens — the SOURCE ids.
On extend, `state.sources` +// holds old + new (the citation vocabulary spans the whole report) while `state.findings` holds only THIS run's +// new findings (the prior ones are read from the shards), so the evidence block is labelled accordingly. +function synthCorpus(state, evidenceLabel = 'EVIDENCE') { + const srcLine = (s) => ` [${s.source_id}] (${s.trust_tier}) ${s.title} — ${s.url}` + const findLine = (f) => ` - ${f.claim} (supported by ${(f.source_ids || []).map((i) => `[${i}]`).join('') || '[?]'})` + const ids = state.sources.map((s) => s.source_id).join(', ') + return `SOURCES — the ONLY valid citation ids are: ${ids}\n${state.sources.map(srcLine).join('\n')}\n\n${evidenceLabel} (cite the SOURCE id(s) shown after each item — never a finding's position):\n${state.findings.map(findLine).join('\n')}` +} + +// Sources-only coverage map: "what has already been consulted" — titles + URLs, no findings. The extend Planner +// gets this (not corpusText) so it biases toward gaps/newer material without the now-empty in-memory findings. +function sourcesText(state) { + if (!state.sources.length) return 'SOURCES ALREADY CONSULTED: (none)' + const srcLine = (s) => ` [${s.source_id}] (${s.trust_tier}) ${s.title} — ${s.url}` + return `SOURCES ALREADY CONSULTED (${state.sources.length}) — treat these as covered ground; aim elsewhere:\n${state.sources.map(srcLine).join('\n')}` +} + +// ───────────────────────────── Setup ───────────────────────────────────────── +const setupPrompt = ` +You are the SETUP step for a research-report workflow. Use Bash/Read only. Do NOT research anything. + +Report directory: ${REPORT_DIR} +Extending an existing report: ${EXTENDING} +This workflow's state schemaVersion: ${SCHEMA_VERSION} + +Do exactly this: +1. Ensure the directories exist: \`mkdir -p ${REPORT_DIR}/diagrams ${REPORT_DIR}/assets ${REPORT_DIR}/snapshots ${REPORT_DIR}/findings\`. +2. Prior state HEAD: if ${REPORT_DIR}/state.json exists, Read it. 
It is a SMALL head object — it has NO findings (those live in findings/NNN.json shards). Do NOT read the shards here. + - If its schemaVersion !== ${SCHEMA_VERSION}: return schemaOk=false, extending=false, priorSchemaVersion=, state=null (the orchestrator will refuse rather than corrupt an evolving report). Do not migrate. + - Else return schemaOk=true, extending=true, priorSchemaVersion=, and state = the parsed HEAD verbatim (schemaVersion, brief, goal, sources[], roundCount, findingCount, shardCount). + - If the file does not exist: schemaOk=true, extending=false, state=null. +Return the structured object. Nothing else. (Tool availability is NOT your concern — the orchestrator already detected it.)` + +phase('Setup') +const setup = await agent(setupPrompt, { label: 'setup', phase: 'Setup', schema: SETUP_SCHEMA }) + +if (setup && setup.schemaOk === false) { + return { + error: 'schema-mismatch', + message: `Existing report at ${REPORT_DIR} has state schemaVersion ${setup.priorSchemaVersion}, but this workflow speaks v${SCHEMA_VERSION}. Refusing to corrupt it — start a fresh slug or migrate manually.`, + artifactPath: REPORT_DIR, + } +} + +const isExtending = !!(setup && setup.extending && setup.state) +const diagramsAvailable = DIAGRAMS_AVAILABLE // from args (SKILL detected a global mmdc); no compose-time fallback + +// The accumulating research state (prior HEAD, when extending; else empty). Mutated in place by the loop. +// `findings` ALWAYS starts empty: on extend the prior findings stay on disk (the Synthesizer reads the shards +// itself — ADR-0010), and the loop/round reasoning runs over THIS run's new findings only. `sources` carries the +// prior HEAD's append-only source rows so new ids continue from them and the citation vocabulary stays whole. +const state = isExtending + ? 
{ + schemaVersion: SCHEMA_VERSION, + brief: (setup.state && setup.state.brief) || brief, + goal: (setup.state && setup.state.goal) || brief.goal, + sources: Array.isArray(setup.state.sources) ? setup.state.sources : [], + findings: [], + answer: null, + roundCount: setup.state.roundCount || 0, + } + : { schemaVersion: SCHEMA_VERSION, brief, goal: brief.goal, sources: [], findings: [], answer: null, roundCount: 0 } + +// Prior-shard accounting (extend only): the Synthesizer reads findings/000..priorShardCount-1.json directly, and +// Persist appends this run's new shards at indices continuing from priorShardCount while the HEAD counts accumulate. +const priorShardCount = isExtending ? (setup.state.shardCount || 0) : 0 +const priorFindingCount = isExtending ? (setup.state.findingCount || 0) : 0 + +log(`${isExtending ? `Extending (${state.sources.length} prior sources, ${priorFindingCount} prior findings in ${priorShardCount} shard(s), round ${state.roundCount})` : 'Fresh report'} · depth=${brief.depth} maxRounds=${MAX_ROUNDS} fanout=${FANOUT} · diagrams=${diagramsAvailable ? MMDC_CMD : 'OFF'}`) + +// ───────────────────────────── Plan (distinct sub-queries) ─────────────────── +const planPrompt = ` +You plan the FIRST research round for this brief. Produce up to ${FANOUT} DISTINCT sub-query angles so that +parallel retrievers do not all converge on the same popular result (a known failure mode). Cover different +facets — official/spec sources, independent analysis, criticism/limitations, recent developments, data/benchmarks — +as fits the goal. Do NOT retrieve anything; only plan. + +CAPABILITY QUESTIONS: when the goal names specific products, vendors, tools, or standards and asks what they support +or whether they have some capability, DEDICATE at least one angle PER named entity to that entity's OWN primary/official +documentation for the EXACT capability in question — point the query at the vendor's own docs, not third-party blogs or +forum/issue threads (e.g. 
angle "Logto official docs — native RFC 7591 /register (DCR) endpoint", query targeting +Logto's documentation). "Does X support Y" must be answerable from X's primary source; do not leave it to inference. + +GOAL: ${brief.goal} +BRIEF: depth=${brief.depth}, recency=${brief.recency}, sourcePreference=${brief.sources}, language=${brief.language} +${isExtending ? `\nThis EXTENDS an existing report. Bias the angles toward GAPS and newer material, not what is already covered. Here is the coverage map — the sources already consulted (the prior findings are not shown; aim for what they did NOT cover):\n${sourcesText(state)}` : ''} + +Recency guidance: recent ⇒ favour ~last 2 years; latest ⇒ fast-moving, prioritise newest; any ⇒ include foundational. +Source guidance: broad ⇒ all types (trust-weighted); authoritative ⇒ primary + reputable only; technical-academic ⇒ docs/standards/papers. +Return the structured object — subQueries[] is REQUIRED and MUST contain at least one {angle, query} item.` + +phase('Plan') +const plan = await agent(planPrompt, { label: 'plan:sub-queries', phase: 'Plan', schema: PLAN_SCHEMA }) +const subQueries = (plan && Array.isArray(plan.subQueries) && plan.subQueries.length) + ? 
plan.subQueries.slice(0, FANOUT) + : [{ angle: 'general', query: brief.goal }] +log(`Planned ${subQueries.length} sub-query angles: ${subQueries.map((q) => q.angle).join(', ')}`) + +// ───────────────────────────── round-stage prompts ────────────────────────── +const RECENCY_HINT = { + recent: 'favour material from roughly the last 2 years', + latest: 'this is a fast-moving topic — prioritise the newest available material', + any: 'include foundational/older material where it is authoritative', +}[brief.recency] +const SOURCE_HINT = { + broad: 'all source types, weighted by trust', + authoritative: 'primary + reputable-secondary only; skip community/forum sources', + 'technical-academic': 'documentation, standards, and peer-reviewed papers', +}[brief.sources] + +const retrieverPrompt = (sq, round) => ` +You are a RESEARCH RETRIEVER. Cover ONE angle of the goal: find real web sources, read the best ones, and +extract grounded findings. Do NOT synthesise or write prose — just gather and extract. + +GOAL: ${brief.goal} +YOUR ANGLE: ${sq.angle} +STARTING QUERY (refine freely): ${sq.query} +Recency: ${RECENCY_HINT}. Sources: ${SOURCE_HINT}. Round ${round}. + +Tools: FIRST load firecrawl — ToolSearch with query "select:firecrawl_search,firecrawl_scrape". Use firecrawl_search +(pass a \`query\` string and optional \`limit\` — do NOT pass \`sources\` as a string) to find candidate pages, then +firecrawl_scrape to read the 2-4 most relevant. On every firecrawl_scrape pass \`formats: ["markdown","links"]\` and +\`onlyMainContent: true\` so the result carries the page's asset/image URLs — the \`links\` array plus the \`![](…)\` +URLs inside the returned markdown (you need these for candidate_image_urls). Do NOT call firecrawl_search_feedback. +If firecrawl errors, a site is unsupported (e.g. reddit.com — "site not supported"), or it returns a captcha/cookie-wall +instead of content, FALL BACK to WebSearch (ToolSearch "select:WebSearch") and work from its snippets. 
Record EVERY +fallback or skipped/blocked source in \`notes\` (e.g. "stripe.com restricted-keys: hCaptcha — skipped"). + +For every source you actually used: +- url, title, and a trust_tier: + "primary" = official docs/specs/standards, or the primary actor's own statement; + "reputable-secondary"= established press / well-known organisations; + "community" = forums, personal blogs, unverified posts. +- access_date: run \`date -u +%Y-%m-%d\` and use its output (workflow scripts cannot produce dates — you must). +- candidate_image_urls: from the scrape's \`links\` output and the \`![](…)\` URLs in the returned markdown, collect URLs of + CHARTS / DIAGRAMS / INFOGRAPHICS / FIGURES worth showing in a report — exclude logos, nav icons, avatars, social + buttons and tracking pixels (else []). +- findings: discrete factual claims, each with an evidence span: + prefer kind="quote" — the EXACT verbatim text from the scrape (auditable by construction); + kind="image_region" (value = url + alt/caption) for a chart/figure; + kind="locator" (value = page/section/timestamp + your paraphrase) for paywalled/non-text — explicitly non-verbatim. +CAPABILITY CLAIMS: if a finding asserts a capability of a named product/vendor/standard ("X supports Y", "X has Y"), +prefer confirming it against that actor's OWN official docs (primary tier) — scrape them when the angle points there. +If you can only find secondary/community evidence, still record the claim but say IN THE CLAIM TEXT that it is +UNCONFIRMED by a primary source (e.g. "Logto appears to lack a native /register endpoint — unconfirmed by Logto's own +docs"), rather than asserting it as established fact. +Quality over volume — a handful of well-supported findings beats many thin ones. Never fabricate a quote. +Return the structured object.` + +const conflictScoutPrompt = (st) => ` +You are the CONFLICT-SCOUT. DETECT contradictions among the accumulated findings — claims that cannot both hold, +or that materially disagree. 
You do NOT judge materiality (that is the Assessor's job) and you do NOT fetch or +fact-check against the world — you only compare findings against EACH OTHER. Reference findings by their #index below. +For each conflict, HINT whether more retrieval could likely resolve it (likely / unlikely / unknown). + +${corpusText(st)} + +Return conflicts[] (empty array if the findings are mutually consistent). EVERY conflict MUST include resolvable_hint set to exactly one of: likely, unlikely, unknown.` + +const verifierPrompt = (st) => ` +You are the VERIFIER (this is a DEEP brief). Adversarially try to REFUTE the material findings, reasoning ONLY over +the gathered corpus — other findings, source trust tiers, internal logic. You do NOT fetch anything. Reference by #index. +For each finding you challenge, give a verdict: + "refuted" = the corpus shows it is wrong or unsupported → it will be DROPPED; + "needs-evidence" = cannot be settled without fresh counter-evidence → becomes a GAP for another round; + "stands" = survives scrutiny. +Only list findings you actually challenge; anything unlisted is assumed to stand. +In particular: when a finding asserts a CAPABILITY of a named product/vendor/standard ("X supports Y", "X has Y") but +the corpus backs it only with secondary/community sources or inference — no primary/official source from X itself — +mark it "needs-evidence" (so the next round confirms it against X's own docs), NOT "stands". + +${corpusText(st)} + +Return refutations[].` + +const assessorPrompt = (st, conflicts, refutations, round, roundNotes) => ` +You are the ASSESSOR — the loop's SINGLE gate. Decide whether another research round is warranted. + +GOAL: ${brief.goal} +Planned facets: ${subQueries.map((q) => q.angle).join(', ')} +This is round ${round} of at most ${MAX_ROUNDS}. Depth intent: ${brief.depth} (a deep brief biases toward more rounds). + +${corpusText(st)} + +Conflicts detected this round: +${JSON.stringify(conflicts, null, 2)} +${RUN_VERIFIER ? 
`Verifier refutations this round:\n${JSON.stringify(refutations, null, 2)}` : '(no Verifier — quick/standard brief)'} +${(roundNotes && roundNotes.length) ? `\nRetrieval issues this round (a source was blocked, unsupported, captcha/cookie-walled, or fell back to WebSearch). Judge whether any blocked source leaves a REAL, retrievable coverage gap worth another round — if so add it to gaps[]:\n${roundNotes.map((n) => `- ${n}`).join('\n')}\n` : ''} +Judge each conflict's / refutation's MATERIALITY against the goal + planned facets. A material AND resolvable conflict, +or an unresolved (needs-evidence) refutation, is itself a gap. Green-light (sufficient=true) ONLY when coverage is +sufficient AND no material, resolvable conflict or refutation remains. Otherwise set sufficient=false and list gaps[] +that are SPECIFIC and retrievable (they become next round's retrievers). + +CAPABILITY CONFIRMATION — do NOT under-rate this. If the goal hinges on whether a specific product/vendor/standard has +some capability or support, and the corpus answers that only by INFERENCE or from secondary/community sources (no +primary/official source from that actor), that is a MATERIAL, RESOLVABLE gap: set sufficient=false and add a SPECIFIC +gap that sends a retriever to that actor's OWN primary docs (e.g. "confirm whether Logto Cloud exposes a native RFC +7591 /register endpoint against Logto's official documentation"). Do NOT downgrade such a gap to "residual / low- +materiality / one scrape would close it" and green-light anyway — if one scrape would close it, that scrape IS the next +round's job, not a follow-up to hand the human.${brief.depth === 'deep' ? ' This is a DEEP brief: hold a HIGH bar — a single round that settles a central capability by inference is NOT sufficient; chase the primary-source confirmation before green-lighting.' 
: ''} + +Always propose 2-4 ready-to-use followups[] for the human checkpoint, even when sufficient — these are NEW directions +to extend the report, NOT confirmations you should have chased this round (those go in gaps[]). +Return the structured object.` + +// ───────────────────────────── assessor-gated round loop ───────────────────── +phase('Research') +let assessment = null +const retrievalNotes = [] // blocked/unsupported/fallback sources — surfaced to the Assessor + final warnings +for (let round = 1; round <= MAX_ROUNDS; round++) { + const queries = round === 1 + ? subQueries + : ((assessment && assessment.gaps) || []).slice(0, FANOUT).map((g, i) => ({ angle: `gap-${i + 1}`, query: g })) + if (!queries.length) { log(`Round ${round}: no gaps to chase — stopping`); break } + + log(`Round ${round}/${MAX_ROUNDS}: ${queries.length} parallel retrievers`) + const outputs = await parallel(queries.map((sq) => () => + agent(retrieverPrompt(sq, round), { label: `retrieve:${sq.angle}`.slice(0, 38), phase: 'Research', schema: RETRIEVER_SCHEMA }))) + + const before = state.findings.length + mergeRound(state, outputs.filter(Boolean)) + state.roundCount++ + const roundNotes = outputs.filter(Boolean).map((o) => o.notes).filter((n) => n && n.trim()) + for (const n of roundNotes) retrievalNotes.push(`r${round}: ${n.trim()}`) + const newFindings = state.findings.length - before + log(`Round ${round}: +${newFindings} findings (total ${state.findings.length}) from ${state.sources.length} sources`) + + // Verifier — depth-gated; reasons over the corpus, drops refuted findings. Runs BEFORE the Conflict-scout so the + // scout (and the Assessor) reason over the post-splice findings, keeping conflict #index references aligned with + // the re-indexed corpus the Assessor sees. The Verifier's own refutedIdx splice (descending) is internally + // consistent because it reads the pre-splice corpus that produced those #indexes. 
+ let refutations = [] + if (RUN_VERIFIER) { + const v = await agent(verifierPrompt(state), { label: 'verifier', phase: 'Research', schema: VERIFIER_SCHEMA }) + refutations = (v && v.refutations) || [] + const refutedIdx = refutations + .filter((r) => r.verdict === 'refuted' && Number.isInteger(r.finding_id)) + .map((r) => r.finding_id) + .sort((x, y) => y - x) // descending so earlier splices don't shift later indices + for (const idx of refutedIdx) if (idx >= 0 && idx < state.findings.length) state.findings.splice(idx, 1) + if (refutedIdx.length) log(`Round ${round}: Verifier dropped ${refutedIdx.length} refuted findings`) + } + + // Conflict-scout — detection only, every round, after dedup AND after the Verifier splice so its #index + // references line up with the corpus the Assessor reasons over. + const scout = await agent(conflictScoutPrompt(state), { label: 'conflict-scout', phase: 'Research', schema: CONFLICT_SCHEMA }) + const conflicts = (scout && scout.conflicts) || [] + + // Assessor — the single gate. + assessment = await agent(assessorPrompt(state, conflicts, refutations, round, roundNotes), { label: 'assessor', phase: 'Research', schema: ASSESSOR_SCHEMA }) + const sufficient = assessment ? assessment.sufficient : true + log(`Round ${round}: Assessor ${sufficient ? 'GREEN — coverage sufficient' : `wants more (${((assessment && assessment.gaps) || []).length} gaps)`}`) + if (sufficient) break + if (newFindings === 0 && round > 1) { log('No new findings this round — stopping to avoid spinning'); break } +} + +const gaps = (assessment && assessment.gaps) || [] +const followups = (assessment && assessment.followups) || [] + +// Abort only when there is NOTHING to synthesise — no new findings AND no prior ones on disk. On extend, the prior +// findings live in the shards (priorFindingCount), so a follow-up round that happens to add nothing still re-synthesises +// (the Synthesizer reads the prior shards) rather than falsely reporting an empty report. 
+if (!state.findings.length && priorFindingCount === 0) { + return { error: 'no-findings', message: 'Retrieval produced no usable findings (firecrawl/WebSearch may be unavailable or the query too narrow).', artifactPath: REPORT_DIR, gaps, followups } +} +if (!state.findings.length) log(`No new findings this run — re-synthesising from the ${priorFindingCount} prior finding(s) on disk`) + +// ───────────────────────────── Synthesize + Edit ──────────────────────────── +const SYNTH_SCHEMA = { + type: 'object', + required: ['title', 'answer'], + properties: { + title: { type: 'string', description: 'concise report title derived from the goal' }, + answer: { type: 'string', description: 'markdown with ## section headings and inline [id] citations; audience-NEUTRAL (full, faithful argument)' }, + residual_conflicts: { + type: 'array', + items: { + type: 'object', + required: ['description'], + properties: { description: { type: 'string' }, source_ids: { type: 'array', items: { type: 'integer' } } }, + }, + }, + }, +} + +const EDITOR_SCHEMA = { + type: 'object', + required: ['title', 'answer', 'sections', 'visuals'], + properties: { + title: { type: 'string' }, + answer: { type: 'string', description: 'edited markdown; each marked visual is a {{VISUAL:N}} token on its own line' }, + sections: { type: 'array', items: { type: 'string' }, description: 'final section heading titles in order (drives the ToC + manifest)' }, + visuals: { + type: 'array', + items: { + type: 'object', + required: ['id', 'type', 'intent', 'spec'], + properties: { + id: { type: 'integer', description: 'matches the {{VISUAL:N}} token in answer' }, + type: { type: 'string', enum: ['diagram', 'chart', 'table', 'image'] }, + intent: { type: 'string', description: 'why this visual earns its place' }, + spec: { type: 'string', description: 'WHAT to show: for diagram/chart/table the actual data/relationships (Composer authors the Mermaid/Chart.js/HTML); for image, the exact source image URL to download' }, 
+ caption: { type: 'string' }, + source_ids: { type: 'array', items: { type: 'integer' }, description: 'provenance; REQUIRED for type=image (the source the image is attributed to)' }, + }, + }, + }, + open_questions: { type: 'array', items: { type: 'string' }, description: '0-4 questions worth flagging to the reader (curated from the assessor gaps)' }, + cut_summary: { type: 'string' }, + }, +} + +const TIER_GUIDE = { + lay: 'LAY reader: define jargon inline, lead with intuition and concrete analogies, and cut expert-only nuance. Lean on more visuals.', + informed: 'INFORMED reader: assume general literacy but DEFINE field-specific terms on first use.', + practitioner: 'PRACTITIONER (in the field but junior): assume the basics, but still DEFINE advanced terms and EXPAND abbreviations on first use.', + expert: 'EXPERT: assume the terminology including abbreviations, trim background, and foreground caveats and edge cases. Fewer visuals.', +}[brief.audience.tier] + +// On extend, the prior findings are NOT inlined — they live in the shard files the Synthesizer reads itself (ADR-0010). +// Hand it the deterministic shard-path list (derived from priorShardCount) plus read-and-reconcile instructions. +const priorShardList = Array.from({ length: priorShardCount }, (_, i) => `${REPORT_DIR}/${shardName(i)}`) +const synthExtendBlock = isExtending ? ` +This EXTENDS an existing report. Re-synthesise the WHOLE answer holistically from ALL findings — old + new — not just the +new material (ADR-0005); the findings are the source of truth, so NEVER seed from the prior prose. +The PRIOR findings are NOT inlined below — they live in ${priorShardCount} JSON shard file(s) you MUST read NOW: +${priorShardList.map((p) => ` - ${p}`).join('\n')} +Read EVERY one with the \`Read\` tool (NOT cat — a host hook blocks it), issuing ALL the Read calls in a SINGLE turn +(batched concurrent calls, not one-by-one). 
If any Read fails, RETRY it — you are responsible for loading every shard; +a silently dropped shard loses prior findings. Each shard is a JSON array of findings { claim, source_ids[], evidence[] }; +treat those prior findings as EQUALLY authoritative as the NEW evidence below, and reconcile any new-vs-prior +contradiction yourself (the round loop reasoned over the new findings only — it did not see the prior ones). +` : '' + +const synthPrompt = ` +You are the SYNTHESIZER — the reasoning core. You run ONCE, now that the Assessor has green-lit coverage. Compose a +single coherent, CITED draft answer to the goal from the full body of findings + sources. + +GOAL: ${brief.goal} +Write the answer in this language: ${brief.language}. +${synthExtendBlock} +${synthCorpus(state, isExtending ? 'NEW EVIDENCE gathered THIS run (the prior findings come from the shard files named above — read them)' : 'EVIDENCE')} + +Rules: +- Reconcile findings where they can be reconciled. Where a contradiction is irreducible, SURFACE it with attribution + (never hide it) and also list it in residual_conflicts. +- Every non-trivial claim cites its SOURCE id(s) inline as [id] or [id][id] (e.g. "throughput doubled [3][7]"). The ONLY + valid citation tokens are the source ids listed above (1..${state.sources.length}). NEVER cite a finding's position or a + number outside that set — a citation always points at a SOURCE. Reuse the [id]s shown beside each piece of evidence. +- Compose AUDIENCE-NEUTRAL: the full, faithful argument with all nuance and caveats. Do NOT trim for a reader — that is + the Editor's job next. +- Structure with ## section headings; lead with a direct answer to the goal, then the support. +Return: title, answer (markdown with [id] citations), residual_conflicts[].` + +// Compact inventory of source images the Editor may pull in — only when one is genuinely irreplaceable. 
+const candidateImages = state.sources + .flatMap((s) => (s.candidate_image_urls || []).map((u) => ({ source_id: s.source_id, url: u }))) + .slice(0, 30) + +const editPrompt = (synth) => ` +You are the EDITOR — the SOLE audience-aware stage and an independent second pair of eyes. Re-cut the draft for concision +and for THIS reader; mark where a visual genuinely earns its place. Never flatten accuracy or drop a citation. + +AUDIENCE: ${TIER_GUIDE}${brief.audience.descriptor ? ` Specifically: ${brief.audience.descriptor}.` : ''} +Keep the answer in this language: ${brief.language}. + +DRAFT ANSWER: +${synth.answer} + +Residual conflicts to keep visible (surface with attribution): ${JSON.stringify((synth && synth.residual_conflicts) || [])} +Assessor's open questions (curate + phrase for this reader): ${JSON.stringify(gaps)} +Source images available (mark type="image" ONLY if one is irreplaceable — otherwise RECONSTRUCT as a chart/diagram/table): ${JSON.stringify(candidateImages)} + +Tasks: +1. Cut redundancy, filler and waffle hard; keep every cited claim and its [id] citations intact.${isExtending ? ' Treat the draft as a wholesale re-cut of the ENTIRE document, not an append.' : ''} +2. Adapt density and jargon to the audience above. +3. Mark visuals that genuinely help (more for a lay reader, fewer for an expert): put a placeholder token {{VISUAL:N}} on + its OWN line where each belongs, and describe it in visuals[] — type, intent, spec, caption, source_ids. The Composer + renders ONLY what you mark here. Prefer RECONSTRUCTING a visual from the findings (a "chart" for quantitative data, a + "diagram" for flows/relationships, a "table" for structured comparisons) over pulling a source "image"; reserve "image" + for a figure that genuinely cannot be reconstructed (always attribute it via source_ids).${diagramsAvailable ? '' : ' NOTE: Mermaid diagrams are UNAVAILABLE this run (no mmdc) — do NOT use type "diagram"; prefer "table" or "chart".'} +4. 
open_questions[]: 0-4 questions worth flagging to the reader (curated from the assessor's). +Return: title, answer (edited markdown with {{VISUAL:N}} tokens), sections[], visuals[], open_questions[], cut_summary.` + +phase('Synthesize') +const synth = await agent(synthPrompt, { label: 'synthesize', phase: 'Synthesize', schema: SYNTH_SCHEMA }) +if (!synth || !synth.answer) { + return { error: 'synthesis-failed', message: 'The Synthesizer produced no answer.', artifactPath: REPORT_DIR, gaps, followups } +} + +phase('Edit') +const edited = await agent(editPrompt(synth), { label: 'edit', phase: 'Edit', schema: EDITOR_SCHEMA }) +const doc = (edited && edited.answer) + ? edited + : { title: synth.title, answer: synth.answer, sections: [], visuals: [], open_questions: gaps.slice(0, 4), cut_summary: 'editor unavailable — rendered the synthesizer draft as-is' } +state.answer = doc.answer + +// ───────────────────────────── Persist (snapshot + APPEND-ONLY sharded state, parallel) ────────────── +// state is the source of truth — written BEFORE the render so it survives a flaky HTML render. Writes are APPEND-ONLY +// (ADR-0010): `state.findings` now holds ONLY this run's new findings, so they go to NEW shards at indices continuing +// from priorShardCount; the prior shards are never cleared or rewritten on extend (a naive rewrite of the in-memory +// corpus would DELETE the prior findings from disk). Shards still write in PARALLEL (one bounded agent per shard) so +// wall-clock is the slowest single shard, not the sum. Three waves: (1) snapshot + (fresh-only) clear stale shards, +// (2) parallel NEW-shard + answer writes, (3) the HEAD last and only if every new shard landed — so state.json never +// points at a shard that was not written; the HEAD counts ACCUMULATE (prior + new). 
+const WRITE_OK_SCHEMA = { type: 'object', required: ['written'], properties: { written: { type: 'boolean' } } } +const PREP_SCHEMA = { type: 'object', required: ['ready'], properties: { ready: { type: 'boolean' }, snapshotMade: { type: 'boolean' } } } + +const findingShards = chunk(state.findings, FINDINGS_PER_SHARD) // this run's NEW findings only +const shardBase = priorShardCount // new shards are written at file indices priorShardCount + i (0 on a fresh run) +const head = { + schemaVersion: SCHEMA_VERSION, + brief: state.brief, + goal: state.goal, + sources: state.sources, + roundCount: state.roundCount, + findingCount: priorFindingCount + state.findings.length, // accumulate across runs (prior live on disk) + shardCount: priorShardCount + findingShards.length, +} +const persistFailed = (message) => ({ error: 'persist-failed', message, artifactPath: `${REPORT_DIR}/output.html`, gaps, followups }) + +phase('Compose') + +// Wave 1 — snapshot the prior report (once) + prep findings/. Must finish before the parallel shard writes. +// On a FRESH run we clear findings/ so a re-run from scratch leaves no stale shards; on an EXTEND we MUST preserve +// the prior shards (the Synthesizer just read them, and the appended HEAD still points at them) — never clear. +const prepClearStep = isExtending + ? `2. PRESERVE PRIOR SHARDS — this is an EXTEND run; the prior findings/*.json shards MUST survive (the new findings are appended as NEW shards, and state.json still points at the prior ones). Do NOT delete anything in findings/. Just ensure the dir exists: + mkdir -p ${REPORT_DIR}/findings` + : `2. CLEAR STALE SHARDS (FRESH run) so a re-run from scratch leaves none behind: + mkdir -p ${REPORT_DIR}/findings && rm -f ${REPORT_DIR}/findings/*.json 2>/dev/null; true` +const prepPrompt = ` +You are the PERSIST PREP step (Bash only). Under ${REPORT_DIR}, do exactly two things, then return: +1. 
SNAPSHOT — only if a prior output.html exists, freeze it BEFORE the renderer overwrites it: + cd ${REPORT_DIR} + if [ -f output.html ]; then mkdir -p snapshots; ts=$(date -u +%Y%m%dT%H%M%SZ); sed -e 's#="assets/#="../assets/#g' -e 's#="diagrams/#="../diagrams/#g' output.html > snapshots/output.$ts.html; fi + Use this redirect form, NOT 'sed -i' (its syntax differs macOS vs Linux). Set snapshotMade true iff you wrote one. +${prepClearStep} +Return { ready: true, snapshotMade }.` + +let prep = null +for (let attempt = 1; attempt <= 2 && !(prep && prep.ready); attempt++) { + if (attempt > 1) log('Persist prep (snapshot/clean) — retrying') + prep = await agent(prepPrompt, { label: attempt > 1 ? 'persist:prep#2' : 'persist:prep', phase: 'Compose', schema: PREP_SCHEMA }) +} +if (!prep || !prep.ready) return persistFailed('Could not prepare the report dir (snapshot/clean) after retries — aborted before render to keep the prior report consistent. Re-run to retry.') + +// Wave 2 — write every NEW findings shard + answer.md in PARALLEL. Each new shard lands at file index shardBase + i +// (append-only: shardBase = priorShardCount on extend, 0 on a fresh run), so prior shards are never touched. Each +// agent carries only its own slice (small in, bounded out). A failing thunk resolves to null (parallel never rejects), +// so each result is checked individually. +const shardWriters = findingShards.map((sh, i) => () => + agent( + `You are a PERSIST SHARD WRITER. Using the Write tool ONLY, write this exact JSON to ${REPORT_DIR}/${shardName(shardBase + i)} — copy it VERBATIM, do not reformat, merge, or add anything:\n${JSON.stringify(sh)}\nReturn { written: true } once the file is written.`, + { label: `persist:shard ${shardBase + i}`, phase: 'Compose', schema: WRITE_OK_SCHEMA }, + )) +const answerWriter = () => + agent( + `You are a PERSIST WRITER. 
Using the Write tool ONLY, write ${REPORT_DIR}/answer.md with EXACTLY the content between the markers (drop the markers themselves):\n<<>>\nReturn { written: true } once the file is written.`, + { label: 'persist:answer', phase: 'Compose', schema: WRITE_OK_SCHEMA }, + ) +const writeResults = await parallel([...shardWriters, answerWriter]) +const shardsWritten = writeResults.slice(0, findingShards.length).filter((r) => r && r.written).length +const answerWritten = !!(writeResults[findingShards.length] && writeResults[findingShards.length].written) +if (shardsWritten !== findingShards.length || !answerWritten) { + return persistFailed(`Could not write the sharded state (${shardsWritten}/${findingShards.length} shards, answer=${answerWritten}) — aborted before render to keep the prior report consistent. Re-run to retry.`) +} + +// Wave 3 — write the HEAD last, now that every shard + answer.md landed (so state.json never points at a missing +// shard). The head is tiny, so retry hard. +let headWritten = null +for (let attempt = 1; attempt <= 3 && !(headWritten && headWritten.written); attempt++) { + if (attempt > 1) log(`Persist head (state.json) attempt ${attempt} — retrying`) + headWritten = await agent( + `You are the PERSIST HEAD WRITER. Using the Write tool ONLY, write ${REPORT_DIR}/state.json with EXACTLY this JSON (copy verbatim):\n${JSON.stringify(head)}\nReturn { written: true } once the file is written.`, + { label: attempt > 1 ? `persist:head#${attempt}` : 'persist:head', phase: 'Compose', schema: WRITE_OK_SCHEMA }, + ) +} +if (!headWritten || !headWritten.written) return persistFailed('Could not write state.json head after retries — shards are on disk but the head is missing; aborted before render. 
Re-run to retry.') + +log(`Persisted ${findingShards.length} new shard(s) + head (${head.findingCount} findings across ${head.shardCount} shard(s), ${state.sources.length} sources)`) +const persisted = { stateWritten: true, snapshotMade: !!prep.snapshotMade } + +// ───────────────────────────── Compose (pure render, retryable) ───────────── +const COMPOSER_SCHEMA = { + type: 'object', + required: ['artifactPath', 'manifest'], + properties: { + artifactPath: { type: 'string', description: 'absolute or report-relative path to the written output.html' }, + manifest: { + type: 'object', + required: ['title', 'sections', 'sourceCount', 'roundCount'], + properties: { + title: { type: 'string' }, + sections: { type: 'array', items: { type: 'string' } }, + sourceCount: { type: 'integer' }, + roundCount: { type: 'integer' }, + }, + }, + snapshotMade: { type: 'boolean' }, + diagrams: { type: 'integer' }, + charts: { type: 'integer' }, + tables: { type: 'integer' }, + imagesFetched: { type: 'integer' }, + warnings: { type: 'array', items: { type: 'string' } }, + }, +} + +const sourcesForRender = state.sources.map((s) => ({ + source_id: s.source_id, url: s.url, title: s.title, access_date: s.access_date, trust_tier: s.trust_tier, +})) +const hasChart = (doc.visuals || []).some((v) => v && v.type === 'chart') + +const composerPrompt = ` +You are the COMPOSER — the final stage. Render the editor-approved answer as the HTML Report at ${REPORT_DIR}/output.html, +plus its sidecar diagrams/ and assets/. Do ALL file I/O yourself (Bash / Write / curl); the verbose HTML must NOT be +returned — return only the compact manifest. You author NO CSS and NO bespoke