From 078fb4d5dd40577f1fb08abb219de45d00855643 Mon Sep 17 00:00:00 2001 From: David Daniel Gonzalez Date: Wed, 3 Jun 2026 17:00:33 -0400 Subject: [PATCH 1/9] feat(resources): parse and validate local HTML resources (#112) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make local .html/.htm files first-class resources alongside markdown: - html-link-parser.ts (parse5): extract <a href> / <img src> links, id/name fragment anchors, and well-formedness diagnostics into the shared ParseResult shape - ResourceRegistry: route .html/.htm through parseHtml; discover HTML in crawls; persist optional strict anchors/parseErrors on ResourceMetadata - Replace the headings map with a format-neutral fragment index (Map<string, Set<string>>) and a tri-state checkAnchor: skip un-indexed targets (general false-positive fix), HTML case-sensitive / markdown lowercased — enables cross-format anchor validation - MALFORMED_HTML (info) validation code emitted from validate(), documented - Unit + integration tests: discovery, cross-format anchors, skip-un-indexed, malformed-HTML emission --- bun.lock | 7 +- docs/validation-codes.md | 7 + .../schemas/validation-config.json | 2 + packages/agent-schema/src/validation-codes.ts | 6 + .../test/validation-codes-html.test.ts | 17 ++ packages/resources/package.json | 1 + .../src/frontmatter-link-validator.ts | 8 +- packages/resources/src/html-link-parser.ts | 129 ++++++++++ packages/resources/src/link-parser.ts | 21 +- packages/resources/src/link-validator.ts | 96 +++----- packages/resources/src/resource-registry.ts | 90 +++++-- .../src/schemas/resource-metadata.ts | 16 +- .../test/frontmatter-link-validator.test.ts | 19 +- .../resources/test/html-link-parser.test.ts | 59 +++++ ...nk-validator-gitignore.integration.test.ts | 3 +- .../link-validator.integration.test.ts | 227 +++++++++++------- .../resource-parser.integration.test.ts | 33 +-- .../resource-registry.integration.test.ts | 69 ++++-- .../test/link-validator-helpers.test.ts | 24 ++++ 
.../resources/test/metadata-schema.test.ts | 28 ++- packages/resources/test/test-helpers.ts | 68 ++++-- 21 files changed, 665 insertions(+), 265 deletions(-) create mode 100644 packages/agent-schema/test/validation-codes-html.test.ts create mode 100644 packages/resources/src/html-link-parser.ts create mode 100644 packages/resources/test/html-link-parser.test.ts diff --git a/bun.lock b/bun.lock index 81cdaaf9..82499473 100644 --- a/bun.lock +++ b/bun.lock @@ -285,6 +285,7 @@ "ajv-formats": "^3.0.1", "github-slugger": "^2.0.0", "markdown-link-check": "^3.14.2", + "parse5": "^7.2.1", "picomatch": "^4.0.3", "remark-frontmatter": "^5.0.0", "remark-gfm": "^4.0.0", @@ -1307,7 +1308,7 @@ "enhanced-resolve": ["enhanced-resolve@5.19.0", "", { "dependencies": { "graceful-fs": "^4.2.4", "tapable": "^2.3.0" } }, "sha512-phv3E1Xl4tQOShqSte26C7Fl84EwUdZsyOuSSk9qtAGyyQs2s3jJzComh+Abf4g187lUUAvH+H26omrqia2aGg=="], - "entities": ["entities@4.5.0", "", {}, "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw=="], + "entities": ["entities@6.0.1", "", {}, "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g=="], "environment": ["environment@1.1.0", "", {}, "sha512-xUtoPkMggbz0MPyPiIWr1Kp4aeWJjDZ6SMvURhimjdZgsRuDplF5/s9hcgGhyXMhs+6vpnuoiZ2kFiu3FMnS8Q=="], @@ -2581,6 +2582,8 @@ "cross-spawn/which": ["which@2.0.2", "", { "dependencies": { "isexe": "^2.0.0" }, "bin": { "node-which": "./bin/node-which" } }, "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA=="], + "dom-serializer/entities": ["entities@4.5.0", "", {}, "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw=="], + "encoding-sniffer/iconv-lite": ["iconv-lite@0.6.3", "", { "dependencies": { "safer-buffer": ">= 2.1.2 < 3.0.0" } }, "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw=="], "eslint/ajv": ["ajv@6.12.6", "", { 
"dependencies": { "fast-deep-equal": "^3.1.1", "fast-json-stable-stringify": "^2.0.0", "json-schema-traverse": "^0.4.1", "uri-js": "^4.2.2" } }, "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g=="], @@ -2661,8 +2664,6 @@ "p-retry/retry": ["retry@0.13.1", "", {}, "sha512-XQBQ3I8W1Cge0Seh+6gjj03LbmRFWuoszgK9ooCpwYIrhhoO80pfq4cUkU5DkknwfOfFteRwlZ56PYOGYyFWdg=="], - "parse5/entities": ["entities@6.0.1", "", {}, "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g=="], - "path-scurry/minipass": ["minipass@7.1.2", "", {}, "sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw=="], "proxy-agent/lru-cache": ["lru-cache@7.18.3", "", {}, "sha512-jumlc0BIUrS3qJGgIkWZsyfAM7NCWiBcCDhnd+3NNM5KbBmLTgHVfWBcg6W+rLUsIpzpERPsvwUP7CckAQSOoA=="], diff --git a/docs/validation-codes.md b/docs/validation-codes.md index 6b264ab7..6d65061e 100644 --- a/docs/validation-codes.md +++ b/docs/validation-codes.md @@ -124,6 +124,13 @@ Static-analysis codes that fire anywhere markdown is analyzed — `vat resources - **Why it matters:** A committed document declaring a dependency on a gitignored target breaks portability — anyone cloning the repo gets the document but not the target. It also risks treating local-only or generated content as if it were part of the published artifact. Distinct from the skills-packaging code [`LINK_TO_GITIGNORED_FILE`](#link_to_gitignored_file), which guards against leaking ignored data into a *bundle*; this code fires in the `vat resources validate` path and the two coexist intentionally. - **Fix:** Link a tracked target, or un-ignore the file in `.gitignore` if it should be committed. +### `MALFORMED_HTML` + +- **Default:** `info` +- **What:** An HTML resource has well-formedness problems (unclosed tags, stray characters, misnested elements) reported by the HTML parser. 
+- **Why it matters:** Malformed markup parses unpredictably across browsers and tools, and can hide or mangle the links VAT extracts. Surfaced as `info` because browsers are lenient and most pages still render. +- **Fix:** Fix the markup the parser flags. Raise severity via `validation.severity.MALFORMED_HTML` to enforce well-formedness. + ## Frontmatter Link Codes Validation codes that fire when a collection's frontmatter schema declares a URI-family `format` (`uri-reference`, `uri`, `iri-reference`, `iri`) on a field and `vat resources validate` walks those values through the same engine as markdown link checking. Disabled per-collection via `validation.checkFrontmatterLinks: false` or globally via `vat resources validate --no-check-frontmatter-links`. See [Frontmatter link validation](./guides/collection-validation.md#frontmatter-link-validation). diff --git a/packages/agent-schema/schemas/validation-config.json b/packages/agent-schema/schemas/validation-config.json index 640a24d7..51763f1e 100644 --- a/packages/agent-schema/schemas/validation-config.json +++ b/packages/agent-schema/schemas/validation-config.json @@ -65,6 +65,7 @@ "LINK_BROKEN_ANCHOR", "LINK_UNKNOWN", "LINK_TO_GITIGNORED", + "MALFORMED_HTML", "FRONTMATTER_MISSING", "FRONTMATTER_INVALID_YAML", "FRONTMATTER_SCHEMA_ERROR", @@ -160,6 +161,7 @@ "LINK_BROKEN_ANCHOR", "LINK_UNKNOWN", "LINK_TO_GITIGNORED", + "MALFORMED_HTML", "FRONTMATTER_MISSING", "FRONTMATTER_INVALID_YAML", "FRONTMATTER_SCHEMA_ERROR", diff --git a/packages/agent-schema/src/validation-codes.ts b/packages/agent-schema/src/validation-codes.ts index 20acc6ff..63b52057 100644 --- a/packages/agent-schema/src/validation-codes.ts +++ b/packages/agent-schema/src/validation-codes.ts @@ -326,6 +326,12 @@ export const CODE_REGISTRY = { 'Link a tracked target or un-ignore it.', 'link_to_gitignored', ), + MALFORMED_HTML: entry( + 'info', + 'HTML resource has well-formedness issues reported by the parser.', + 'Fix the malformed markup (unclosed 
tags, stray characters). Informational by default; raise severity via validation.severity to enforce.', + 'malformed_html', + ), FRONTMATTER_MISSING: entry( 'error', 'Schema requires frontmatter but the file has none.', diff --git a/packages/agent-schema/test/validation-codes-html.test.ts b/packages/agent-schema/test/validation-codes-html.test.ts new file mode 100644 index 00000000..0a9c45b3 --- /dev/null +++ b/packages/agent-schema/test/validation-codes-html.test.ts @@ -0,0 +1,17 @@ +import { describe, expect, it } from 'vitest'; + +import { CODE_REGISTRY } from '../src/validation-codes.js'; +import { createRegistryIssue } from '../src/validation-issue.js'; + +describe('MALFORMED_HTML code', () => { + it('is registered as info', () => { + expect(CODE_REGISTRY.MALFORMED_HTML.defaultSeverity).toBe('info'); + expect(CODE_REGISTRY.MALFORMED_HTML.reference).toBe('#malformed_html'); + }); + + it('builds an info-severity issue', () => { + const issue = createRegistryIssue('MALFORMED_HTML', 'Malformed HTML: missing-end-tag', { line: 3 }); + expect(issue.severity).toBe('info'); + expect(issue.line).toBe(3); + }); +}); diff --git a/packages/resources/package.json b/packages/resources/package.json index 5382c0f1..ad03e860 100644 --- a/packages/resources/package.json +++ b/packages/resources/package.json @@ -39,6 +39,7 @@ "ajv-formats": "^3.0.1", "github-slugger": "^2.0.0", "markdown-link-check": "^3.14.2", + "parse5": "^7.2.1", "picomatch": "^4.0.3", "remark-frontmatter": "^5.0.0", "remark-gfm": "^4.0.0", diff --git a/packages/resources/src/frontmatter-link-validator.ts b/packages/resources/src/frontmatter-link-validator.ts index 13542325..0fe310f6 100644 --- a/packages/resources/src/frontmatter-link-validator.ts +++ b/packages/resources/src/frontmatter-link-validator.ts @@ -23,7 +23,7 @@ import { createRegistryIssue, type IssueCode } from '@vibe-agent-toolkit/agent-s import { classifyLink } from './link-parser.js'; import { validateLink, type ValidateLinkOptions } from 
'./link-validator.js'; import { walkFrontmatterUriReferences } from './schema-uri-walker.js'; -import type { HeadingNode, ResourceLink, ValidationIssue } from './types.js'; +import type { ResourceLink, ValidationIssue } from './types.js'; /** Map the link-level code emitted by validateLink to its frontmatter-scoped code. */ const LINK_CODE_TO_FRONTMATTER_CODE: Partial> = { @@ -51,14 +51,14 @@ export interface FrontmatterLinkValidationResult { * @param frontmatter - Parsed frontmatter (or undefined) * @param schema - JSON Schema for the collection * @param sourceFilePath - Absolute path to the source file - * @param headingsByFile - Heading trees (for anchor validation) + * @param fragmentsByFile - Fragment index (file path → set of valid fragments) for anchor validation * @param options - Same shape as validateLink (projectRoot, gitTracker, ...) */ export async function validateFrontmatterLinks( frontmatter: Record | undefined, schema: object, sourceFilePath: string, - headingsByFile: Map, + fragmentsByFile: Map>, options?: ValidateLinkOptions, ): Promise { if (!frontmatter) return { issues: [], externalUrls: [] }; @@ -89,7 +89,7 @@ export async function validateFrontmatterLinks( line: 1, // Frontmatter per-field line numbers are post-v1. }; - const issue = await validateLink(syntheticLink, sourceFilePath, headingsByFile, options); + const issue = await validateLink(syntheticLink, sourceFilePath, fragmentsByFile, options); if (!issue) continue; issues.push(rewriteIssue(issue, capture.dottedPath)); diff --git a/packages/resources/src/html-link-parser.ts b/packages/resources/src/html-link-parser.ts new file mode 100644 index 00000000..52d397dc --- /dev/null +++ b/packages/resources/src/html-link-parser.ts @@ -0,0 +1,129 @@ +/** + * HTML resource parser. 
+ * + * Parses local HTML files into the shared `ParseResult` shape: + * - `<a href>` and `<img src>` links (classified by `classifyLink`) + * - `id` / `name` attributes as fragment anchors + * - well-formedness diagnostics from parse5's `onParseError` + * + * Uses parse5 (WHATWG-conformant). The parse5 document + element walker are + * exported so the link rewriter (html-transform.ts) shares one parser path. + */ + +import { readFile, stat } from 'node:fs/promises'; + +import { parse, type DefaultTreeAdapterMap } from 'parse5'; + +import { classifyLink, type HtmlParseError, type ParseResult } from './link-parser.js'; +import type { ResourceLink } from './types.js'; + +type P5Node = DefaultTreeAdapterMap['node']; +type P5Element = DefaultTreeAdapterMap['element']; +type P5ChildNode = DefaultTreeAdapterMap['childNode']; + +/** A parsed parse5 document together with any well-formedness diagnostics. */ +export interface HtmlDocument { + document: DefaultTreeAdapterMap['document']; + parseErrors: HtmlParseError[]; +} + +/** + * Parse an HTML source string with source-location info and collect parser + * errors. Shared by `parseHtml` (link/anchor extraction) and `rewriteHtmlLinks` + * (attribute offset splicing). + */ +export function parseHtmlDocument(source: string): HtmlDocument { + const parseErrors: HtmlParseError[] = []; + const document = parse(source, { + sourceCodeLocationInfo: true, + onParseError: (err) => { + // parse5's ParserError extends Location which always has startLine: number. + // We always include line since it is always present. + parseErrors.push({ message: err.code, line: err.startLine }); + }, + }); + return { document, parseErrors }; +} + +/** Depth-first walk yielding every element node in the tree. 
*/ +export function* walkElements(node: P5Node): Generator { + if ('tagName' in node) { + yield node; + } + if ('childNodes' in node) { + for (const child of node.childNodes as P5ChildNode[]) { + yield* walkElements(child); + } + } +} + +function getAttr(element: P5Element, name: string): string | undefined { + return element.attrs.find((a) => a.name === name)?.value; +} + +function makeLink(href: string, line: number | undefined): ResourceLink { + return { text: '', href, type: classifyLink(href), ...(line !== undefined && { line }) }; +} + +function visitElement( + element: P5Element, + links: ResourceLink[], + anchors: Set, +): void { + const line = element.sourceCodeLocation?.startLine; + + if (element.tagName === 'a') { + const href = getAttr(element, 'href'); + if (href !== undefined) { + links.push(makeLink(href, line)); + } + const name = getAttr(element, 'name'); + if (name !== undefined && name !== '') { + anchors.add(name); + } + } else if (element.tagName === 'img') { + const src = getAttr(element, 'src'); + if (src !== undefined) { + links.push(makeLink(src, line)); + } + } + + const id = getAttr(element, 'id'); + if (id !== undefined && id !== '') { + anchors.add(id); + } +} + +/** + * Parse an HTML file into a `ParseResult`. + * + * @param filePath - Absolute path to the HTML file. 
+ */ +export async function parseHtml(filePath: string): Promise { + const [content, stats] = await Promise.all([ + // eslint-disable-next-line security/detect-non-literal-fs-filename -- filePath is a user-provided path parameter + readFile(filePath, 'utf-8'), + // eslint-disable-next-line security/detect-non-literal-fs-filename -- filePath is a user-provided path parameter + stat(filePath), + ]); + + const { document, parseErrors } = parseHtmlDocument(content); + + const links: ResourceLink[] = []; + const anchors = new Set(); + + for (const element of walkElements(document)) { + visitElement(element, links, anchors); + } + + const anchorList = [...anchors]; + return { + links, + headings: [], + content, + sizeBytes: stats.size, + estimatedTokenCount: Math.ceil(content.length / 4), + ...(anchorList.length > 0 && { anchors: anchorList }), + ...(parseErrors.length > 0 && { parseErrors }), + }; +} diff --git a/packages/resources/src/link-parser.ts b/packages/resources/src/link-parser.ts index 9cc76bab..4882e7a9 100644 --- a/packages/resources/src/link-parser.ts +++ b/packages/resources/src/link-parser.ts @@ -1,5 +1,5 @@ /** - * Markdown link parser and analyzer. + * Markdown link parser and shared resource-parsing types. * * Parses markdown files to extract: * - Links (regular, reference-style, autolinks) @@ -7,6 +7,9 @@ * - File size and token estimates * * Uses unified/remark for robust markdown parsing with GFM support. + * + * Also defines the format-neutral `ParseResult` contract and `HtmlParseError` + * shared with the HTML parser (`html-link-parser.ts`). */ import { readFile, stat } from 'node:fs/promises'; @@ -23,7 +26,17 @@ import * as yaml from 'yaml'; import type { HeadingNode, LinkType, ResourceLink } from './types.js'; /** - * Result of parsing a markdown file. + * A single HTML well-formedness diagnostic from the parser. + */ +export interface HtmlParseError { + /** parse5 error code (e.g. "missing-end-tag"). 
*/ + message: string; + /** 1-based source line. Optional in the type for forward-compat; parse5 v7 always populates it. */ + line?: number; +} + +/** + * Result of parsing a resource file (markdown or HTML). */ export interface ParseResult { links: ResourceLink[]; @@ -33,6 +46,10 @@ export interface ParseResult { content: string; sizeBytes: number; estimatedTokenCount: number; + /** Fragment targets (HTML `id`/`name` attributes). Markdown leaves this undefined. */ + anchors?: string[]; + /** HTML well-formedness diagnostics. Markdown leaves this undefined. */ + parseErrors?: HtmlParseError[]; } /** diff --git a/packages/resources/src/link-validator.ts b/packages/resources/src/link-validator.ts index 854f1138..0fc5cddd 100644 --- a/packages/resources/src/link-validator.ts +++ b/packages/resources/src/link-validator.ts @@ -25,7 +25,7 @@ import { verifyCaseSensitiveFilename, } from '@vibe-agent-toolkit/utils'; -import type { HeadingNode, ResourceLink } from './types.js'; +import type { ResourceLink } from './types.js'; import { isWithinProject, issueLocation, resolveLocalHref } from './utils.js'; type LinkIssueExtras = Partial>; @@ -66,7 +66,7 @@ export interface ValidateLinkOptions { * * @param link - The link to validate * @param sourceFilePath - Absolute path to the file containing the link - * @param headingsByFile - Map of file paths to their heading trees + * @param fragmentsByFile - Fragment index: file path → set of valid fragments (markdown slugs + HTML id/name) * @param options - Validation options (projectRoot, skipGitIgnoreCheck) * @returns ValidationIssue if link is broken, null if valid * @@ -84,15 +84,15 @@ export interface ValidateLinkOptions { export async function validateLink( link: ResourceLink, sourceFilePath: string, - headingsByFile: Map, + fragmentsByFile: Map>, options?: ValidateLinkOptions ): Promise { switch (link.type) { case 'local_file': - return await validateLocalFileLink(link, sourceFilePath, headingsByFile, options); + return await 
validateLocalFileLink(link, sourceFilePath, fragmentsByFile, options); case 'anchor': - return await validateAnchorLink(link, sourceFilePath, headingsByFile, options?.projectRoot); + return await validateAnchorLink(link, sourceFilePath, fragmentsByFile, options?.projectRoot); case 'external': // External URLs are not validated - don't report them @@ -232,7 +232,7 @@ export function gitIgnoreSafetyIssue( async function validateLocalFileLink( link: ResourceLink, sourceFilePath: string, - headingsByFile: Map, + fragmentsByFile: Map>, options?: ValidateLinkOptions ): Promise { const resolved = resolveLocalHref(link.href, sourceFilePath, options?.projectRoot); @@ -263,8 +263,8 @@ async function validateLocalFileLink( if (gitIgnoreIssue) return gitIgnoreIssue; if (resolved.anchor) { - const anchorValid = await validateAnchor(resolved.anchor, fileResult.resolvedPath, headingsByFile); - if (!anchorValid) { + const check = checkAnchor(resolved.anchor, fileResult.resolvedPath, fragmentsByFile); + if (check === 'broken') { return createRegistryIssue( 'LINK_BROKEN_ANCHOR', `Anchor not found: #${resolved.anchor} in ${fileResult.resolvedPath}`, @@ -282,16 +282,16 @@ async function validateLocalFileLink( async function validateAnchorLink( link: ResourceLink, sourceFilePath: string, - headingsByFile: Map, + fragmentsByFile: Map>, projectRoot?: string, ): Promise { // Extract anchor (strip leading #) const anchor = link.href.startsWith('#') ? link.href.slice(1) : link.href; // Validate anchor exists in current file - const isValid = await validateAnchor(anchor, sourceFilePath, headingsByFile); + const check = checkAnchor(anchor, sourceFilePath, fragmentsByFile); - if (!isValid) { + if (check === 'broken') { return createRegistryIssue( 'LINK_BROKEN_ANCHOR', `Anchor not found: ${link.href}`, @@ -338,65 +338,31 @@ async function validateResolvedFile( return result; } +/** Result of checking an anchor against the fragment index. 
*/ +export type AnchorCheck = 'skip' | 'valid' | 'broken'; + /** - * Validate that an anchor (heading slug) exists in a file. + * Check whether a fragment exists in the target file's anchor set. * - * @param anchor - The heading slug to find (without leading #) - * @param targetFilePath - Absolute path to the file containing the heading - * @param headingsByFile - Map of file paths to their heading trees - * @returns True if anchor exists, false otherwise + * - `'skip'` — target file is not indexed; we cannot prove the anchor is + * broken, so callers must not emit an issue. + * - HTML targets (`.html`/`.htm`) are matched case-sensitively (ids are + * case-sensitive); all other targets are matched lowercased (markdown slugs). * - * @example - * ```typescript - * const valid = await validateAnchor('my-heading', '/project/docs/guide.md', headingsMap); - * ``` + * @param anchor - Fragment without the leading `#`. + * @param targetFilePath - Absolute path of the file the fragment lives in. + * @param fragmentsByFile - Format-neutral fragment index. */ -async function validateAnchor( +export function checkAnchor( anchor: string, targetFilePath: string, - headingsByFile: Map -): Promise { - // Get headings for target file - const headings = headingsByFile.get(targetFilePath); - if (!headings) { - return false; - } - - // Search for matching slug (case-insensitive) - return findHeadingBySlug(headings, anchor); -} - -/** - * Recursively search heading tree for a matching slug. - * - * Performs case-insensitive comparison of slugs. 
- * - * @param headings - Array of heading nodes to search - * @param targetSlug - The slug to find - * @returns True if slug found, false otherwise - * - * @example - * ```typescript - * const found = findHeadingBySlug(headings, 'my-heading'); - * ``` - */ -function findHeadingBySlug( - headings: HeadingNode[], - targetSlug: string -): boolean { - const normalizedTarget = targetSlug.toLowerCase(); - - for (const heading of headings) { - // Check current heading - if (heading.slug.toLowerCase() === normalizedTarget) { - return true; - } - - // Recursively check children - if (heading.children && findHeadingBySlug(heading.children, targetSlug)) { - return true; - } + fragmentsByFile: Map>, +): AnchorCheck { + const fragments = fragmentsByFile.get(targetFilePath); + if (!fragments) { + return 'skip'; } - - return false; + const isHtml = /\.html?$/i.test(targetFilePath); + const found = isHtml ? fragments.has(anchor) : fragments.has(anchor.toLowerCase()); + return found ? 'valid' : 'broken'; } diff --git a/packages/resources/src/resource-registry.ts b/packages/resources/src/resource-registry.ts index 6c844ac1..041a0814 100644 --- a/packages/resources/src/resource-registry.ts +++ b/packages/resources/src/resource-registry.ts @@ -22,6 +22,7 @@ import { type FrontmatterExternalUrl, } from './frontmatter-link-validator.js'; import { validateFrontmatter } from './frontmatter-validator.js'; +import { parseHtml } from './html-link-parser.js'; import { parseMarkdown } from './link-parser.js'; import { validateLink, type ValidateLinkOptions } from './link-validator.js'; import type { ResourceCollectionInterface } from './resource-collection-interface.js'; @@ -304,8 +305,12 @@ export class ResourceRegistry implements ResourceCollectionInterface { // Normalize path to absolute const absolutePath = safePath.resolve(filePath); - // Parse the markdown file (needed before ID generation for frontmatter lookup) - const parseResult = await parseMarkdown(absolutePath); + // Parse the 
file — HTML or markdown depending on extension + const lowerPath = absolutePath.toLowerCase(); + const isHtml = lowerPath.endsWith('.html') || lowerPath.endsWith('.htm'); + const parseResult = isHtml + ? await parseHtml(absolutePath) + : await parseMarkdown(absolutePath); // Generate ID using priority chain: frontmatter field → relative path → filename stem const id = this.generateId(absolutePath, parseResult.frontmatter); @@ -336,6 +341,8 @@ export class ResourceRegistry implements ResourceCollectionInterface { filePath: absolutePath, links: parseResult.links, headings: parseResult.headings, + ...(parseResult.anchors !== undefined && { anchors: parseResult.anchors }), + ...(parseResult.parseErrors !== undefined && { parseErrors: parseResult.parseErrors }), ...(parseResult.frontmatter !== undefined && { frontmatter: parseResult.frontmatter }), ...(parseResult.frontmatterError !== undefined && { frontmatterError: parseResult.frontmatterError }), sizeBytes: parseResult.sizeBytes, @@ -396,7 +403,7 @@ export class ResourceRegistry implements ResourceCollectionInterface { async crawl(options: CrawlOptions): Promise { const { baseDir, - include = ['**/*.md'], + include = ['**/*.md', '**/*.html', '**/*.htm'], exclude = ['**/node_modules/**', '**/.git/**', '**/dist/**'], followSymlinks = false, } = options; @@ -442,12 +449,35 @@ export class ResourceRegistry implements ResourceCollectionInterface { return issues; } + /** + * Emit MALFORMED_HTML issues from each resource's HTML parse errors. + * @private + */ + private collectHtmlParseErrors(): ValidationIssue[] { + const issues: ValidationIssue[] = []; + for (const resource of this.resourcesByPath.values()) { + for (const parseError of resource.parseErrors ?? 
[]) { + issues.push( + createRegistryIssue( + 'MALFORMED_HTML', + `Malformed HTML: ${parseError.message}`, + { + location: issueLocation(resource.filePath, this.baseDir), + ...(parseError.line !== undefined && { line: parseError.line }), + }, + ), + ); + } + } + return issues; + } + /** * Validate all links in all resources. * @private */ private async validateAllLinks( - headingsByFile: Map, + fragmentsByFile: Map>, skipGitIgnoreCheck: boolean ): Promise { const issues: ValidationIssue[] = []; @@ -463,7 +493,7 @@ export class ResourceRegistry implements ResourceCollectionInterface { ...(this.gitTracker !== undefined && { gitTracker: this.gitTracker }) }; - const issue = await validateLink(link, resource.filePath, headingsByFile, validateOptions); + const issue = await validateLink(link, resource.filePath, fragmentsByFile, validateOptions); if (issue) { issues.push(issue); } @@ -501,7 +531,7 @@ export class ResourceRegistry implements ResourceCollectionInterface { * @private */ private async validateCollectionFrontmatter( - headingsByFile: Map, + fragmentsByFile: Map>, skipGitIgnoreCheck: boolean, ): Promise { const issues: ValidationIssue[] = []; @@ -523,7 +553,7 @@ export class ResourceRegistry implements ResourceCollectionInterface { const collectionIssues = await this.validateResourceCollectionSchemas( resource, fsPromises, - headingsByFile, + fragmentsByFile, skipGitIgnoreCheck, ); issues.push(...collectionIssues); @@ -539,7 +569,7 @@ export class ResourceRegistry implements ResourceCollectionInterface { private async validateResourceCollectionSchemas( resource: ResourceMetadata, fsModule: typeof fs, - headingsByFile: Map, + fragmentsByFile: Map>, skipGitIgnoreCheck: boolean, ): Promise { const issues: ValidationIssue[] = []; @@ -560,7 +590,7 @@ export class ResourceRegistry implements ResourceCollectionInterface { resource, collection.validation, fsModule, - headingsByFile, + fragmentsByFile, skipGitIgnoreCheck, ); issues.push(...collectionIssues); @@ -577,7 
+607,7 @@ export class ResourceRegistry implements ResourceCollectionInterface { resource: ResourceMetadata, validation: NonNullable['collections']>[string]['validation'], fsModule: typeof fs, - headingsByFile: Map, + fragmentsByFile: Map>, skipGitIgnoreCheck: boolean, ): Promise { if (!validation?.frontmatterSchema) { @@ -620,7 +650,7 @@ export class ResourceRegistry implements ResourceCollectionInterface { resource.frontmatter, schema, resource.filePath, - headingsByFile, + fragmentsByFile, linkOptions, ); issues.push(...linkIssues); @@ -679,8 +709,8 @@ export class ResourceRegistry implements ResourceCollectionInterface { async validate(options?: ValidateOptions): Promise { const startTime = Date.now(); - // Build headings map for validation - const headingsByFile = this.buildHeadingsByFileMap(); + // Build fragment index for anchor validation + const fragmentsByFile = this.buildFragmentIndex(); // Reset frontmatter external URL state for this validation run this.frontmatterExternalUrlsByResource.clear(); @@ -691,16 +721,19 @@ export class ResourceRegistry implements ResourceCollectionInterface { // Check for YAML parsing errors first issues.push(...this.collectYamlErrors()); + // Surface HTML well-formedness diagnostics + issues.push(...this.collectHtmlParseErrors()); + // Validate each link in each resource const linkIssues = await this.validateAllLinks( - headingsByFile, + fragmentsByFile, options?.skipGitIgnoreCheck ?? false ); issues.push(...linkIssues); // Per-collection frontmatter validation const collectionFrontmatterIssues = await this.validateCollectionFrontmatter( - headingsByFile, + fragmentsByFile, options?.skipGitIgnoreCheck ?? false, ); issues.push(...collectionFrontmatterIssues); @@ -1253,14 +1286,19 @@ export class ResourceRegistry implements ResourceCollectionInterface { } /** - * Build a map of file paths to their heading trees. - * - * Used for link validation. 
+ * Build a format-neutral fragment index for anchor validation: each file's + * absolute path → the set of valid fragment targets. Markdown contributes + * heading slugs (lowercased); HTML contributes its `id`/`name` anchors. */ - private buildHeadingsByFileMap(): Map { - const map = new Map(); + private buildFragmentIndex(): Map> { + const map = new Map>(); for (const resource of this.resourcesByPath.values()) { - map.set(resource.filePath, resource.headings); + const fragments = new Set(); + collectHeadingSlugs(resource.headings, fragments); + for (const anchor of resource.anchors ?? []) { + fragments.add(anchor); + } + map.set(resource.filePath, fragments); } return map; } @@ -1318,6 +1356,16 @@ export class ResourceRegistry implements ResourceCollectionInterface { } } +/** Recursively collect lowercased heading slugs into `out`. */ +function collectHeadingSlugs(headings: HeadingNode[], out: Set): void { + for (const heading of headings) { + out.add(heading.slug.toLowerCase()); + if (heading.children) { + collectHeadingSlugs(heading.children, out); + } + } +} + /** * Generate an ID from a file path. * diff --git a/packages/resources/src/schemas/resource-metadata.ts b/packages/resources/src/schemas/resource-metadata.ts index 0a3b158d..23253b86 100644 --- a/packages/resources/src/schemas/resource-metadata.ts +++ b/packages/resources/src/schemas/resource-metadata.ts @@ -36,6 +36,16 @@ export const LinkNodeTypeSchema = z.enum([ export type LinkNodeType = z.infer; +/** + * Zod schema for an HTML well-formedness diagnostic. + */ +export const HtmlParseErrorSchema = z.object({ + message: z.string().describe('parse5 error code (e.g. "missing-end-tag")'), + line: z.number().int().positive().optional().describe('1-based source line, when known'), +}).describe('HTML well-formedness diagnostic'); + +export type HtmlParseError = z.infer; + /** * Represents a heading node in the document's table of contents. 
* @@ -96,6 +106,10 @@ export const ResourceMetadataSchema = z.object({ filePath: z.string().describe('Absolute path to the resource file'), links: z.array(ResourceLinkSchema).describe('All links found in the resource'), headings: z.array(HeadingNodeSchema).describe('Document table of contents (top-level headings only; children are nested)'), + anchors: z.array(z.string()).optional() + .describe('Fragment targets for anchor validation (HTML id/name attributes)'), + parseErrors: z.array(HtmlParseErrorSchema).optional() + .describe('HTML well-formedness diagnostics (populated for HTML resources only)'), frontmatter: z.record(z.string(), z.unknown()).optional() .describe('Parsed YAML frontmatter (if present in markdown file)'), frontmatterError: z.string().optional() @@ -106,6 +120,6 @@ export const ResourceMetadataSchema = z.object({ checksum: SHA256Schema.describe('SHA-256 checksum of file content'), collections: z.array(z.string()).optional() .describe('Collection names this resource belongs to (populated when using config-based discovery)'), -}).describe('Complete metadata for a markdown resource'); +}).strict().describe('Complete metadata for a markdown resource'); export type ResourceMetadata = z.infer; diff --git a/packages/resources/test/frontmatter-link-validator.test.ts b/packages/resources/test/frontmatter-link-validator.test.ts index 48c9b9ee..4c238638 100644 --- a/packages/resources/test/frontmatter-link-validator.test.ts +++ b/packages/resources/test/frontmatter-link-validator.test.ts @@ -5,13 +5,12 @@ import { normalizedTmpdir, safePath } from '@vibe-agent-toolkit/utils'; import { afterAll, beforeAll, describe, expect, it } from 'vitest'; import { validateFrontmatterLinks } from '../src/frontmatter-link-validator.js'; -import type { HeadingNode } from '../src/types.js'; describe('validateFrontmatterLinks', () => { let projectRoot: string; let sourceFile: string; let targetFile: string; - let headingsByFile: Map; + let fragmentsByFile: Map>; 
beforeAll(async () => { projectRoot = await mkdtemp(safePath.join(normalizedTmpdir(), 'vat-fmlv-')); @@ -21,15 +20,9 @@ describe('validateFrontmatterLinks', () => { await writeFile(sourceFile, '---\n---\n# Source\n'); await writeFile(targetFile, '# Target\n\n## Section A\n'); - headingsByFile = new Map(); - headingsByFile.set(targetFile, [ - { level: 1, text: 'Target', slug: 'target', children: [ - { level: 2, text: 'Section A', slug: 'section-a', children: [] }, - ] }, - ]); - headingsByFile.set(sourceFile, [ - { level: 1, text: 'Source', slug: 'source', children: [] }, - ]); + fragmentsByFile = new Map(); + fragmentsByFile.set(targetFile, new Set(['target', 'section-a'])); + fragmentsByFile.set(sourceFile, new Set(['source'])); }); afterAll(async () => { @@ -48,7 +41,7 @@ describe('validateFrontmatterLinks', () => { frontmatter: Record | undefined, schema: object = refSchema, ) => - validateFrontmatterLinks(frontmatter, schema, sourceFile, headingsByFile, { + validateFrontmatterLinks(frontmatter, schema, sourceFile, fragmentsByFile, { projectRoot, skipGitIgnoreCheck: true, }); @@ -140,7 +133,7 @@ describe('validateFrontmatterLinks', () => { { ref: '/docs/target.md' }, refSchema, sourceFile, - headingsByFile, + fragmentsByFile, { skipGitIgnoreCheck: true }, ); expect(issues).toHaveLength(1); diff --git a/packages/resources/test/html-link-parser.test.ts b/packages/resources/test/html-link-parser.test.ts new file mode 100644 index 00000000..32e6a5d4 --- /dev/null +++ b/packages/resources/test/html-link-parser.test.ts @@ -0,0 +1,59 @@ +/* eslint-disable security/detect-non-literal-fs-filename -- test writes to temp dirs from computed paths */ +import { mkdtemp, writeFile } from 'node:fs/promises'; + +import { normalizedTmpdir, safePath } from '@vibe-agent-toolkit/utils'; +import { afterAll, describe, expect, it } from 'vitest'; + +import { parseHtml } from '../src/html-link-parser.js'; + +const dirs: string[] = []; + +async function writeHtml(name: string, body: 
string): Promise { + const dir = await mkdtemp(safePath.join(normalizedTmpdir(), 'vat-html-')); + dirs.push(dir); + const file = safePath.join(dir, name); + await writeFile(file, body, 'utf-8'); + return file; +} + +afterAll(async () => { + const { rm } = await import('node:fs/promises'); + await Promise.all(dirs.map((d) => rm(d, { recursive: true, force: true }))); +}); + +describe('parseHtml', () => { + it('extracts and links', async () => { + const file = await writeHtml( + 'page.html', + 'x', + ); + const result = await parseHtml(file); + const hrefs = result.links.map((l) => l.href).sort((a, b) => a.localeCompare(b)); + expect(hrefs).toEqual(['./other.html', 'img/logo.png']); + expect(result.links.find((l) => l.href === './other.html')?.type).toBe('local_file'); + }); + + it('collects id and name attributes as anchors', async () => { + const file = await writeHtml( + 'anchors.html', + '

Intro

', + ); + const result = await parseHtml(file); + expect(new Set(result.anchors)).toEqual(new Set(['intro', 'legacy'])); + expect(result.headings).toEqual([]); + }); + + it('reports malformed markup via parseErrors', async () => { + const file = await writeHtml('bad.html', '

unclosed'); + const result = await parseHtml(file); + expect(result.parseErrors).toBeDefined(); + expect((result.parseErrors ?? []).length).toBeGreaterThan(0); + }); + + it('omits anchors/parseErrors when there are none', async () => { + const file = await writeHtml('clean.html', 't

hi

'); + const result = await parseHtml(file); + expect(result.anchors).toBeUndefined(); + expect(result.parseErrors).toBeUndefined(); + }); +}); diff --git a/packages/resources/test/integration/link-validator-gitignore.integration.test.ts b/packages/resources/test/integration/link-validator-gitignore.integration.test.ts index 5f8fe150..fc4dab41 100644 --- a/packages/resources/test/integration/link-validator-gitignore.integration.test.ts +++ b/packages/resources/test/integration/link-validator-gitignore.integration.test.ts @@ -16,7 +16,6 @@ import { normalizedTmpdir, safePath } from '@vibe-agent-toolkit/utils'; import { afterEach, beforeEach, describe, expect, it } from 'vitest'; import { validateLink } from '../../src/link-validator.js'; -import type { HeadingNode } from '../../src/types.js'; import { isWithinProject } from '../../src/utils.js'; import { createGitRepo, createLink, setupTempDirTestSuite } from '../test-helpers.js'; @@ -73,7 +72,7 @@ async function validateWithGitIgnoreCheck( projectRoot: string ) { const link = createLink('local_file', linkHref, 'Test link', 2); - const headingsMap = new Map(); + const headingsMap = new Map>(); return await validateLink(link, sourceFile, headingsMap, { projectRoot, diff --git a/packages/resources/test/integration/link-validator.integration.test.ts b/packages/resources/test/integration/link-validator.integration.test.ts index 4f239947..33c7d319 100644 --- a/packages/resources/test/integration/link-validator.integration.test.ts +++ b/packages/resources/test/integration/link-validator.integration.test.ts @@ -10,12 +10,16 @@ */ +import { writeFile } from 'node:fs/promises'; + +/* eslint-disable security/detect-non-literal-fs-filename -- tests use dynamic file paths in temp directory */ + import { mkdirSyncReal, normalizedTmpdir, safePath } from '@vibe-agent-toolkit/utils'; -import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import { describe, it, expect, beforeEach, afterEach, beforeAll, afterAll } 
from 'vitest'; import { validateLink } from '../../src/link-validator.js'; -import type { HeadingNode } from '../../src/types.js'; -import { assertValidation, createGitRepo, createHeadings, createLink } from '../test-helpers.js'; +import { ResourceRegistry } from '../../src/resource-registry.js'; +import { assertValidation, createGitRepo, createLink, setupSubdirTestSuite } from '../test-helpers.js'; /** * Helper to test that a non-ignored file linking to a gitignored file returns an error @@ -23,7 +27,7 @@ import { assertValidation, createGitRepo, createHeadings, createLink } from '../ async function assertGitignoreError(gitRoot: string, linkHref: string, linkText: string): Promise { const sourceFile = safePath.join(gitRoot, 'source.md'); const link = createLink('local_file', linkHref, linkText, 1); - const headingsMap = new Map(); + const headingsMap = new Map>(); await assertValidation( { @@ -61,12 +65,32 @@ const NONEXISTENT_FILE_LINK = './nonexistent.md'; const NONEXISTENT_ANCHOR = '#nonexistent'; const TARGET_FILE_LINK = './target.md'; +/** + * Build the standard headings map used in anchor-validation tests: + * a single entry mapping `sourceFile` to the slug of HEADING_ANCHOR_HEADING. + */ +function makeAnchorHeadingsMap(sourceFile: string): Map> { + return new Map>([ + [sourceFile, new Set([HEADING_ANCHOR_HEADING.slug.toLowerCase()])], + ]); +} + +/** + * Crawl a directory into a fresh ResourceRegistry and run validation. + * Extracted to eliminate the repeated 3-line setup in cross-format anchor tests. 
+ */ +async function crawlAndValidate(dir: string): Promise>> { + const reg = new ResourceRegistry({ baseDir: dir }); + await reg.crawl({ baseDir: dir }); + return reg.validate({ skipGitIgnoreCheck: true }); +} + describe('validateLink', () => { describe('local_file links', () => { it('should validate valid relative path', async () => { const sourceFile = safePath.join(FIXTURES_DIR, VALID_MD); const link = createLink('local_file', TARGET_FILE_LINK, 'Link to target', 3); - const headingsMap = new Map(); + const headingsMap = new Map>(); const result = await validateLink(link, sourceFile, headingsMap); @@ -76,7 +100,7 @@ describe('validateLink', () => { it('should validate valid relative path with ../', async () => { const sourceFile = safePath.join(FIXTURES_DIR, 'subdir', 'nested.md'); const link = createLink('local_file', '../target.md', 'Link to parent', 1); - const headingsMap = new Map(); + const headingsMap = new Map>(); const result = await validateLink(link, sourceFile, headingsMap); @@ -89,7 +113,7 @@ describe('validateLink', () => { { sourceFile: safePath.join(FIXTURES_DIR, BROKEN_FILE_MD), link: createLink('local_file', NONEXISTENT_FILE_LINK, 'Broken link', 3), - headingsMap: new Map(), + headingsMap: new Map>(), expected: { code: 'LINK_BROKEN_FILE', messageContains: ['File not found', 'nonexistent.md'], @@ -118,7 +142,7 @@ describe('validateLink', () => { { sourceFile, link: createLink('local_file', './testfile.md', 'Wrong case link', 3), - headingsMap: new Map(), + headingsMap: new Map>(), expected: { code: 'LINK_BROKEN_FILE', messageContains: ['case mismatch', 'TestFile.md', 'testfile.md'], @@ -137,8 +161,8 @@ describe('validateLink', () => { const targetFile = safePath.join(FIXTURES_DIR, TARGET_MD); const link = createLink('local_file', './target.md#valid-anchor', 'Link with anchor', 5); - const headingsMap = new Map([ - [targetFile, createHeadings(VALID_ANCHOR_HEADING)], + const headingsMap = new Map>([ + [targetFile, new 
Set([VALID_ANCHOR_HEADING.slug.toLowerCase()])], ]); const result = await validateLink(link, sourceFile, headingsMap); @@ -153,8 +177,8 @@ describe('validateLink', () => { { sourceFile: safePath.join(FIXTURES_DIR, 'broken-anchor.md'), link: createLink('local_file', './target.md#nonexistent-heading', 'Broken anchor', 3), - headingsMap: new Map([ - [targetFile, createHeadings(VALID_ANCHOR_HEADING)], + headingsMap: new Map>([ + [targetFile, new Set([VALID_ANCHOR_HEADING.slug.toLowerCase()])], ]), expected: { code: 'LINK_BROKEN_ANCHOR', @@ -174,7 +198,7 @@ describe('validateLink', () => { // leading-/ branch entirely). const sourceFile = safePath.join(FIXTURES_DIR, VALID_MD); const link = createLink('local_file', '/does/not/exist.md', 'Absolute path'); - const headingsMap = new Map(); + const headingsMap = new Map>(); const result = await validateLink(link, sourceFile, headingsMap); @@ -196,7 +220,7 @@ describe('validateLink', () => { fs.writeFileSync(sourceFile, ''); const link = createLink('local_file', 'files/My%20Document%20Name.pdf', 'PDF link', 3); - const headingsMap = new Map(); + const headingsMap = new Map>(); const result = await validateLink(link, sourceFile, headingsMap); @@ -215,9 +239,7 @@ describe('validateLink', () => { { sourceFile, link: createLink('anchor', '#heading-anchor', 'Anchor link', 5), - headingsMap: new Map([ - [sourceFile, createHeadings(HEADING_ANCHOR_HEADING)], - ]), + headingsMap: makeAnchorHeadingsMap(sourceFile), expected: null, }, expect, @@ -231,9 +253,7 @@ describe('validateLink', () => { { sourceFile, link: createLink('anchor', NONEXISTENT_ANCHOR, 'Broken anchor', 10), - headingsMap: new Map([ - [sourceFile, createHeadings(HEADING_ANCHOR_HEADING)], - ]), + headingsMap: makeAnchorHeadingsMap(sourceFile), expected: { code: 'LINK_BROKEN_ANCHOR', messageContains: ['Anchor not found', NONEXISTENT_ANCHOR], @@ -250,9 +270,7 @@ describe('validateLink', () => { { sourceFile, link: createLink('anchor', '#HEADING-ANCHOR', 'Case mismatch', 
5), - headingsMap: new Map([ - [sourceFile, createHeadings(HEADING_ANCHOR_HEADING)], - ]), + headingsMap: makeAnchorHeadingsMap(sourceFile), expected: null, }, expect, @@ -263,17 +281,8 @@ describe('validateLink', () => { const sourceFile = safePath.join(FIXTURES_DIR, 'complex.md'); const link = createLink('anchor', '#nested-child', 'Nested heading', 10); - const headingsMap = new Map([ - [sourceFile, createHeadings({ - text: 'Parent Heading', - slug: 'parent-heading', - level: 2, - children: createHeadings({ - text: 'Nested Child', - slug: 'nested-child', - level: 3, - }), - })], + const headingsMap = new Map>([ + [sourceFile, new Set(['parent-heading', 'nested-child'])], ]); const result = await validateLink(link, sourceFile, headingsMap); @@ -287,7 +296,7 @@ describe('validateLink', () => { { sourceFile: safePath.join(FIXTURES_DIR, VALID_MD), link: createLink('anchor', '#any-heading', 'No headings', 5), - headingsMap: new Map(), + headingsMap: new Map>([[safePath.join(FIXTURES_DIR, VALID_MD), new Set()]]), expected: { code: 'LINK_BROKEN_ANCHOR', }, @@ -301,7 +310,7 @@ describe('validateLink', () => { it('should return null for HTTP URL (external links not validated)', async () => { const sourceFile = safePath.join(FIXTURES_DIR, VALID_MD); const link = createLink('external', 'http://example.com', 'HTTP link', 6); - const headingsMap = new Map(); + const headingsMap = new Map>(); const result = await validateLink(link, sourceFile, headingsMap); @@ -311,7 +320,7 @@ describe('validateLink', () => { it('should return null for HTTPS URL (external links not validated)', async () => { const sourceFile = safePath.join(FIXTURES_DIR, VALID_MD); const link = createLink('external', 'https://example.com/path', 'HTTPS link', 7); - const headingsMap = new Map(); + const headingsMap = new Map>(); const result = await validateLink(link, sourceFile, headingsMap); @@ -323,7 +332,7 @@ describe('validateLink', () => { it('should return null for valid email', async () => { const 
sourceFile = safePath.join(FIXTURES_DIR, VALID_MD); const link = createLink('email', 'mailto:test@example.com', 'Email link', 8); - const headingsMap = new Map(); + const headingsMap = new Map>(); const result = await validateLink(link, sourceFile, headingsMap); @@ -333,7 +342,7 @@ describe('validateLink', () => { it('should return null for email without mailto:', async () => { const sourceFile = safePath.join(FIXTURES_DIR, VALID_MD); const link = createLink('email', 'test@example.com', 'Plain email', 9); - const headingsMap = new Map(); + const headingsMap = new Map>(); const result = await validateLink(link, sourceFile, headingsMap); @@ -348,7 +357,7 @@ describe('validateLink', () => { { sourceFile: safePath.join(FIXTURES_DIR, VALID_MD), link: createLink('unknown', 'ftp://example.com/file', 'FTP link', 10), - headingsMap: new Map(), + headingsMap: new Map>(), expected: { code: 'LINK_UNKNOWN', messageContains: 'Unknown link type', @@ -365,7 +374,7 @@ describe('validateLink', () => { { sourceFile: safePath.join(FIXTURES_DIR, VALID_MD), link: createLink('unknown', 'tel:+1234567890', 'Tel link', 11), - headingsMap: new Map(), + headingsMap: new Map>(), expected: { code: 'LINK_UNKNOWN', }, @@ -379,7 +388,7 @@ describe('validateLink', () => { it('should handle Unix-style paths', async () => { const sourceFile = safePath.join(FIXTURES_DIR, VALID_MD); const link = createLink('local_file', TARGET_FILE_LINK, 'Unix path'); - const headingsMap = new Map(); + const headingsMap = new Map>(); const result = await validateLink(link, sourceFile, headingsMap); @@ -390,7 +399,7 @@ describe('validateLink', () => { const sourceFile = safePath.join(FIXTURES_DIR, VALID_MD); // Node's path.resolve will normalize this correctly on all platforms const link = createLink('local_file', './subdir/nested.md', 'Mixed path'); - const headingsMap = new Map(); + const headingsMap = new Map>(); const result = await validateLink(link, sourceFile, headingsMap); @@ -404,8 +413,8 @@ 
describe('validateLink', () => { const link = createLink('anchor', '#heading-anchor', 'No line number'); delete link.line; - const headingsMap = new Map([ - [sourceFile, createHeadings({ text: 'Heading Anchor', slug: 'heading-anchor' })], + const headingsMap = new Map>([ + [sourceFile, new Set(['heading-anchor'])], ]); const result = await validateLink(link, sourceFile, headingsMap); @@ -417,8 +426,8 @@ describe('validateLink', () => { const sourceFile = safePath.join(FIXTURES_DIR, VALID_MD); const link = createLink('anchor', '#', 'Empty anchor', 5); - const headingsMap = new Map([ - [sourceFile, createHeadings({ text: 'Heading', slug: 'heading' })], + const headingsMap = new Map>([ + [sourceFile, new Set(['heading'])], ]); const result = await validateLink(link, sourceFile, headingsMap); @@ -430,7 +439,7 @@ describe('validateLink', () => { it('should handle file path with anchor where file does not exist', async () => { const sourceFile = safePath.join(FIXTURES_DIR, VALID_MD); const link = createLink('local_file', './nonexistent.md#heading', 'Broken file with anchor', 5); - const headingsMap = new Map(); + const headingsMap = new Map>(); const result = await validateLink(link, sourceFile, headingsMap); @@ -448,8 +457,8 @@ describe('validateLink', () => { const link = createLink('local_file', '#heading', 'Anchor as file', 5); const targetFile = safePath.join(FIXTURES_DIR, 'target.md'); - const headingsMap = new Map([ - [targetFile, createHeadings({ text: 'Heading', slug: 'heading' })], + const headingsMap = new Map>([ + [targetFile, new Set(['heading'])], ]); const result = await validateLink(link, sourceFile, headingsMap); @@ -461,27 +470,8 @@ describe('validateLink', () => { const sourceFile = safePath.join(FIXTURES_DIR, 'complex.md'); const link = createLink('anchor', '#deeply-nested', 'Deep nesting', 20); - const headingsMap = new Map([ - [sourceFile, createHeadings({ - text: 'Level 1', - slug: 'level-1', - level: 1, - children: createHeadings({ - text: 'Level 
2', - slug: 'level-2', - level: 2, - children: createHeadings({ - text: 'Level 3', - slug: 'level-3', - level: 3, - children: createHeadings({ - text: 'Deeply Nested', - slug: 'deeply-nested', - level: 4, - }), - }), - }), - })], + const headingsMap = new Map>([ + [sourceFile, new Set(['level-1', 'level-2', 'level-3', 'deeply-nested'])], ]); const result = await validateLink(link, sourceFile, headingsMap); @@ -494,7 +484,7 @@ describe('validateLink', () => { it('should include all required fields in error issue', async () => { const sourceFile = safePath.join(FIXTURES_DIR, BROKEN_FILE_MD); const link = createLink('local_file', NONEXISTENT_FILE_LINK, 'Broken', 3); - const headingsMap = new Map(); + const headingsMap = new Map>(); const result = await validateLink(link, sourceFile, headingsMap); @@ -514,7 +504,7 @@ describe('validateLink', () => { it('should include empty suggestion in broken file issue', async () => { const sourceFile = safePath.join(FIXTURES_DIR, BROKEN_FILE_MD); const link = createLink('local_file', NONEXISTENT_FILE_LINK, 'Broken'); - const headingsMap = new Map(); + const headingsMap = new Map>(); const result = await validateLink(link, sourceFile, headingsMap); @@ -525,8 +515,8 @@ describe('validateLink', () => { it('should include empty suggestion in broken anchor issue', async () => { const sourceFile = safePath.join(FIXTURES_DIR, VALID_MD); const link = createLink('anchor', NONEXISTENT_ANCHOR, 'Broken'); - const headingsMap = new Map([ - [sourceFile, createHeadings({ text: 'Valid', slug: 'valid' })], + const headingsMap = new Map>([ + [sourceFile, new Set(['valid'])], ]); const result = await validateLink(link, sourceFile, headingsMap); @@ -591,7 +581,7 @@ describe('validateLink', () => { fs.writeFileSync(sourceFile, '# Source'); const link = createLink('local_file', TARGET_FILE_LINK, 'Link to target', 1); - const headingsMap = new Map(); + const headingsMap = new Map>(); await assertValidation( { @@ -629,6 +619,74 @@ describe('validateLink', 
() => { }); }); + describe('cross-format anchor validation', () => { + const suite = setupSubdirTestSuite('cross-format-suite-'); + + beforeAll(suite.beforeAll); + afterAll(suite.afterAll); + beforeEach(suite.beforeEach); + + it('validates markdown → HTML anchors and flags missing ones', async () => { + // guide.md links to page.html#intro (valid) and page.html#nope (broken) + await writeFile( + safePath.join(suite.tempDir, 'guide.md'), + '[a](./page.html#intro)\n[b](./page.html#nope)\n', + 'utf-8', + ); + await writeFile( + safePath.join(suite.tempDir, 'page.html'), + '

Intro

', + 'utf-8', + ); + + const result = await crawlAndValidate(suite.tempDir); + + const brokenAnchors = result.issues.filter((i) => i.code === 'LINK_BROKEN_ANCHOR'); + // Only #nope should be flagged; #intro is a valid HTML id + expect(brokenAnchors).toHaveLength(1); + expect(brokenAnchors[0]?.message).toContain('nope'); + }); + + it('validates HTML → markdown anchors (zero broken-anchor)', async () => { + // page.html links to guide.md#my-heading — heading slug matches + await writeFile( + safePath.join(suite.tempDir, 'page.html'), + 'x', + 'utf-8', + ); + await writeFile( + safePath.join(suite.tempDir, 'guide.md'), + '## My Heading\n', + 'utf-8', + ); + + const result = await crawlAndValidate(suite.tempDir); + + const brokenAnchors = result.issues.filter((i) => i.code === 'LINK_BROKEN_ANCHOR'); + expect(brokenAnchors).toHaveLength(0); + }); + + it('skips anchor check for a non-indexed target file', async () => { + // doc.md links to external.md#whatever — external.md is NOT present/crawled + await writeFile( + safePath.join(suite.tempDir, 'doc.md'), + '[x](./external.md#whatever)\n', + 'utf-8', + ); + // external.md intentionally absent + + const result = await crawlAndValidate(suite.tempDir); + + const brokenFiles = result.issues.filter((i) => i.code === 'LINK_BROKEN_FILE'); + const brokenAnchors = result.issues.filter((i) => i.code === 'LINK_BROKEN_ANCHOR'); + + // The file is missing → LINK_BROKEN_FILE emitted + expect(brokenFiles.length).toBeGreaterThan(0); + // But no LINK_BROKEN_ANCHOR — target is un-indexed so anchor check is skipped + expect(brokenAnchors).toHaveLength(0); + }); + }); + describe('leading-/ links (RFC 3986 §4.2 absolute-path reference)', () => { let projectRoot: string; let sourceFile: string; @@ -653,7 +711,7 @@ describe('validateLink', () => { it('resolves /docs/foo.md against projectRoot', async () => { const link = createLink('local_file', '/docs/foo.md', 'Leading slash', 1); - const result = await validateLink(link, sourceFile, new 
Map(), { + const result = await validateLink(link, sourceFile, new Map>(), { projectRoot, skipGitIgnoreCheck: true, }); @@ -662,7 +720,7 @@ describe('validateLink', () => { it('emits absolute_no_root broken_file when projectRoot is undefined', async () => { const link = createLink('local_file', '/docs/foo.md', 'Leading slash', 1); - const result = await validateLink(link, sourceFile, new Map(), { + const result = await validateLink(link, sourceFile, new Map>(), { skipGitIgnoreCheck: true, }); expect(result).not.toBeNull(); @@ -672,7 +730,7 @@ describe('validateLink', () => { it('emits absolute_escapes_root broken_file when leading-/ escapes projectRoot', async () => { const link = createLink('local_file', '/../escape.md', 'Escape', 1); - const result = await validateLink(link, sourceFile, new Map(), { + const result = await validateLink(link, sourceFile, new Map>(), { projectRoot, skipGitIgnoreCheck: true, }); @@ -683,13 +741,10 @@ describe('validateLink', () => { it('resolves anchor on leading-/ link', async () => { const link = createLink('local_file', '/docs/foo.md#section-a', 'Leading slash anchor', 1); - const headings = new Map([ + const headings = new Map>([ [ safePath.join(projectRoot, 'docs', 'foo.md'), - createHeadings( - { text: 'Foo', slug: 'foo', level: 1 }, - { text: 'Section A', slug: 'section-a', level: 2 }, - ), + new Set(['foo', 'section-a']), ], ]); const result = await validateLink(link, sourceFile, headings, { @@ -705,7 +760,7 @@ describe('validateLink', () => { { sourceFile, link: createLink('local_file', '/docs/', 'Directory target', 1), - headingsMap: new Map(), + headingsMap: new Map>(), expected: { code: 'LINK_BROKEN_FILE', messageContains: 'Link target is a directory' }, validationOptions: { projectRoot, skipGitIgnoreCheck: true }, }, @@ -719,7 +774,7 @@ describe('validateLink', () => { { sourceFile, link: createLink('local_file', '../', 'Relative directory target', 1), - headingsMap: new Map(), + headingsMap: new Map>(), expected: { 
code: 'LINK_BROKEN_FILE', messageContains: 'Link target is a directory' }, validationOptions: { projectRoot, skipGitIgnoreCheck: true }, }, diff --git a/packages/resources/test/integration/resource-parser.integration.test.ts b/packages/resources/test/integration/resource-parser.integration.test.ts index 1bedb87a..8996185d 100644 --- a/packages/resources/test/integration/resource-parser.integration.test.ts +++ b/packages/resources/test/integration/resource-parser.integration.test.ts @@ -3,11 +3,10 @@ * Tests parsing real markdown, JSON, and YAML files with proper resource type detection */ -import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises'; +import { writeFile } from 'node:fs/promises'; - -import { normalizedTmpdir, safePath } from '@vibe-agent-toolkit/utils'; -import { afterAll, afterEach, beforeAll, beforeEach, describe, expect, it } from 'vitest'; +import { safePath } from '@vibe-agent-toolkit/utils'; +import { afterAll, beforeAll, beforeEach, describe, expect, it } from 'vitest'; import { detectResourceType, @@ -17,6 +16,7 @@ import { parseYamlResource, } from '../../src/types/resource-parser.ts'; import { ResourceType } from '../../src/types/resources.ts'; +import { setupSubdirTestSuite } from '../test-helpers.js'; // ============================================================================ // File-scope test constants (accessible to all tests) @@ -34,33 +34,13 @@ const CONFIG_YAML_FILE = 'config.yaml'; // Test suite setup // ============================================================================ -let testCounter = 0; -const suite = { - suiteDir: '', - tempDir: '', - beforeAll: async () => { - suite.suiteDir = await mkdtemp(safePath.join(normalizedTmpdir(), 'resource-parser-suite-')); - }, - afterAll: async () => { - await rm(suite.suiteDir, { recursive: true, force: true }); - }, - beforeEach: async () => { - testCounter++; - suite.tempDir = safePath.join(suite.suiteDir, `test-${testCounter}`); - // eslint-disable-next-line 
security/detect-non-literal-fs-filename -- tempDir is from mkdtemp - await mkdir(suite.tempDir, { recursive: true }); - }, - afterEach: async () => { - // Per-test cleanup handled by suite cleanup - }, -}; +const suite = setupSubdirTestSuite('resource-parser-suite-'); describe('parseMarkdownResource', () => { beforeAll(suite.beforeAll); afterAll(suite.afterAll); beforeEach(suite.beforeEach); - afterEach(suite.afterEach); it('should parse markdown with frontmatter', async () => { const filePath = safePath.join(suite.tempDir, 'doc.md'); @@ -202,7 +182,6 @@ describe('parseJsonSchemaResource', () => { beforeAll(suite.beforeAll); afterAll(suite.afterAll); beforeEach(suite.beforeEach); - afterEach(suite.afterEach); it('should parse JSON Schema with $schema keyword', async () => { const filePath = safePath.join(suite.tempDir, USER_SCHEMA_FILE); @@ -281,7 +260,6 @@ describe('parseJsonSchemaResource', () => { describe('parseJsonResource', () => { beforeEach(suite.beforeEach); - afterEach(suite.afterEach); it('should parse regular JSON data', async () => { const filePath = safePath.join(suite.tempDir, 'data.json'); @@ -331,7 +309,6 @@ describe('parseJsonResource', () => { describe('parseYamlResource', () => { beforeEach(suite.beforeEach); - afterEach(suite.afterEach); it('should parse YAML data', async () => { const filePath = safePath.join(suite.tempDir, CONFIG_YAML_FILE); diff --git a/packages/resources/test/integration/resource-registry.integration.test.ts b/packages/resources/test/integration/resource-registry.integration.test.ts index 11739950..cb8482e8 100644 --- a/packages/resources/test/integration/resource-registry.integration.test.ts +++ b/packages/resources/test/integration/resource-registry.integration.test.ts @@ -16,7 +16,7 @@ import { describe, expect, it, beforeEach, afterEach, beforeAll, afterAll } from import { ResourceRegistry } from '../../src/resource-registry.js'; import type { ResourceMetadata } from '../../src/types.js'; -import { findPackageRoot } 
from '../test-helpers.js'; +import { findPackageRoot, setupSubdirTestSuite } from '../test-helpers.js'; // Get test fixtures directory const packageRoot = findPackageRoot(import.meta.dirname); @@ -534,27 +534,11 @@ describe('ResourceRegistry - Integration Tests', () => { }); describe('validate with frontmatter schema', () => { - let suiteDir: string; - let tempDir: string; - let testCounter = 0; - - beforeAll(async () => { - suiteDir = await mkdtemp(safePath.join(normalizedTmpdir(), 'frontmatter-suite-')); - }); - - afterAll(async () => { - await rm(suiteDir, { recursive: true, force: true }); - }); - - beforeEach(async () => { - testCounter++; - tempDir = safePath.join(suiteDir, `test-${testCounter}`); - await mkdir(tempDir, { recursive: true }); - }); + const suite = setupSubdirTestSuite('frontmatter-suite-'); - afterEach(async () => { - // Per-test cleanup handled by suite cleanup - }); + beforeAll(suite.beforeAll); + afterAll(suite.afterAll); + beforeEach(suite.beforeEach); it('should validate frontmatter against schema and report missing required fields', async () => { const registry = new ResourceRegistry(); @@ -589,7 +573,7 @@ describe('ResourceRegistry - Integration Tests', () => { const registry = new ResourceRegistry(); // Create a file with invalid YAML - const invalidYamlPath = safePath.join(tempDir, 'invalid-yaml.md'); + const invalidYamlPath = safePath.join(suite.tempDir, 'invalid-yaml.md'); await writeFile( invalidYamlPath, `--- @@ -872,4 +856,45 @@ tags: test }); }); }); + + describe('HTML resource discovery', () => { + let htmlTempDir: string; + + beforeEach(async () => { + htmlTempDir = await mkdtemp(safePath.join(normalizedTmpdir(), 'html-discovery-test-')); + }); + + afterEach(async () => { + await rm(htmlTempDir, { recursive: true, force: true }); + }); + + it('discovers and parses HTML resources', async () => { + await writeFile( + safePath.join(htmlTempDir, 'page.html'), + 'n', + 'utf-8', + ); + const reg = new ResourceRegistry({ baseDir: 
htmlTempDir }); + await reg.crawl({ baseDir: htmlTempDir }); + const html = reg.getAllResources().find((r) => r.filePath.endsWith('page.html')); + expect(html).toBeDefined(); + expect(html?.links.map((l) => l.href)).toContain('./next.html'); + }); + + it('emits MALFORMED_HTML info issues for HTML parse errors', async () => { + // A malformed HTML file: unclosed

tag triggers a parse5 diagnostic + await writeFile( + safePath.join(htmlTempDir, 'bad.html'), + '

unclosed', + 'utf-8', + ); + const reg = new ResourceRegistry({ baseDir: htmlTempDir }); + await reg.crawl({ baseDir: htmlTempDir }); + const result = await reg.validate({ skipGitIgnoreCheck: true }); + + const malformedIssues = result.issues.filter((i) => i.code === 'MALFORMED_HTML'); + expect(malformedIssues.length).toBeGreaterThan(0); + expect(malformedIssues[0]?.severity).toBe('info'); + }); + }); }); diff --git a/packages/resources/test/link-validator-helpers.test.ts b/packages/resources/test/link-validator-helpers.test.ts index 757201db..c8203b76 100644 --- a/packages/resources/test/link-validator-helpers.test.ts +++ b/packages/resources/test/link-validator-helpers.test.ts @@ -9,6 +9,7 @@ import { describe, expect, it } from 'vitest'; import { + checkAnchor, fileExistenceIssue, gitIgnoreSafetyIssue, resolutionFailureIssue, @@ -125,6 +126,29 @@ describe('fileExistenceIssue', () => { }); }); +describe('checkAnchor', () => { + const GUIDE_MD = '/abs/guide.md'; + const PAGE_HTML = '/abs/page.html'; + const index = new Map>([ + [GUIDE_MD, new Set(['my-heading'])], + [PAGE_HTML, new Set(['Intro', 'legacy'])], + ]); + + it('skips targets that are not indexed', () => { + expect(checkAnchor('anything', '/abs/not-indexed.md', index)).toBe('skip'); + }); + + it('matches markdown slugs case-insensitively', () => { + expect(checkAnchor('My-Heading', GUIDE_MD, index)).toBe('valid'); + expect(checkAnchor('missing', GUIDE_MD, index)).toBe('broken'); + }); + + it('matches HTML ids case-sensitively', () => { + expect(checkAnchor('Intro', PAGE_HTML, index)).toBe('valid'); + expect(checkAnchor('intro', PAGE_HTML, index)).toBe('broken'); + }); +}); + describe('gitIgnoreSafetyIssue', () => { it('returns null when skipGitIgnoreCheck is true', () => { expect( diff --git a/packages/resources/test/metadata-schema.test.ts b/packages/resources/test/metadata-schema.test.ts index 4ef520ce..b63b7ab6 100644 --- a/packages/resources/test/metadata-schema.test.ts +++ 
b/packages/resources/test/metadata-schema.test.ts @@ -1,6 +1,5 @@ import { describe, it, expect } from 'vitest'; - import { ResourceMetadataSchema } from '../src/schemas/resource-metadata.js'; describe('ResourceMetadataSchema with checksum', () => { @@ -63,3 +62,30 @@ describe('ResourceMetadataSchema with checksum', () => { } }); }); + +describe('ResourceMetadataSchema HTML fields', () => { + const base = { + id: 'x', + filePath: '/abs/x.html', + links: [], + headings: [], + sizeBytes: 1, + estimatedTokenCount: 1, + modifiedAt: new Date(), + checksum: 'a'.repeat(64), + }; + + it('accepts optional anchors and parseErrors', () => { + const parsed = ResourceMetadataSchema.parse({ + ...base, + anchors: ['intro'], + parseErrors: [{ message: 'missing-end-tag', line: 3 }], + }); + expect(parsed.anchors).toEqual(['intro']); + expect(parsed.parseErrors?.[0]?.message).toBe('missing-end-tag'); + }); + + it('rejects unknown top-level fields (strict)', () => { + expect(() => ResourceMetadataSchema.parse({ ...base, bogus: 1 })).toThrow(); + }); +}); diff --git a/packages/resources/test/test-helpers.ts b/packages/resources/test/test-helpers.ts index f5cc275f..bc1ecfd6 100644 --- a/packages/resources/test/test-helpers.ts +++ b/packages/resources/test/test-helpers.ts @@ -5,7 +5,7 @@ import { spawnSync } from 'node:child_process'; import { readFileSync } from 'node:fs'; -import { mkdtemp, rm, writeFile } from 'node:fs/promises'; +import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises'; import path from 'node:path'; import { normalizedTmpdir, safePath } from '@vibe-agent-toolkit/utils'; @@ -184,6 +184,54 @@ export function setupTempDirTestSuite(testPrefix: string): { return suite; } +/** + * Setup a suite-scoped temp directory with per-test subdirectories. 
+ * + * Creates a single `suiteDir` once for the whole `describe` block, then + * allocates a numbered subdirectory (`test-1`, `test-2`, …) for each test so + * that tests are isolated without paying the cost of creating and deleting a + * fresh top-level temp directory for every test. The suite directory is + * removed in `afterAll`. + * + * Lifecycle wiring (call inside the target `describe` block): + * ```typescript + * const suite = setupSubdirTestSuite('my-suite-'); + * beforeAll(suite.beforeAll); + * afterAll(suite.afterAll); + * beforeEach(suite.beforeEach); + * ``` + * + * @param suitePrefix - Prefix for the suite-level temp directory + * @returns Object with `suiteDir`/`tempDir` refs and lifecycle hook callbacks + */ +export function setupSubdirTestSuite(suitePrefix: string): { + suiteDir: string; + tempDir: string; + beforeAll: () => Promise<void>; + afterAll: () => Promise<void>; + beforeEach: () => Promise<void>; +} { + let testCounter = 0; + const suite = { + suiteDir: '', + tempDir: '', + beforeAll: async () => { + suite.suiteDir = await mkdtemp(safePath.join(normalizedTmpdir(), suitePrefix)); + }, + afterAll: async () => { + await rm(suite.suiteDir, { recursive: true, force: true }); + }, + beforeEach: async () => { + testCounter++; + suite.tempDir = safePath.join(suite.suiteDir, `test-${testCounter}`); + // eslint-disable-next-line security/detect-non-literal-fs-filename -- suite.tempDir is constructed from mkdtemp output, safe in test context + await mkdir(suite.tempDir, { recursive: true }); + }, + }; + + return suite; +} + /** * Setup external link validator test suite with temp directory and validator. * @@ -240,20 +288,6 @@ export function createLink( }; } -/** - * Helper to create a simple heading tree - */ -export function createHeadings( - ...headings: Array<{ text: string; slug: string; level?: number; children?: HeadingNode[] }> -): HeadingNode[] { - return headings.map((h) => ({ - text: h.text, - slug: h.slug, - level: h.level ?? 
2, - children: h.children, - })); -} - /** * Options for validating a link with expected results */ @@ -262,8 +296,8 @@ export interface ValidateLinkOptions { sourceFile: string; /** Link to validate */ link: ResourceLink; - /** Headings map for validation */ - headingsMap: Map<string, HeadingNode[]>; + /** Fragment index for anchor validation (file path → set of valid fragments) */ + headingsMap: Map<string, Set<string>>; /** Expected validation result (null = valid, object = error) */ expected: null | { code: ValidationIssue['code']; From eb2cc54762af85f97cff783498795c495c3085a4 Mon Sep 17 00:00:00 2001 From: David Daniel Gonzalez Date: Wed, 3 Jun 2026 17:15:03 -0400 Subject: [PATCH 2/9] feat(agent-skills): rewrite links in bundled HTML resources (#112) Make HTML resources participate in linkFollowDepth bundling like markdown: - html-transform.ts: rewriteHtmlLinks offset-splices `<a href>`/`<img src>` values at parse5-reported offsets and never re-serializes, so unchanged input round-trips byte-for-byte; preserves original quoting (keeps unquoted values unquoted unless the new value would break syntax) - skill-packager: format-aware copy guard routes .html/.htm through rewriteHtmlLinks (no frontmatter parsing); .md path unchanged; other files still binary-copied. 
Reuses the shared href-resolution callback (buildFrontmatterHrefRewriter renamed buildHrefRewriter) - Unit tests (round-trip fidelity, quoting, escaping) + packager integration test (bundled HTML link rewritten, markup preserved) --- packages/agent-skills/src/skill-packager.ts | 34 ++++- ...er-frontmatter-rewrite.integration.test.ts | 7 +- .../packager-html-rewrite.integration.test.ts | 74 +++++++++++ .../test/integration/packager-test-helpers.ts | 29 +++++ packages/resources/src/html-transform.ts | 122 ++++++++++++++++++ packages/resources/src/index.ts | 4 + .../resources/test/html-transform.test.ts | 48 +++++++ 7 files changed, 307 insertions(+), 11 deletions(-) create mode 100644 packages/agent-skills/test/integration/packager-html-rewrite.integration.test.ts create mode 100644 packages/agent-skills/test/integration/packager-test-helpers.ts create mode 100644 packages/resources/src/html-transform.ts create mode 100644 packages/resources/test/html-transform.test.ts diff --git a/packages/agent-skills/src/skill-packager.ts b/packages/agent-skills/src/skill-packager.ts index 86074d4b..9938abb4 100644 --- a/packages/agent-skills/src/skill-packager.ts +++ b/packages/agent-skills/src/skill-packager.ts @@ -31,6 +31,7 @@ import { openFrontmatter, resolveLocalHref, rewriteFrontmatterUriReferencesFromSchema, + rewriteHtmlLinks, transformContent, type LinkRewriteRule, type ParseResult, @@ -1031,8 +1032,12 @@ async function copyAndRewriteFile( // eslint-disable-next-line security/detect-non-literal-fs-filename -- targetPath is constructed from validated paths await mkdir(dirname(targetPath), { recursive: true }); - // Non-markdown files or rewriting disabled: plain binary copy - if (!sourcePath.endsWith('.md') || !ctx.rewriteLinks) { + const lower = sourcePath.toLowerCase(); + const isMarkdown = lower.endsWith('.md'); + const isHtml = lower.endsWith('.html') || lower.endsWith('.htm'); + + // Non-rewritable files or rewriting disabled: plain binary copy + if ((!isMarkdown && 
!isHtml) || !ctx.rewriteLinks) { await copyFile(sourcePath, targetPath); return; } @@ -1045,12 +1050,29 @@ async function copyAndRewriteFile( const resource = ctx.fromRegistry.getResource(safePath.resolve(sourcePath)); if (!resource) { - // Resource not in registry — write content as-is + // Resource not in registry — write content as-is. For HTML this is only + // reachable on an ID collision (e.g. page.html + page.md), where the asset + // is copied verbatim and its links are NOT rewritten (v1 limitation, + // mirrors the pre-existing asset-collision behavior). // eslint-disable-next-line security/detect-non-literal-fs-filename -- targetPath is constructed from validated paths await writeFile(targetPath, content, 'utf-8'); return; } + // HTML: offset-splice link rewrite (no frontmatter, no template body rewrite). + if (isHtml) { + const rewriteHref = buildHrefRewriter( + ctx.fromRegistry, + ctx.toRegistry, + sourcePath, + targetPath, + ctx.projectRoot, + ); + // eslint-disable-next-line security/detect-non-literal-fs-filename -- targetPath is constructed from validated paths + await writeFile(targetPath, rewriteHtmlLinks(content, rewriteHref), 'utf-8'); + return; + } + // Parse once via FrontmatterEditor so comments survive any frontmatter // rewrites. The body is held verbatim; we run it through transformContent // for the existing rule/template body-link rewrite contract (unchanged). @@ -1072,7 +1094,7 @@ async function copyAndRewriteFile( (id) => ctx.collectionSchemas.has(id), ); if (matchingCollections.length > 0) { - const rewriteHref = buildFrontmatterHrefRewriter( + const rewriteHref = buildHrefRewriter( ctx.fromRegistry, ctx.toRegistry, sourcePath, @@ -1092,7 +1114,7 @@ async function copyAndRewriteFile( } /** - * Build the per-href rewrite callback used for frontmatter URI-refs. + * Build the per-href rewrite callback used for frontmatter URI-refs and HTML attributes. 
* * Mirrors the body-rewrite path so frontmatter and body link rewriting agree * on target paths: @@ -1107,7 +1129,7 @@ async function copyAndRewriteFile( * * Returns the original href when no rewrite applies. */ -function buildFrontmatterHrefRewriter( +function buildHrefRewriter( fromRegistry: WalkableRegistry, toRegistry: ResourceRegistry, sourcePath: string, diff --git a/packages/agent-skills/test/integration/packager-frontmatter-rewrite.integration.test.ts b/packages/agent-skills/test/integration/packager-frontmatter-rewrite.integration.test.ts index 34f75872..bf30b5f7 100644 --- a/packages/agent-skills/test/integration/packager-frontmatter-rewrite.integration.test.ts +++ b/packages/agent-skills/test/integration/packager-frontmatter-rewrite.integration.test.ts @@ -4,7 +4,7 @@ import { mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; import { mkdirSyncReal, normalizedTmpdir, safePath } from '@vibe-agent-toolkit/utils'; import { afterAll, beforeAll, describe, expect, it } from 'vitest'; -import { packageSkill } from '../../src/skill-packager.js'; +import { buildExampleSkill } from './packager-test-helpers.js'; const SCHEMA = { type: 'object', @@ -76,10 +76,7 @@ describe('packager rewrites frontmatter URI-refs with body parity (Gap 3)', () = }); it('rewrites frontmatter URI-refs to packaged output paths and preserves comments', async () => { - const skillPath = safePath.join(projectRoot, 'skills', 'example', 'SKILL.md'); - const outputPath = safePath.join(projectRoot, 'dist', 'example'); - const result = await packageSkill(skillPath, { outputPath, formats: ['directory'] }); - expect(result.hasErrors).toBe(false); + const { outputPath } = await buildExampleSkill(projectRoot); const builtSkill = readFileSync(safePath.join(outputPath, 'SKILL.md'), 'utf-8'); diff --git a/packages/agent-skills/test/integration/packager-html-rewrite.integration.test.ts b/packages/agent-skills/test/integration/packager-html-rewrite.integration.test.ts new file mode 
100644 index 00000000..f6ce4140 --- /dev/null +++ b/packages/agent-skills/test/integration/packager-html-rewrite.integration.test.ts @@ -0,0 +1,74 @@ +/* eslint-disable security/detect-non-literal-fs-filename -- Test code with temp directories */ +import { mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; + +import { mkdirSyncReal, normalizedTmpdir, safePath } from '@vibe-agent-toolkit/utils'; +import { afterAll, beforeAll, describe, expect, it } from 'vitest'; + +import { buildExampleSkill } from './packager-test-helpers.js'; + +describe('packager rewrites links inside bundled HTML resources', () => { + let projectRoot: string; + + beforeAll(() => { + projectRoot = mkdtempSync(safePath.join(normalizedTmpdir(), 'vat-packager-html-')); + // Lay out: + // /vibe-agent-toolkit.config.yaml + // /skills/example/SKILL.md (links to page.html) + // /skills/example/page.html (links back to SKILL.md, has a comment) + mkdirSyncReal(safePath.join(projectRoot, 'skills', 'example'), { recursive: true }); + + writeFileSync( + safePath.join(projectRoot, 'vibe-agent-toolkit.config.yaml'), + ['version: 1', ''].join('\n'), + ); + writeFileSync( + safePath.join(projectRoot, 'skills', 'example', 'SKILL.md'), + [ + '---', + 'name: example', + 'description: A test skill that links to an HTML page.', + '---', + '# Example', + '', + 'See [the page](./page.html).', + '', + ].join('\n'), + ); + writeFileSync( + safePath.join(projectRoot, 'skills', 'example', 'page.html'), + [ + '', + '', + 'Test', + '', + '', + 'back', + '', + '', + '', + ].join('\n'), + ); + }); + + afterAll(() => { + rmSync(projectRoot, { recursive: true, force: true }); + }); + + it('rewrites links inside bundled HTML resources', async () => { + const { outputPath } = await buildExampleSkill(projectRoot); + + // Non-SKILL.md resources land in the 'resources/' subdirectory of the bundle + const html = readFileSync(safePath.join(outputPath, 'resources', 'page.html'), 'utf-8'); + + // HTML comment and 
surrounding markup is preserved verbatim + expect(html).toContain(''); + expect(html).toContain(''); + expect(html).toContain(''); + + // The source-relative href was rewritten to point at the bundled SKILL.md + expect(html).toMatch(/href="[^"]*SKILL\.md"/); + + // The original source-relative href is gone (it pointed to the source layout, not the output) + expect(html).not.toContain('href="./SKILL.md"'); + }); +}); diff --git a/packages/agent-skills/test/integration/packager-test-helpers.ts b/packages/agent-skills/test/integration/packager-test-helpers.ts new file mode 100644 index 00000000..9ef0b55f --- /dev/null +++ b/packages/agent-skills/test/integration/packager-test-helpers.ts @@ -0,0 +1,29 @@ +/** + * Shared helpers for packager integration tests. + * + * Extracted to eliminate duplicated setup across packager-*.integration.test.ts files. + */ + +import { safePath } from '@vibe-agent-toolkit/utils'; +import { expect } from 'vitest'; + +import { packageSkill } from '../../src/skill-packager.js'; + +/** + * Package the canonical `skills/example/SKILL.md` skill inside `projectRoot` + * into `dist/example` and assert that packaging succeeded without errors. + * + * Both HTML-rewrite and frontmatter-rewrite tests use the same skill path + * layout; this helper centralises the three-line setup that was duplicated + * across those test files. 
+ * + * @param projectRoot - Root directory of the temporary test project + * @returns `outputPath` (`<projectRoot>/dist/example`) for further assertions + */ +export async function buildExampleSkill(projectRoot: string): Promise<{ outputPath: string }> { + const skillPath = safePath.join(projectRoot, 'skills', 'example', 'SKILL.md'); + const outputPath = safePath.join(projectRoot, 'dist', 'example'); + const result = await packageSkill(skillPath, { outputPath, formats: ['directory'] }); + expect(result.hasErrors).toBe(false); + return { outputPath }; +} diff --git a/packages/resources/src/html-transform.ts b/packages/resources/src/html-transform.ts new file mode 100644 index 00000000..b87c70cc --- /dev/null +++ b/packages/resources/src/html-transform.ts @@ -0,0 +1,122 @@ +/** + * Structure-preserving HTML link rewriter. + * + * Rewrites `<a href>` and `<img src>` attribute VALUES in place by splicing the + * original source string at parse5-reported offsets. The document is never + * re-serialized (parse5's serializer normalizes whitespace, quotes, void + * elements, and the doctype), so unchanged input round-trips byte-for-byte. + * + * Uses the same `RewriteHref` callback model as `rewriteBodyLinks` — callers + * supply per-href target resolution; this module owns only the splice mechanics. + */ + +import { parseHtmlDocument, walkElements } from './html-link-parser.js'; +import type { RewriteHref } from './rewriter-helpers.js'; + +/** Tag name → the single link-bearing attribute we rewrite. */ +const LINK_ATTR_BY_TAG: Record<string, string> = { a: 'href', img: 'src' }; + +interface ValueSpan { + valueStart: number; + valueEnd: number; + /** `"` or `'` for quoted attributes; `''` for unquoted. */ + quote: string; +} + +interface Edit extends ValueSpan { + newValue: string; +} + +/** + * Chars that force an unquoted HTML attribute value to be quoted. 
Follows the + * WHATWG unquoted-attribute-value rules: whitespace, quotes, `<`, `>`, plus + * backtick and `=` (a superset of paths we expect, kept strict for safety). + */ +const UNQUOTED_UNSAFE = /[\s"'`<>=]/; + +/** + * Locate the value sub-range within an attribute's full source span. + * + * parse5 reports the whole-attribute span (`href="value"`); this finds the + * value's absolute offsets and the quote char. Returns undefined for boolean + * attributes (no `=`). + */ +function valueSpan(attrSource: string, base: number): ValueSpan | undefined { + const eq = attrSource.indexOf('='); + if (eq === -1) { + return undefined; + } + let i = eq + 1; + while (i < attrSource.length && /\s/.test(attrSource.charAt(i))) { + i += 1; + } + if (i >= attrSource.length) { + return undefined; + } + const ch = attrSource.charAt(i); + if (ch === '"' || ch === "'") { + const close = attrSource.indexOf(ch, i + 1); + const end = close === -1 ? attrSource.length : close; + return { valueStart: base + i + 1, valueEnd: base + end, quote: ch }; + } + return { valueStart: base + i, valueEnd: base + attrSource.length, quote: '' }; +} + +/** + * Encode a new value for writing. For quoted attributes the surrounding quotes + * stay in the source (we only replace the inner value), so we escape `&` and the + * active quote. For originally-unquoted values we keep them bare when safe, else + * wrap in double quotes. + */ +function encodeValue(newValue: string, quote: string): string { + if (quote === '"' || quote === "'") { + const amp = newValue.replaceAll('&', '&amp;'); + return quote === '"' ? amp.replaceAll('"', '&quot;') : amp.replaceAll("'", '&#39;'); + } + if (newValue.length > 0 && !UNQUOTED_UNSAFE.test(newValue)) { + return newValue.replaceAll('&', '&amp;'); + } + const escaped = newValue.replaceAll('&', '&amp;').replaceAll('"', '&quot;'); + return `"${escaped}"`; +} + +/** + * Rewrite `<a href>` / `<img src>` values in `source` using `rewriteHref`. + * Returns `source` unchanged (byte-for-byte) when no value changes. 
+ */ +export function rewriteHtmlLinks(source: string, rewriteHref: RewriteHref): string { + const { document } = parseHtmlDocument(source); + const edits: Edit[] = []; + + for (const element of walkElements(document)) { + const attrName = LINK_ATTR_BY_TAG[element.tagName]; + if (attrName === undefined) { + continue; + } + const attr = element.attrs.find((a) => a.name === attrName); + if (attr === undefined) { + continue; + } + const location = element.sourceCodeLocation?.attrs?.[attrName]; + if (location === undefined) { + continue; + } + const newValue = rewriteHref(attr.value); + if (newValue === attr.value) { + continue; + } + const span = valueSpan(source.slice(location.startOffset, location.endOffset), location.startOffset); + if (span === undefined) { + continue; + } + edits.push({ ...span, newValue }); + } + + // Apply descending by start offset so earlier edits don't shift later ones. + edits.sort((a, b) => b.valueStart - a.valueStart); + let result = source; + for (const edit of edits) { + result = result.slice(0, edit.valueStart) + encodeValue(edit.newValue, edit.quote) + result.slice(edit.valueEnd); + } + return result; +} diff --git a/packages/resources/src/index.ts b/packages/resources/src/index.ts index aadfcbe1..91fccccc 100644 --- a/packages/resources/src/index.ts +++ b/packages/resources/src/index.ts @@ -85,6 +85,10 @@ export { // Export parser interface for advanced use cases export { parseMarkdown, type ParseResult } from './link-parser.js'; +export { parseHtml } from './html-link-parser.js'; +export type { HtmlParseError } from './link-parser.js'; +export { rewriteHtmlLinks } from './html-transform.js'; + // Export frontmatter validation export { validateFrontmatter } from './frontmatter-validator.js'; diff --git a/packages/resources/test/html-transform.test.ts b/packages/resources/test/html-transform.test.ts new file mode 100644 index 00000000..93899346 --- /dev/null +++ b/packages/resources/test/html-transform.test.ts @@ -0,0 +1,48 @@ +import 
{ describe, expect, it } from 'vitest'; + +import { rewriteHtmlLinks } from '../src/html-transform.js'; + +const swap = (from: string, to: string) => (href: string) => (href === from ? to : href); + +describe('rewriteHtmlLinks', () => { + it('returns byte-identical source when nothing changes', () => { + const src = '\nx \n'; + expect(rewriteHtmlLinks(src, (h) => h)).toBe(src); + }); + + it('rewrites only the targeted href, preserving double quotes', () => { + const src = 'x'; + expect(rewriteHtmlLinks(src, swap('./old.html', './new.html'))).toBe( + 'x', + ); + }); + + it('preserves single quotes', () => { + const src = ""; + expect(rewriteHtmlLinks(src, swap('old.png', 'new.png'))).toBe(""); + }); + + it('keeps unquoted values unquoted when safe', () => { + const src = 'x'; + expect(rewriteHtmlLinks(src, swap('old.html', 'new.html'))).toBe('x'); + }); + + it('adds quotes to a previously-unquoted value only when unsafe', () => { + const src = 'x'; + expect(rewriteHtmlLinks(src, swap('old.html', 'new file.html'))).toBe( + 'x', + ); + }); + + it('escapes & and the active quote in the written value', () => { + const src = 'x'; + expect(rewriteHtmlLinks(src, swap('old', 'a&b"c'))).toBe('x'); + }); + + it('applies multiple rewrites without offset drift', () => { + const src = '12'; + const mapping: Record<string, string> = { 'a.html': 'x.html', 'b.html': 'y.html' }; + const rw = (h: string) => mapping[h] ?? 
h; + expect(rewriteHtmlLinks(src, rw)).toBe('12'); + }); +}); From d7100855aca72463d9663b2044fa9df824b45cc9 Mon Sep 17 00:00:00 2001 From: David Daniel Gonzalez Date: Wed, 3 Jun 2026 17:25:20 -0400 Subject: [PATCH 3/9] docs(changelog): add HTML resources entry (#112) --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7854fbd6..e8f3b5f9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **Cowork driver spike.** Added [`docs/contributing/cowork-driver-spike.md`](docs/contributing/cowork-driver-spike.md) — a time-boxed investigation (per §4a of the harness v2 design) of whether `claude-cowork` can be driven programmatically by the empirical compat harness today. Verdict: **not feasible**; cowork is a Claude Desktop app product with no public API/CLI surface. The `claude-cowork` runtime stays on `scripted-assisted` until Anthropic ships a Cowork CLI mode, Sessions API, or documented filesystem-import path. Adjacent finding (not a cowork replacement): the public-beta Skills API (`POST /v1/skills` + `container.skills[]` on `/v1/messages`) supports a fully-automatable *new* runtime — captured in the spike doc as a potential follow-up, gated on a separate design decision. - **Subscription-only compat harness billing.** The harness now bills a Claude Pro/Max subscription instead of the API: both token-consuming surfaces (the `claude-code` runtime driver and the LLM judge) route through one shared `claude` CLI invoker (`runtimes/shared/claude-cli.ts`) that injects the operator's `CLAUDE_CODE_OAUTH_TOKEN` and deletes every API credential from the child env, so the CLI cannot fall back to API billing. The operator's own token is sourced at preflight — env var if set, otherwise an interactive prompt — so a run only ever spends the operator's personal plan. 
The judge was migrated off `@anthropic-ai/sdk` (dependency removed) onto the CLI, parsing a strict JSON verdict with one retry instead of the SDK's forced-tool call (`judge-system.md` now asks for a JSON object). `RunMetadata` gains `authMode` and the report methodology discloses subscription auth + parsed-not-forced verdicts. Premise (zero API billing under the OAuth token) still pending the manual smoke test. +### Added + +- **First-class local HTML resources (#112).** `.html`/`.htm` files are now discovered, parsed, link- and anchor-validated, checked for well-formedness, and link-rewritten on bundle — using the same `ParseResult` contract and validation framework as markdown. A parse5-backed parser extracts `<a href>` and `<img src>` links plus `id`/`name` fragment anchors; `ResourceRegistry` routes HTML through it and persists optional `anchors`/`parseErrors` on `ResourceMetadata`. Anchor validation now uses a format-neutral fragment index (`Map<string, Set<string>>` of markdown heading slugs + HTML `id`/`name`), enabling cross-format anchor checks (md↔html) with HTML ids matched case-sensitively and markdown slugs case-insensitively. A new `MALFORMED_HTML` code (default `info`) surfaces parser well-formedness diagnostics. On bundle, `<a href>`/`<img src>` values are rewritten by offset-splicing the original source (never re-serialized), so unchanged markup round-trips byte-for-byte and original attribute quoting is preserved. Scope is `<a href>` + `<img src>` only; ``/`