From 20778d56892197e157c4f1416cf53d27b8b07199 Mon Sep 17 00:00:00 2001 From: Jeff Dutton Date: Mon, 18 May 2026 13:00:38 -0400 Subject: [PATCH 1/4] fix(extractors): preserve structured YAML in generic extractor output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The generic extractor's line-by-line keyword filter mangled structured YAML output from steps like `vat resources validate`. It kept lines containing error keywords (which matched YAML keys `errors:` and `errorSummary:`) but dropped their value lines, producing orphaned keys and stacked empty `errors:` blocks in extraction.errorSummary. Two detection paths added ahead of the keyword filter: 1. Delimited block extraction: `---` followed by a content-bearing YAML line. Walks forward, stopping inclusively at closing `---`/`...` or exclusively at first non-YAML-structural line. Extracts only the YAML slice; preamble and trailing log noise are dropped. 2. Multi-key heuristic for output without document markers: >=3 top-level YAML keys + >=5 meaningful lines + no log indicators (Traceback, npm ERR!, stack-frame `at /`, caret markers). Preserves the full denoised output. Both paths produce errorSummary with an 80-line cap and a truncation marker. Free-form logs fall through to the existing keyword filter unchanged. Per-line regex set is anchored, ReDoS-safe, and split into individual literals (no nested quantifiers, no overlapping alternation). 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/extractors/generic/index.test.ts | 201 ++++++++++++++++++ .../src/extractors/generic/index.ts | 186 +++++++++++++++- 2 files changed, 380 insertions(+), 7 deletions(-) diff --git a/packages/extractors/src/extractors/generic/index.test.ts b/packages/extractors/src/extractors/generic/index.test.ts index fe6916e6..81e35913 100644 --- a/packages/extractors/src/extractors/generic/index.test.ts +++ b/packages/extractors/src/extractors/generic/index.test.ts @@ -114,6 +114,207 @@ FAIL example.com/project 0.123s }); }); + describe('Structured YAML output preservation', () => { + const vatYamlOutput = `status: failed +filesScanned: 248 +filesWithErrors: 1 +errorsFound: 1 +errorSummary: + broken_file: 1 +durationSecs: 1.45 +validationMode: strict +collections: + adrs: + resourceCount: 5 + hasSchema: true + validationMode: permissive + processes: + resourceCount: 10 + hasSchema: true + validationMode: permissive + systems: + resourceCount: 28 + hasSchema: true + validationMode: permissive + teams: + resourceCount: 2 + hasSchema: true + validationMode: permissive +errors: + - file: /fixtures/repo/CLAUDE.md + errors: + - line: 28 + column: 1 + type: broken_file + message: "Link target is a directory: /fixtures/repo/docs/teams" +`; + + it('should preserve YAML value lines, not just keys', () => { + const result = extractGenericErrors(vatYamlOutput); + + expect(result.errorSummary).toContain('broken_file: 1'); + expect(result.errorSummary).toContain('/fixtures/repo/CLAUDE.md'); + expect(result.errorSummary).toContain('line: 28'); + expect(result.errorSummary).toContain('message: "Link target is a directory'); + }); + + it('should preserve all top-level YAML keys', () => { + const result = extractGenericErrors(vatYamlOutput); + + expect(result.errorSummary).toContain('status:'); + expect(result.errorSummary).toContain('filesScanned:'); + 
expect(result.errorSummary).toContain('filesWithErrors:'); + expect(result.errorSummary).toContain('errorsFound:'); + expect(result.errorSummary).toContain('errorSummary:'); + expect(result.errorSummary).toContain('errors:'); + }); + + it('should report Command failed summary when YAML output indicates failure', () => { + const result = extractGenericErrors(vatYamlOutput); + expect(result.summary).toBe('Command failed - see output'); + }); + + it('should truncate structured YAML output longer than 80 lines', () => { + const header = 'status: failed\nphase: build\nstep: compile\n'; + const longBody = Array.from({ length: 200 }, (_, i) => ` key${i}: value${i}`).join('\n'); + const longYaml = `${header}details:\n${longBody}\n`; + + const result = extractGenericErrors(longYaml); + + const lines = (result.errorSummary ?? '').split('\n'); + expect(lines.length).toBeLessThanOrEqual(81); + expect(lines.at(-1)).toContain('truncated at 80 lines'); + }); + }); + + describe('Delimited (---bracketed) YAML extraction', () => { + it('detects ---bracketed YAML and extracts only the YAML block', () => { + const input = `preamble line 1 +> npm install +--- +status: ok +key: value +--- +trailing log line +more trailing +`; + const result = extractGenericErrors(input); + + expect(result.errorSummary).toContain('---'); + expect(result.errorSummary).toContain('status: ok'); + expect(result.errorSummary).toContain('key: value'); + expect(result.errorSummary).not.toContain('preamble line 1'); + expect(result.errorSummary).not.toContain('> npm install'); + expect(result.errorSummary).not.toContain('trailing log line'); + expect(result.errorSummary).not.toContain('more trailing'); + }); + + it('stops at non-YAML line when no closing --- exists', () => { + const input = `--- +foo: 1 +bar: 2 +this is not yaml +more log +`; + const result = extractGenericErrors(input); + + expect(result.errorSummary).toContain('---'); + expect(result.errorSummary).toContain('foo: 1'); + 
expect(result.errorSummary).toContain('bar: 2'); + expect(result.errorSummary).not.toContain('this is not yaml'); + expect(result.errorSummary).not.toContain('more log'); + }); + + it('comments and blank lines between --- and first key are allowed', () => { + const input = `--- +# header comment + +key: value +`; + const result = extractGenericErrors(input); + + expect(result.errorSummary).toContain('---'); + expect(result.errorSummary).toContain('# header comment'); + expect(result.errorSummary).toContain('key: value'); + }); + + it('--- alone followed by a non-YAML line is rejected (falls through)', () => { + const input = `--- +this is garbage not yaml +more stuff +`; + const result = extractGenericErrors(input); + + // YAML preservation path would set summary to 'Command failed - see output'. + // Falling through to keyword filter with no error keywords yields 'No errors detected'. + expect(result.summary).toBe('No errors detected'); + }); + + it('closing ... terminates the block', () => { + const input = `--- +key: value +... 
+after the ellipsis line +`; + const result = extractGenericErrors(input); + + expect(result.errorSummary).toContain('---'); + expect(result.errorSummary).toContain('key: value'); + expect(result.errorSummary).toContain('...'); + expect(result.errorSummary).not.toContain('after the ellipsis line'); + }); + + it('VAT-style output (no ---) still goes through the multi-key heuristic', () => { + const vatYamlOutput = `status: failed +filesScanned: 248 +filesWithErrors: 1 +errorsFound: 1 +errorSummary: + broken_file: 1 +durationSecs: 1.45 +validationMode: strict +collections: + adrs: + resourceCount: 5 + hasSchema: true + validationMode: permissive +errors: + - file: /fixtures/repo/CLAUDE.md + errors: + - line: 28 + column: 1 + type: broken_file + message: "Link target is a directory: /fixtures/repo/docs/teams" +`; + const result = extractGenericErrors(vatYamlOutput); + + expect(result.summary).toBe('Command failed - see output'); + expect(result.errorSummary).toContain('status: failed'); + expect(result.errorSummary).toContain('filesScanned: 248'); + expect(result.errorSummary).toContain('/fixtures/repo/CLAUDE.md'); + expect(result.errorSummary).toContain('line: 28'); + }); + }); + + describe('Non-YAML output still keyword-filtered', () => { + it('should not include non-error log lines for free-form logs', () => { + const pytestLog = `Loading config from /etc/pytest.ini +Collecting tests +Loading plugin: coverage +Starting test session +FAILED tests/test_foo.py::test_divide - ZeroDivisionError +FAILED tests/test_bar.py::test_validate - AssertionError +2 failed, 3 passed +Done`; + + const result = extractGenericErrors(pytestLog); + + expect(result.errorSummary).toContain('FAILED'); + expect(result.errorSummary).not.toContain('Loading config'); + expect(result.errorSummary).not.toContain('Loading plugin'); + }); + }); + describe('Plugin Samples', () => { it('should pass all registered samples', () => { for (const sample of genericExtractor.samples) { diff --git 
a/packages/extractors/src/extractors/generic/index.ts b/packages/extractors/src/extractors/generic/index.ts index 6943a4fa..e7969407 100644 --- a/packages/extractors/src/extractors/generic/index.ts +++ b/packages/extractors/src/extractors/generic/index.ts @@ -9,6 +9,8 @@ import type { ExtractorPlugin, DetectionResult, ErrorExtractorResult } from '../../types.js'; +const GENERIC_FALLBACK_PATTERN = 'Generic fallback'; + /** * Error keyword patterns for intelligent extraction */ @@ -33,17 +35,183 @@ const NOISE_PATTERNS = [ /^Already up[- ]to[- ]date/i, ]; +/** + * Max lines retained when preserving structured YAML output. + * Higher than the keyword-filter cap (20) because structured data needs + * surrounding context to remain meaningful. + */ +const YAML_MAX_LINES = 80; + +const TOP_LEVEL_YAML_KEY = /^[A-Za-z_][\w-]*:(?:\s|$)/; + +const LOG_INDICATORS = [ + 'Traceback (most recent', + 'npm ERR!', +]; + +/** + * Per-alternative regexes for YAML line classification. Each is anchored and + * matches a single non-overlapping leading-character class, so checking each + * in sequence is linear and ReDoS-safe (no nested quantifiers, no + * greedy/lazy interplay). + * + * Split into individual literals (rather than one fat alternation) to keep + * each pattern's complexity low and readable. + */ +const YAML_BLANK_LINE = /^\s*$/; // empty / whitespace-only line +const YAML_COMMENT_LINE = /^\s*#/; // comment +const YAML_DOC_MARKER = /^(?:---|\.\.\.)\s*$/; // document start/end marker +const YAML_INDENTED_LINE = /^[ \t]+\S/; // indented content (continuation/nested key/sequence value) +const YAML_ROOT_SEQUENCE = /^-(?: |$)/; // root-level sequence item ("- foo" or bare "-") +const YAML_PLAIN_ROOT_KEY = /^[A-Za-z_][\w.-]*\s*:(?: |$)/; // plain root key ("name:" or "name: value") +const YAML_QUOTED_ROOT_KEY = /^(?:"[^"\n]*"|'[^'\n]*')\s*:(?: |$)/; // quoted root key + +/** True if the line is content-bearing YAML (key or root sequence item). 
*/ +function isYamlContentLine(line: string): boolean { + return ( + YAML_ROOT_SEQUENCE.test(line) || + YAML_PLAIN_ROOT_KEY.test(line) || + YAML_QUOTED_ROOT_KEY.test(line) + ); +} + +/** True if the line "looks like YAML" structure (content, indent, marker, blank, or comment). */ +function isYamlStructuralLine(line: string): boolean { + return ( + YAML_BLANK_LINE.test(line) || + YAML_COMMENT_LINE.test(line) || + YAML_DOC_MARKER.test(line) || + YAML_INDENTED_LINE.test(line) || + isYamlContentLine(line) + ); +} + +/** + * After the opening `---`, verifies that the first non-blank/non-comment line + * is a content-bearing YAML line (key or root sequence item). + */ +function hasYamlContentAfter(lines: string[], startIndex: number): boolean { + for (const line of lines.slice(startIndex)) { + if (line.trim() === '') continue; + if (YAML_COMMENT_LINE.test(line)) continue; + return isYamlContentLine(line); + } + return false; +} + +/** + * Collects lines starting at `openingIndex` (the opening `---`) forward, + * stopping inclusively at a closing doc marker (`---` / `...`) or exclusively + * before the first non-YAML-structural line. + */ +function collectYamlBlockLines(lines: string[], openingIndex: number): string[] { + const kept: string[] = [lines[openingIndex]]; + for (const line of lines.slice(openingIndex + 1)) { + const trimmed = line.trim(); + if (trimmed === '---' || trimmed === '...') { + kept.push(line); + return kept; + } + if (!isYamlStructuralLine(line)) return kept; + kept.push(line); + } + return kept; +} + +/** + * Detects a `---`-bracketed YAML block embedded in noisy output and extracts + * only the YAML portion (opening `---` through closing doc marker or first + * non-YAML line). Returns null if no valid block is found. 
+ */ +function extractDelimitedYamlBlock(lines: string[]): string[] | null { + const openingIndex = lines.findIndex((line) => line.trim() === '---'); + if (openingIndex < 0) return null; + if (!hasYamlContentAfter(lines, openingIndex + 1)) return null; + + const kept = collectYamlBlockLines(lines, openingIndex); + if (kept.length < 3) return null; + return kept; +} + +/** + * Heuristic: does this output look like structured YAML rather than a free-form log? + * + * Free-form logs occasionally contain `key:` fragments, so we require multiple + * top-level keys plus the absence of log-line giveaways (stack traces, npm + * errors, file-path "at /..." frames, caret indicators). + */ +function looksLikeYaml(lines: string[]): boolean { + let topLevelKeyCount = 0; + let meaningfulLineCount = 0; + + for (const line of lines) { + if (line.trim() === '') continue; + if (NOISE_PATTERNS.some((pattern) => pattern.test(line))) continue; + meaningfulLineCount++; + + if (TOP_LEVEL_YAML_KEY.test(line)) { + topLevelKeyCount++; + } + + if (LOG_INDICATORS.some((indicator) => line.includes(indicator))) { + return false; + } + if (/^\s*at \//.test(line)) return false; + if (/^\s*\^+\s*$/.test(line)) return false; + } + + return topLevelKeyCount >= 3 && meaningfulLineCount >= 5; +} + /** * Generic extractor always accepts (lowest priority fallback) */ export function detectGeneric(_output: string): DetectionResult { return { confidence: 10, // Lowest priority - patterns: ['Generic fallback'], + patterns: [GENERIC_FALLBACK_PATTERN], reason: 'Fallback extractor for unknown formats', }; } +/** + * Shared detection metadata block used by both the YAML-preservation and + * keyword-filter result paths. Kept in sync with `detectGeneric` confidence. 
+ */ +function genericDetectionMetadata() { + return { + extractor: 'generic', + confidence: 10, + patterns: [GENERIC_FALLBACK_PATTERN], + reason: 'Fallback extractor', + }; +} + +function buildYamlResult(lines: string[]): ErrorExtractorResult { + const denoised = lines.filter( + (line) => line.trim() === '' || !NOISE_PATTERNS.some((pattern) => pattern.test(line)) + ); + const truncated = denoised.length > YAML_MAX_LINES; + const kept = truncated ? denoised.slice(0, YAML_MAX_LINES) : denoised; + const errorSummary = truncated + ? `${kept.join('\n')}\n... (truncated at ${YAML_MAX_LINES} lines)` + : kept.join('\n'); + + return { + errors: [], + summary: 'Command failed - see output', + totalErrors: 0, + guidance: 'Review the output above and fix the errors', + errorSummary, + metadata: { + detection: genericDetectionMetadata(), + confidence: 50, + completeness: 50, + issues: [], + }, + }; +} + /** * Generic error extractor (fallback) * @@ -56,6 +224,15 @@ export function detectGeneric(_output: string): DetectionResult { */ export function extractGeneric(output: string, _command?: string): ErrorExtractorResult { const lines = output.split('\n'); + + const delimited = extractDelimitedYamlBlock(lines); + if (delimited) { + return buildYamlResult(delimited); + } + if (looksLikeYaml(lines)) { + return buildYamlResult(lines); + } + const relevantLines: string[] = []; for (const line of lines) { @@ -103,12 +280,7 @@ export function extractGeneric(output: string, _command?: string): ErrorExtracto guidance: hasErrors ? 
'Review the output above and fix the errors' : '', errorSummary, metadata: { - detection: { - extractor: 'generic', - confidence: 10, - patterns: ['Generic fallback'], - reason: 'Fallback extractor', - }, + detection: genericDetectionMetadata(), confidence: 50, completeness: 50, issues: [], From e8f0870aa5fcc552fa371f92539629ff0245ae20 Mon Sep 17 00:00:00 2001 From: Jeff Dutton Date: Mon, 18 May 2026 13:11:59 -0400 Subject: [PATCH 2/4] test(extractors): collapse repeated errorSummary assertions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add module-scope expectErrorSummary({has, missing}) helper to remove duplicated toContain/not.toContain chains across the new YAML extraction tests. Sonar flagged 14.5% duplication on the test file (48 lines, 2 overlapping blocks at lines 191-238 in the prior commit). No behavior change. All 21 generic-extractor tests pass. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/extractors/generic/index.test.ts | 88 +++++++++---------- 1 file changed, 42 insertions(+), 46 deletions(-) diff --git a/packages/extractors/src/extractors/generic/index.test.ts b/packages/extractors/src/extractors/generic/index.test.ts index 81e35913..ab92b571 100644 --- a/packages/extractors/src/extractors/generic/index.test.ts +++ b/packages/extractors/src/extractors/generic/index.test.ts @@ -10,11 +10,24 @@ import { expectDetection, expectPluginMetadata, } from '../../test/helpers/extractor-test-helpers.js'; +import type { ErrorExtractorResult } from '../../types.js'; import genericExtractor from './index.js'; const { extract: extractGenericErrors } = genericExtractor; +function expectErrorSummary( + result: ErrorExtractorResult, + expectations: { has?: string[]; missing?: string[] } +): void { + for (const text of expectations.has ?? []) { + expect(result.errorSummary).toContain(text); + } + for (const text of expectations.missing ?? 
[]) { + expect(result.errorSummary).not.toContain(text); + } +} + describe('Generic Extractor Plugin', () => { describe('detect', () => { it('should always return low confidence (fallback)', () => { @@ -150,23 +163,20 @@ errors: `; it('should preserve YAML value lines, not just keys', () => { - const result = extractGenericErrors(vatYamlOutput); - - expect(result.errorSummary).toContain('broken_file: 1'); - expect(result.errorSummary).toContain('/fixtures/repo/CLAUDE.md'); - expect(result.errorSummary).toContain('line: 28'); - expect(result.errorSummary).toContain('message: "Link target is a directory'); + expectErrorSummary(extractGenericErrors(vatYamlOutput), { + has: [ + 'broken_file: 1', + '/fixtures/repo/CLAUDE.md', + 'line: 28', + 'message: "Link target is a directory', + ], + }); }); it('should preserve all top-level YAML keys', () => { - const result = extractGenericErrors(vatYamlOutput); - - expect(result.errorSummary).toContain('status:'); - expect(result.errorSummary).toContain('filesScanned:'); - expect(result.errorSummary).toContain('filesWithErrors:'); - expect(result.errorSummary).toContain('errorsFound:'); - expect(result.errorSummary).toContain('errorSummary:'); - expect(result.errorSummary).toContain('errors:'); + expectErrorSummary(extractGenericErrors(vatYamlOutput), { + has: ['status:', 'filesScanned:', 'filesWithErrors:', 'errorsFound:', 'errorSummary:', 'errors:'], + }); }); it('should report Command failed summary when YAML output indicates failure', () => { @@ -198,15 +208,10 @@ key: value trailing log line more trailing `; - const result = extractGenericErrors(input); - - expect(result.errorSummary).toContain('---'); - expect(result.errorSummary).toContain('status: ok'); - expect(result.errorSummary).toContain('key: value'); - expect(result.errorSummary).not.toContain('preamble line 1'); - expect(result.errorSummary).not.toContain('> npm install'); - expect(result.errorSummary).not.toContain('trailing log line'); - 
expect(result.errorSummary).not.toContain('more trailing'); + expectErrorSummary(extractGenericErrors(input), { + has: ['---', 'status: ok', 'key: value'], + missing: ['preamble line 1', '> npm install', 'trailing log line', 'more trailing'], + }); }); it('stops at non-YAML line when no closing --- exists', () => { @@ -216,13 +221,10 @@ bar: 2 this is not yaml more log `; - const result = extractGenericErrors(input); - - expect(result.errorSummary).toContain('---'); - expect(result.errorSummary).toContain('foo: 1'); - expect(result.errorSummary).toContain('bar: 2'); - expect(result.errorSummary).not.toContain('this is not yaml'); - expect(result.errorSummary).not.toContain('more log'); + expectErrorSummary(extractGenericErrors(input), { + has: ['---', 'foo: 1', 'bar: 2'], + missing: ['this is not yaml', 'more log'], + }); }); it('comments and blank lines between --- and first key are allowed', () => { @@ -231,11 +233,9 @@ more log key: value `; - const result = extractGenericErrors(input); - - expect(result.errorSummary).toContain('---'); - expect(result.errorSummary).toContain('# header comment'); - expect(result.errorSummary).toContain('key: value'); + expectErrorSummary(extractGenericErrors(input), { + has: ['---', '# header comment', 'key: value'], + }); }); it('--- alone followed by a non-YAML line is rejected (falls through)', () => { @@ -256,12 +256,10 @@ key: value ... 
after the ellipsis line `; - const result = extractGenericErrors(input); - - expect(result.errorSummary).toContain('---'); - expect(result.errorSummary).toContain('key: value'); - expect(result.errorSummary).toContain('...'); - expect(result.errorSummary).not.toContain('after the ellipsis line'); + expectErrorSummary(extractGenericErrors(input), { + has: ['---', 'key: value', '...'], + missing: ['after the ellipsis line'], + }); }); it('VAT-style output (no ---) still goes through the multi-key heuristic', () => { @@ -287,12 +285,10 @@ errors: message: "Link target is a directory: /fixtures/repo/docs/teams" `; const result = extractGenericErrors(vatYamlOutput); - expect(result.summary).toBe('Command failed - see output'); - expect(result.errorSummary).toContain('status: failed'); - expect(result.errorSummary).toContain('filesScanned: 248'); - expect(result.errorSummary).toContain('/fixtures/repo/CLAUDE.md'); - expect(result.errorSummary).toContain('line: 28'); + expectErrorSummary(result, { + has: ['status: failed', 'filesScanned: 248', '/fixtures/repo/CLAUDE.md', 'line: 28'], + }); }); }); From 5fc13509b6f163ad3c897687e9d46787577a431c Mon Sep 17 00:00:00 2001 From: Jeff Dutton Date: Mon, 18 May 2026 13:57:33 -0400 Subject: [PATCH 3/4] =?UTF-8?q?fix(extractors):=20address=20review=20findi?= =?UTF-8?q?ngs=20=E2=80=94=20anchored=20log=20indicators,=20marker=20safet?= =?UTF-8?q?y,=20test=20gaps?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Five fixes from PR #168 review: 1. Anchor LOG_INDICATORS via regex (`^Traceback \(most recent`, `^npm ERR!`) so that legitimate YAML containing these tokens inside a string value (e.g. `message: "Traceback in user code"`) is not mistaken for a log. 2. Run disqualifier checks before the noise filter in `looksLikeYaml` so `npm ERR!` — which is also a NOISE_PATTERN — can still disqualify a block instead of being silently skipped. 3. Replace the YAML truncation marker `... 
(truncated at 80 lines)` with `# ... (truncated, N additional lines omitted)`. The leading `# ` keeps the marker a YAML comment rather than a stray doc-end marker (`...`), and the count is now informative. 4. Tighten the JSDoc on the per-alternative YAML regexes to describe the actual ReDoS-safety property (bounded repetition over fixed character classes, no nested quantifiers or ambiguous alternation) rather than the inaccurate "non-overlapping leading-character class" phrasing. 5. Add seven tests in the YAML preservation suite: one per log-indicator disqualifier (Traceback, npm ERR!, stack frame, caret) and three threshold-boundary tests (exactly 3 keys + 5 lines preserved; 2 keys + 10 lines and 3 keys + 4 lines fall through). All 28 generic-extractor tests pass; full `pnpm validate` green. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/extractors/generic/index.test.ts | 144 +++++++++++++++++- .../src/extractors/generic/index.ts | 40 +++-- 2 files changed, 168 insertions(+), 16 deletions(-) diff --git a/packages/extractors/src/extractors/generic/index.test.ts b/packages/extractors/src/extractors/generic/index.test.ts index ab92b571..9a0547d0 100644 --- a/packages/extractors/src/extractors/generic/index.test.ts +++ b/packages/extractors/src/extractors/generic/index.test.ts @@ -184,6 +184,139 @@ errors: expect(result.summary).toBe('Command failed - see output'); }); + // ---- Log-indicator disqualifier tests ---------------------------------- + // + // Each input below satisfies the multi-key heuristic on its own + // (>= 3 top-level keys, >= 5 meaningful lines) AND contains exactly one + // log-indicator line. If the log indicator did not disqualify the input, + // `looksLikeYaml` would return true and `summary` would be set to + // 'Command failed - see output' (YAML preservation path). 
We detect + // "fell through to keyword filter" by asserting `summary` is the + // benign value the keyword filter produces when no error keywords match. + // + // Note: each input below intentionally uses error-keyword-free body lines + // (`status: ok`, `phase: build`, etc.) and a log indicator that is NOT + // itself an error keyword recognised by the keyword filter — so when the + // disqualifier rejects YAML preservation, the keyword filter sees no + // matches and reports 'No errors detected'. + + it('Traceback line disqualifies YAML preservation', () => { + // 'Traceback' is an ERROR_KEYWORD, so the keyword filter would still + // mark this as 'Command failed - see output'. Distinguish the two + // paths by asserting a body line that only YAML preservation keeps. + const input = `status: ok +phase: build +step: compile +detail: foo +note: bar +Traceback (most recent call last): +`; + const result = extractGenericErrors(input); + // YAML path would preserve `phase: build`; keyword filter drops it + // because it has no error keyword / file:line / summary token. + expectErrorSummary(result, { + missing: ['phase: build', 'detail: foo', 'note: bar'], + }); + }); + + it('npm ERR! line disqualifies YAML preservation', () => { + // `npm ERR!` is also a NOISE_PATTERN, so it gets stripped by both + // paths. The distinguishing signal is `summary`: YAML preservation + // sets 'Command failed - see output'; the keyword filter with no + // error keywords matched sets 'No errors detected'. + const input = `status: ok +phase: build +step: compile +detail: foo +note: bar +npm ERR! code ELIFECYCLE +`; + const result = extractGenericErrors(input); + expect(result.summary).toBe('No errors detected'); + }); + + it('stack frame ("at /...") line disqualifies YAML preservation', () => { + // 'at ' (with trailing space) is an ERROR_KEYWORD, so distinguish via + // a YAML-only body line as in the Traceback test above. 
+ const input = `status: ok +phase: build +step: compile +detail: foo +note: bar + at /home/user/app/index.js:42:9 +`; + const result = extractGenericErrors(input); + expectErrorSummary(result, { + missing: ['phase: build', 'detail: foo', 'note: bar'], + }); + }); + + it('caret indicator line disqualifies YAML preservation', () => { + // A bare `^^^^` line matches no error keyword, so `summary` cleanly + // distinguishes the two paths. + const input = `status: ok +phase: build +step: compile +detail: foo +note: bar + ^^^^ +`; + const result = extractGenericErrors(input); + expect(result.summary).toBe('No errors detected'); + }); + + // ---- Threshold-boundary tests ------------------------------------------ + + it('exactly 3 top-level keys + 5 meaningful lines is preserved as YAML', () => { + // 3 root keys (status, phase, step) + 2 indented continuation lines = + // exactly 5 meaningful lines. Indented lines do not count as + // top-level keys, so this lands exactly on both floors. + const input = `status: ok +phase: build +step: compile + detail: foo + note: bar +`; + const result = extractGenericErrors(input); + expect(result.summary).toBe('Command failed - see output'); + // Indented body lines are preserved on the YAML path; the keyword + // filter would drop them. + expectErrorSummary(result, { + has: ['status: ok', 'phase: build', 'detail: foo', 'note: bar'], + }); + }); + + it('2 top-level keys + 10 meaningful lines falls through to keyword filter', () => { + // Two top-level keys; remaining lines are indented (so they do NOT + // increment the top-level key count) but they are non-blank / + // non-noise, so meaningfulLineCount >= 5. 
+ const input = `status: ok +details: + one: 1 + two: 2 + three: 3 + four: 4 + five: 5 + six: 6 + seven: 7 + eight: 8 +`; + const result = extractGenericErrors(input); + expect(result.summary).toBe('No errors detected'); + }); + + it('3 top-level keys + 4 meaningful lines falls through to keyword filter', () => { + // 3 root keys (status, phase, step) + 1 indented continuation line = + // 4 meaningful lines total, below the 5-line floor. + const input = `status: ok +phase: build +step: compile + nested: thing +`; + const result = extractGenericErrors(input); + expect(result.summary).toBe('No errors detected'); + }); + it('should truncate structured YAML output longer than 80 lines', () => { const header = 'status: failed\nphase: build\nstep: compile\n'; const longBody = Array.from({ length: 200 }, (_, i) => ` key${i}: value${i}`).join('\n'); @@ -193,7 +326,16 @@ errors: const lines = (result.errorSummary ?? '').split('\n'); expect(lines.length).toBeLessThanOrEqual(81); - expect(lines.at(-1)).toContain('truncated at 80 lines'); + const lastLine = lines.at(-1) ?? ''; + // Marker must be a YAML comment (`# ...`) so downstream YAML parsers + // treat the truncation note as a comment rather than a doc-end marker. + expect(lastLine.startsWith('# ')).toBe(true); + // Header contributes 4 lines ("status:", "phase:", "step:", "details:"), + // plus 200 body lines, plus 1 trailing empty line from the closing `\n` + // (`buildYamlResult` keeps blank lines). 205 - 80 = 125 omitted. 
+ const denoisedTotal = 4 + 200 + 1; + const omitted = denoisedTotal - 80; + expect(lastLine).toContain(`${omitted} additional lines omitted`); }); }); diff --git a/packages/extractors/src/extractors/generic/index.ts b/packages/extractors/src/extractors/generic/index.ts index e7969407..49889cf6 100644 --- a/packages/extractors/src/extractors/generic/index.ts +++ b/packages/extractors/src/extractors/generic/index.ts @@ -44,19 +44,22 @@ const YAML_MAX_LINES = 80; const TOP_LEVEL_YAML_KEY = /^[A-Za-z_][\w-]*:(?:\s|$)/; +// Anchored at `^` so that legitimate YAML containing these tokens inside a +// string value (e.g. `message: "Traceback in user code"`) is not mistaken for +// a free-form log. const LOG_INDICATORS = [ - 'Traceback (most recent', - 'npm ERR!', + /^Traceback \(most recent/, + /^npm ERR!/, ]; /** - * Per-alternative regexes for YAML line classification. Each is anchored and - * matches a single non-overlapping leading-character class, so checking each - * in sequence is linear and ReDoS-safe (no nested quantifiers, no - * greedy/lazy interplay). + * Per-alternative regexes for YAML line classification. * - * Split into individual literals (rather than one fat alternation) to keep - * each pattern's complexity low and readable. + * Each pattern is anchored at `^`, uses only bounded repetition over fixed + * character classes, and has no nested quantifiers or ambiguous alternation — + * so each is O(line length) and ReDoS-safe. Split into individual literals + * (rather than one combined alternation) to keep each pattern small enough + * to audit by eye and to satisfy `sonarjs/regex-complexity`. 
*/ const YAML_BLANK_LINE = /^\s*$/; // empty / whitespace-only line const YAML_COMMENT_LINE = /^\s*#/; // comment @@ -146,18 +149,22 @@ function looksLikeYaml(lines: string[]): boolean { for (const line of lines) { if (line.trim() === '') continue; + + // Disqualifier checks run BEFORE the noise filter so that lines which + // happen to overlap a noise pattern (e.g. `npm ERR!` is both a noise + // pattern and a log indicator) can still disqualify the block. + if (LOG_INDICATORS.some((re) => re.test(line))) { + return false; + } + if (/^\s*at \//.test(line)) return false; + if (/^\s*\^+\s*$/.test(line)) return false; + if (NOISE_PATTERNS.some((pattern) => pattern.test(line))) continue; meaningfulLineCount++; if (TOP_LEVEL_YAML_KEY.test(line)) { topLevelKeyCount++; } - - if (LOG_INDICATORS.some((indicator) => line.includes(indicator))) { - return false; - } - if (/^\s*at \//.test(line)) return false; - if (/^\s*\^+\s*$/.test(line)) return false; } return topLevelKeyCount >= 3 && meaningfulLineCount >= 5; @@ -193,8 +200,11 @@ function buildYamlResult(lines: string[]): ErrorExtractorResult { ); const truncated = denoised.length > YAML_MAX_LINES; const kept = truncated ? denoised.slice(0, YAML_MAX_LINES) : denoised; + // Prefix the marker with `# ` so it remains valid YAML (a comment) rather + // than a stray document-end marker (`...`) that would terminate parsing. + const omitted = denoised.length - YAML_MAX_LINES; const errorSummary = truncated - ? `${kept.join('\n')}\n... (truncated at ${YAML_MAX_LINES} lines)` + ? `${kept.join('\n')}\n# ... 
(truncated, ${omitted} additional lines omitted)` : kept.join('\n'); return { From d083498e4c3148b6f84d17ed4e194a1ac58a723e Mon Sep 17 00:00:00 2001 From: Jeff Dutton Date: Mon, 18 May 2026 14:05:42 -0400 Subject: [PATCH 4/4] test(extractors): factor qualifying-YAML fixture for disqualifier tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Collapse the repeated 5-line YAML body used across the four log-indicator disqualifier tests and the threshold-boundary test. Module-scope qualifyingYamlPlus(trailer) helper plus a YAML_ONLY_BODY constant. Sonar flagged 20.7% duplication on the test file after the prior commit; this refactor brings the duplicated block under the gate. No behavior change. All 28 generic-extractor tests pass. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/extractors/generic/index.test.ts | 82 ++++++++----------- 1 file changed, 33 insertions(+), 49 deletions(-) diff --git a/packages/extractors/src/extractors/generic/index.test.ts b/packages/extractors/src/extractors/generic/index.test.ts index 9a0547d0..c0efc23b 100644 --- a/packages/extractors/src/extractors/generic/index.test.ts +++ b/packages/extractors/src/extractors/generic/index.test.ts @@ -28,6 +28,25 @@ function expectErrorSummary( } } +/** + * Build an input whose body on its own satisfies the multi-key YAML heuristic + * (at least 3 top-level keys and 5 meaningful lines), then append the given + * trailer line. Used by both the log-indicator disqualifier tests and the + * threshold-boundary tests. + */ +function qualifyingYamlPlus(trailer: string): string { + return `status: ok +phase: build +step: compile +detail: foo +note: bar +${trailer} +`; +} + +/** Body-only lines that ARE preserved by the YAML path but dropped by the keyword filter. 
*/ +const YAML_ONLY_BODY = ['phase: build', 'detail: foo', 'note: bar']; + describe('Generic Extractor Plugin', () => { describe('detect', () => { it('should always return low confidence (fallback)', () => { @@ -201,67 +220,32 @@ errors: // matches and reports 'No errors detected'. it('Traceback line disqualifies YAML preservation', () => { - // 'Traceback' is an ERROR_KEYWORD, so the keyword filter would still - // mark this as 'Command failed - see output'. Distinguish the two - // paths by asserting a body line that only YAML preservation keeps. - const input = `status: ok -phase: build -step: compile -detail: foo -note: bar -Traceback (most recent call last): -`; - const result = extractGenericErrors(input); - // YAML path would preserve `phase: build`; keyword filter drops it - // because it has no error keyword / file:line / summary token. - expectErrorSummary(result, { - missing: ['phase: build', 'detail: foo', 'note: bar'], + // 'Traceback' is itself an ERROR_KEYWORD, so the keyword filter would still + // set summary to 'Command failed'. Distinguish the two paths by asserting + // YAML-only body lines were dropped. + expectErrorSummary(extractGenericErrors(qualifyingYamlPlus('Traceback (most recent call last):')), { + missing: YAML_ONLY_BODY, }); }); it('npm ERR! line disqualifies YAML preservation', () => { - // `npm ERR!` is also a NOISE_PATTERN, so it gets stripped by both - // paths. The distinguishing signal is `summary`: YAML preservation - // sets 'Command failed - see output'; the keyword filter with no - // error keywords matched sets 'No errors detected'. - const input = `status: ok -phase: build -step: compile -detail: foo -note: bar -npm ERR! code ELIFECYCLE -`; - const result = extractGenericErrors(input); + // `npm ERR!` is also a NOISE_PATTERN, so both paths strip the line itself. + // Summary distinguishes: YAML path sets 'Command failed'; keyword filter + // with no matches sets 'No errors detected'. 
+ const result = extractGenericErrors(qualifyingYamlPlus('npm ERR! code ELIFECYCLE')); expect(result.summary).toBe('No errors detected'); }); it('stack frame ("at /...") line disqualifies YAML preservation', () => { - // 'at ' (with trailing space) is an ERROR_KEYWORD, so distinguish via - // a YAML-only body line as in the Traceback test above. - const input = `status: ok -phase: build -step: compile -detail: foo -note: bar - at /home/user/app/index.js:42:9 -`; - const result = extractGenericErrors(input); - expectErrorSummary(result, { - missing: ['phase: build', 'detail: foo', 'note: bar'], + // 'at ' is an ERROR_KEYWORD, so distinguish via YAML-only body lines. + expectErrorSummary(extractGenericErrors(qualifyingYamlPlus(' at /home/user/app/index.js:42:9')), { + missing: YAML_ONLY_BODY, }); }); it('caret indicator line disqualifies YAML preservation', () => { - // A bare `^^^^` line matches no error keyword, so `summary` cleanly - // distinguishes the two paths. - const input = `status: ok -phase: build -step: compile -detail: foo -note: bar - ^^^^ -`; - const result = extractGenericErrors(input); + // Bare `^^^^` matches no error keyword, so `summary` cleanly distinguishes. + const result = extractGenericErrors(qualifyingYamlPlus(' ^^^^')); expect(result.summary).toBe('No errors detected'); });