diff --git a/packages/extractors/src/extractors/generic/index.test.ts b/packages/extractors/src/extractors/generic/index.test.ts index fe6916e6..c0efc23b 100644 --- a/packages/extractors/src/extractors/generic/index.test.ts +++ b/packages/extractors/src/extractors/generic/index.test.ts @@ -10,11 +10,43 @@ import { expectDetection, expectPluginMetadata, } from '../../test/helpers/extractor-test-helpers.js'; +import type { ErrorExtractorResult } from '../../types.js'; import genericExtractor from './index.js'; const { extract: extractGenericErrors } = genericExtractor; +function expectErrorSummary( + result: ErrorExtractorResult, + expectations: { has?: string[]; missing?: string[] } +): void { + for (const text of expectations.has ?? []) { + expect(result.errorSummary).toContain(text); + } + for (const text of expectations.missing ?? []) { + expect(result.errorSummary).not.toContain(text); + } +} + +/** + * Build an input that on its own satisfies the multi-key YAML heuristic + * (3 top-level keys + 5 meaningful lines), then appends the given trailer + * line. Used by both the log-indicator disqualifier tests and the + * threshold-boundary tests. + */ +function qualifyingYamlPlus(trailer: string): string { + return `status: ok +phase: build +step: compile +detail: foo +note: bar +${trailer} +`; +} + +/** Body-only lines that ARE preserved by the YAML path but dropped by the keyword filter. 
*/ +const YAML_ONLY_BODY = ['phase: build', 'detail: foo', 'note: bar']; + describe('Generic Extractor Plugin', () => { describe('detect', () => { it('should always return low confidence (fallback)', () => { @@ -114,6 +146,297 @@ FAIL example.com/project 0.123s }); }); + describe('Structured YAML output preservation', () => { + const vatYamlOutput = `status: failed +filesScanned: 248 +filesWithErrors: 1 +errorsFound: 1 +errorSummary: + broken_file: 1 +durationSecs: 1.45 +validationMode: strict +collections: + adrs: + resourceCount: 5 + hasSchema: true + validationMode: permissive + processes: + resourceCount: 10 + hasSchema: true + validationMode: permissive + systems: + resourceCount: 28 + hasSchema: true + validationMode: permissive + teams: + resourceCount: 2 + hasSchema: true + validationMode: permissive +errors: + - file: /fixtures/repo/CLAUDE.md + errors: + - line: 28 + column: 1 + type: broken_file + message: "Link target is a directory: /fixtures/repo/docs/teams" +`; + + it('should preserve YAML value lines, not just keys', () => { + expectErrorSummary(extractGenericErrors(vatYamlOutput), { + has: [ + 'broken_file: 1', + '/fixtures/repo/CLAUDE.md', + 'line: 28', + 'message: "Link target is a directory', + ], + }); + }); + + it('should preserve all top-level YAML keys', () => { + expectErrorSummary(extractGenericErrors(vatYamlOutput), { + has: ['status:', 'filesScanned:', 'filesWithErrors:', 'errorsFound:', 'errorSummary:', 'errors:'], + }); + }); + + it('should report Command failed summary when YAML output indicates failure', () => { + const result = extractGenericErrors(vatYamlOutput); + expect(result.summary).toBe('Command failed - see output'); + }); + + // ---- Log-indicator disqualifier tests ---------------------------------- + // + // Each input below satisfies the multi-key heuristic on its own + // (>= 3 top-level keys, >= 5 meaningful lines) AND contains exactly one + // log-indicator line. 
If the log indicator did not disqualify the input, + // `looksLikeYaml` would return true and `summary` would be set to + // 'Command failed - see output' (YAML preservation path). We detect + // "fell through to keyword filter" by asserting `summary` is the + // benign value the keyword filter produces when no error keywords match. + // + // Note: each input below intentionally uses error-keyword-free body lines + // (`status: ok`, `phase: build`, etc.) and a log indicator that is NOT + // itself an error keyword recognised by the keyword filter — so when the + // disqualifier rejects YAML preservation, the keyword filter sees no + // matches and reports 'No errors detected'. + + it('Traceback line disqualifies YAML preservation', () => { + // 'Traceback' is itself an ERROR_KEYWORD, so the keyword filter would still + // set summary to 'Command failed'. Distinguish the two paths by asserting + // YAML-only body lines were dropped. + expectErrorSummary(extractGenericErrors(qualifyingYamlPlus('Traceback (most recent call last):')), { + missing: YAML_ONLY_BODY, + }); + }); + + it('npm ERR! line disqualifies YAML preservation', () => { + // `npm ERR!` is also a NOISE_PATTERN, so both paths strip the line itself. + // Summary distinguishes: YAML path sets 'Command failed'; keyword filter + // with no matches sets 'No errors detected'. + const result = extractGenericErrors(qualifyingYamlPlus('npm ERR! code ELIFECYCLE')); + expect(result.summary).toBe('No errors detected'); + }); + + it('stack frame ("at /...") line disqualifies YAML preservation', () => { + // 'at ' is an ERROR_KEYWORD, so distinguish via YAML-only body lines. + expectErrorSummary(extractGenericErrors(qualifyingYamlPlus(' at /home/user/app/index.js:42:9')), { + missing: YAML_ONLY_BODY, + }); + }); + + it('caret indicator line disqualifies YAML preservation', () => { + // Bare `^^^^` matches no error keyword, so `summary` cleanly distinguishes. 
+ const result = extractGenericErrors(qualifyingYamlPlus(' ^^^^')); + expect(result.summary).toBe('No errors detected'); + }); + + // ---- Threshold-boundary tests ------------------------------------------ + + it('exactly 3 top-level keys + 5 meaningful lines is preserved as YAML', () => { + // 3 root keys (status, phase, step) + 2 indented continuation lines = + // exactly 5 meaningful lines. Indented lines do not count as + // top-level keys, so this lands exactly on both floors. + const input = `status: ok +phase: build +step: compile + detail: foo + note: bar +`; + const result = extractGenericErrors(input); + expect(result.summary).toBe('Command failed - see output'); + // Indented body lines are preserved on the YAML path; the keyword + // filter would drop them. + expectErrorSummary(result, { + has: ['status: ok', 'phase: build', 'detail: foo', 'note: bar'], + }); + }); + + it('2 top-level keys + 10 meaningful lines falls through to keyword filter', () => { + // Two top-level keys; remaining lines are indented (so they do NOT + // increment the top-level key count) but they are non-blank / + // non-noise, so meaningfulLineCount >= 5. + const input = `status: ok +details: + one: 1 + two: 2 + three: 3 + four: 4 + five: 5 + six: 6 + seven: 7 + eight: 8 +`; + const result = extractGenericErrors(input); + expect(result.summary).toBe('No errors detected'); + }); + + it('3 top-level keys + 4 meaningful lines falls through to keyword filter', () => { + // 3 root keys (status, phase, step) + 1 indented continuation line = + // 4 meaningful lines total, below the 5-line floor. 
+ const input = `status: ok +phase: build +step: compile + nested: thing +`; + const result = extractGenericErrors(input); + expect(result.summary).toBe('No errors detected'); + }); + + it('should truncate structured YAML output longer than 80 lines', () => { + const header = 'status: failed\nphase: build\nstep: compile\n'; + const longBody = Array.from({ length: 200 }, (_, i) => ` key${i}: value${i}`).join('\n'); + const longYaml = `${header}details:\n${longBody}\n`; + + const result = extractGenericErrors(longYaml); + + const lines = (result.errorSummary ?? '').split('\n'); + expect(lines.length).toBeLessThanOrEqual(81); + const lastLine = lines.at(-1) ?? ''; + // Marker must be a YAML comment (`# ...`) so downstream YAML parsers + // treat the truncation note as a comment rather than a doc-end marker. + expect(lastLine.startsWith('# ')).toBe(true); + // Header contributes 4 lines ("status:", "phase:", "step:", "details:"), + // plus 200 body lines, plus 1 trailing empty line from the closing `\n` + // (`buildYamlResult` keeps blank lines). 205 - 80 = 125 omitted. 
+ const denoisedTotal = 4 + 200 + 1; + const omitted = denoisedTotal - 80; + expect(lastLine).toContain(`${omitted} additional lines omitted`); + }); + }); + + describe('Delimited (---bracketed) YAML extraction', () => { + it('detects ---bracketed YAML and extracts only the YAML block', () => { + const input = `preamble line 1 +> npm install +--- +status: ok +key: value +--- +trailing log line +more trailing +`; + expectErrorSummary(extractGenericErrors(input), { + has: ['---', 'status: ok', 'key: value'], + missing: ['preamble line 1', '> npm install', 'trailing log line', 'more trailing'], + }); + }); + + it('stops at non-YAML line when no closing --- exists', () => { + const input = `--- +foo: 1 +bar: 2 +this is not yaml +more log +`; + expectErrorSummary(extractGenericErrors(input), { + has: ['---', 'foo: 1', 'bar: 2'], + missing: ['this is not yaml', 'more log'], + }); + }); + + it('comments and blank lines between --- and first key are allowed', () => { + const input = `--- +# header comment + +key: value +`; + expectErrorSummary(extractGenericErrors(input), { + has: ['---', '# header comment', 'key: value'], + }); + }); + + it('--- alone followed by a non-YAML line is rejected (falls through)', () => { + const input = `--- +this is garbage not yaml +more stuff +`; + const result = extractGenericErrors(input); + + // YAML preservation path would set summary to 'Command failed - see output'. + // Falling through to keyword filter with no error keywords yields 'No errors detected'. + expect(result.summary).toBe('No errors detected'); + }); + + it('closing ... terminates the block', () => { + const input = `--- +key: value +... 
+after the ellipsis line +`; + expectErrorSummary(extractGenericErrors(input), { + has: ['---', 'key: value', '...'], + missing: ['after the ellipsis line'], + }); + }); + + it('VAT-style output (no ---) still goes through the multi-key heuristic', () => { + const vatYamlOutput = `status: failed +filesScanned: 248 +filesWithErrors: 1 +errorsFound: 1 +errorSummary: + broken_file: 1 +durationSecs: 1.45 +validationMode: strict +collections: + adrs: + resourceCount: 5 + hasSchema: true + validationMode: permissive +errors: + - file: /fixtures/repo/CLAUDE.md + errors: + - line: 28 + column: 1 + type: broken_file + message: "Link target is a directory: /fixtures/repo/docs/teams" +`; + const result = extractGenericErrors(vatYamlOutput); + expect(result.summary).toBe('Command failed - see output'); + expectErrorSummary(result, { + has: ['status: failed', 'filesScanned: 248', '/fixtures/repo/CLAUDE.md', 'line: 28'], + }); + }); + }); + + describe('Non-YAML output still keyword-filtered', () => { + it('should not include non-error log lines for free-form logs', () => { + const pytestLog = `Loading config from /etc/pytest.ini +Collecting tests +Loading plugin: coverage +Starting test session +FAILED tests/test_foo.py::test_divide - ZeroDivisionError +FAILED tests/test_bar.py::test_validate - AssertionError +2 failed, 3 passed +Done`; + + const result = extractGenericErrors(pytestLog); + + expect(result.errorSummary).toContain('FAILED'); + expect(result.errorSummary).not.toContain('Loading config'); + expect(result.errorSummary).not.toContain('Loading plugin'); + }); + }); + describe('Plugin Samples', () => { it('should pass all registered samples', () => { for (const sample of genericExtractor.samples) { diff --git a/packages/extractors/src/extractors/generic/index.ts b/packages/extractors/src/extractors/generic/index.ts index 6943a4fa..49889cf6 100644 --- a/packages/extractors/src/extractors/generic/index.ts +++ b/packages/extractors/src/extractors/generic/index.ts @@ -9,6 
/**
 * Max lines retained when preserving structured YAML output.
 * Higher than the keyword-filter cap (20) because structured data needs
 * surrounding context to remain meaningful.
 */
const YAML_MAX_LINES = 80;

/** Column-0 `key:` followed by whitespace or end of line. */
const TOP_LEVEL_YAML_KEY = /^[A-Za-z_][\w-]*:(?:\s|$)/;

// Anchored at `^` so that legitimate YAML containing these tokens inside a
// string value (e.g. `message: "Traceback in user code"`) is not mistaken for
// a free-form log.
const LOG_INDICATORS = [/^Traceback \(most recent/, /^npm ERR!/];

/**
 * Per-alternative regexes for YAML line classification.
 *
 * Each pattern is anchored at `^`, uses only bounded repetition over fixed
 * character classes, and has no nested quantifiers or ambiguous alternation,
 * so each is O(line length) and ReDoS-safe. They are kept as individual
 * literals (rather than one combined alternation) so each stays small enough
 * to audit by eye and to satisfy `sonarjs/regex-complexity`.
 *
 * Key and sequence patterns end with `(?:\s|$)` rather than `(?: |$)`: input
 * is split on `\n` only, so lines from CRLF output still carry a trailing
 * `\r`, and a bare `key:\r` must be recognised just like `key:`. This also
 * matches the terminator used by `TOP_LEVEL_YAML_KEY`.
 */
const YAML_BLANK_LINE = /^\s*$/; // empty / whitespace-only line
const YAML_COMMENT_LINE = /^\s*#/; // comment
const YAML_DOC_MARKER = /^(?:---|\.\.\.)\s*$/; // column-0 document start/end marker
const YAML_INDENTED_LINE = /^[ \t]+\S/; // indented content (continuation / nested key / sequence value)
const YAML_ROOT_SEQUENCE = /^-(?:\s|$)/; // root-level sequence item ("- foo" or bare "-")
const YAML_PLAIN_ROOT_KEY = /^[A-Za-z_][\w.-]*\s*:(?:\s|$)/; // plain root key ("name:" or "name: value")
const YAML_QUOTED_ROOT_KEY = /^(?:"[^"\n]*"|'[^'\n]*')\s*:(?:\s|$)/; // quoted root key

/**
 * True if the line is content-bearing YAML at the root level: a plain or
 * quoted mapping key, or a root sequence item.
 *
 * @param line - A single line of output (no `\n`; may end in `\r`).
 */
function isYamlContentLine(line: string): boolean {
  return (
    YAML_ROOT_SEQUENCE.test(line) ||
    YAML_PLAIN_ROOT_KEY.test(line) ||
    YAML_QUOTED_ROOT_KEY.test(line)
  );
}

/**
 * True if the line "looks like YAML" structure: blank, comment, document
 * marker, indented content, or root-level content.
 *
 * @param line - A single line of output (no `\n`; may end in `\r`).
 */
function isYamlStructuralLine(line: string): boolean {
  return (
    YAML_BLANK_LINE.test(line) ||
    YAML_COMMENT_LINE.test(line) ||
    YAML_DOC_MARKER.test(line) ||
    YAML_INDENTED_LINE.test(line) ||
    isYamlContentLine(line)
  );
}

/**
 * Verifies that the first non-blank, non-comment line at or after
 * `startIndex` is a content-bearing YAML line (key or root sequence item).
 *
 * @param lines - All output lines.
 * @param startIndex - Index of the first line to inspect (normally the line
 *   right after the opening `---`).
 * @returns `false` when no such line exists or when it is not YAML content.
 */
function hasYamlContentAfter(lines: string[], startIndex: number): boolean {
  const firstSignificant = lines
    .slice(startIndex)
    .find((line) => !YAML_BLANK_LINE.test(line) && !YAML_COMMENT_LINE.test(line));
  return firstSignificant !== undefined && isYamlContentLine(firstSignificant);
}

/**
 * Collects lines starting at `openingIndex` (the opening `---`) forward,
 * stopping inclusively at a closing document marker (`---` / `...`) or
 * exclusively before the first line that is not YAML-structural.
 *
 * Only column-0 markers close the block. An indented `...` or `---` can only
 * be nested content (for example a line inside a block scalar), so it is kept
 * as part of the document instead of truncating it.
 *
 * @param lines - All output lines.
 * @param openingIndex - Index of the opening `---` line.
 * @returns The retained lines, opening marker first; an empty array when
 *   `openingIndex` does not address a line.
 */
function collectYamlBlockLines(lines: string[], openingIndex: number): string[] {
  const opening = lines[openingIndex];
  // Out-of-range index: nothing to collect (avoids emitting `[undefined]`).
  if (opening === undefined) return [];

  const kept: string[] = [opening];
  for (const line of lines.slice(openingIndex + 1)) {
    if (YAML_DOC_MARKER.test(line)) {
      kept.push(line);
      return kept;
    }
    if (!isYamlStructuralLine(line)) return kept;
    kept.push(line);
  }
  return kept;
}
/**
 * Finds a `---`-bracketed YAML block embedded in noisy output and returns
 * only that block: the opening `---` through the closing document marker, or
 * up to (not including) the first non-YAML line.
 *
 * @param lines - All output lines.
 * @returns The block's lines, or `null` when there is no `---` line, when the
 *   first significant line after it is not YAML content, or when fewer than
 *   three lines were collected.
 */
function extractDelimitedYamlBlock(lines: string[]): string[] | null {
  const start = lines.findIndex((candidate) => candidate.trim() === '---');
  const hasBody = start >= 0 && hasYamlContentAfter(lines, start + 1);
  if (!hasBody) return null;

  const block = collectYamlBlockLines(lines, start);
  return block.length >= 3 ? block : null;
}

/**
 * Heuristic: does this output look like structured YAML rather than a
 * free-form log?
 *
 * Free-form logs occasionally contain `key:` fragments, so the output must
 * have at least 3 top-level keys and at least 5 meaningful (non-blank,
 * non-noise) lines, and must not contain any log-line giveaway: a
 * `LOG_INDICATORS` match, an `at /...` stack frame, or a caret-only line.
 *
 * @param lines - All output lines.
 */
function looksLikeYaml(lines: string[]): boolean {
  const isLogGiveaway = (text: string): boolean =>
    LOG_INDICATORS.some((indicator) => indicator.test(text)) ||
    /^\s*at \//.test(text) ||
    /^\s*\^+\s*$/.test(text);
  const isNoise = (text: string): boolean =>
    NOISE_PATTERNS.some((noise) => noise.test(text));

  let rootKeys = 0;
  let meaningful = 0;

  for (const text of lines) {
    if (text.trim() === '') continue;

    // Giveaways are checked before the noise filter: a line can be both a
    // noise pattern and a log indicator (e.g. `npm ERR!`), and it must still
    // disqualify the whole output.
    if (isLogGiveaway(text)) return false;
    if (isNoise(text)) continue;

    meaningful += 1;
    if (TOP_LEVEL_YAML_KEY.test(text)) rootKeys += 1;
  }

  return rootKeys >= 3 && meaningful >= 5;
}

/**
 * Detection for the generic extractor. It ignores its input and always
 * reports the same lowest-priority fallback result.
 *
 * @param _output - Command output (unused).
 */
export function detectGeneric(_output: string): DetectionResult {
  const fallback: DetectionResult = {
    confidence: 10, // Lowest priority
    patterns: [GENERIC_FALLBACK_PATTERN],
    reason: 'Fallback extractor for unknown formats',
  };
  return fallback;
}
/**
 * Detection metadata shared by the YAML-preservation and keyword-filter
 * result paths. The confidence value mirrors the one returned by
 * `detectGeneric`.
 */
function genericDetectionMetadata() {
  const detection = {
    extractor: 'generic',
    confidence: 10,
    patterns: [GENERIC_FALLBACK_PATTERN],
    reason: 'Fallback extractor',
  };
  return detection;
}

/**
 * Builds the extractor result for output that is being preserved as YAML.
 *
 * Lines matching a noise pattern are dropped (blank lines are always kept),
 * and the remainder is capped at `YAML_MAX_LINES`. When the cap is hit, a
 * final `# ...` comment line reports how many lines were omitted.
 *
 * @param lines - The YAML lines to preserve.
 * @returns A result with no parsed errors and the preserved text in
 *   `errorSummary`.
 */
function buildYamlResult(lines: string[]): ErrorExtractorResult {
  const isNoise = (text: string): boolean =>
    NOISE_PATTERNS.some((noise) => noise.test(text));
  const retained = lines.filter((text) => text.trim() === '' || !isNoise(text));

  let errorSummary: string;
  if (retained.length > YAML_MAX_LINES) {
    const head = retained.slice(0, YAML_MAX_LINES).join('\n');
    const omitted = retained.length - YAML_MAX_LINES;
    // The marker starts with `# ` so it stays a YAML comment rather than a
    // stray document-end marker (`...`) that would terminate parsing.
    errorSummary = `${head}\n# ... (truncated, ${omitted} additional lines omitted)`;
  } else {
    errorSummary = retained.join('\n');
  }

  return {
    errors: [],
    summary: 'Command failed - see output',
    totalErrors: 0,
    guidance: 'Review the output above and fix the errors',
    errorSummary,
    metadata: {
      detection: genericDetectionMetadata(),
      confidence: 50,
      completeness: 50,
      issues: [],
    },
  };
}
'Review the output above and fix the errors' : '', errorSummary, metadata: { - detection: { - extractor: 'generic', - confidence: 10, - patterns: ['Generic fallback'], - reason: 'Fallback extractor', - }, + detection: genericDetectionMetadata(), confidence: 50, completeness: 50, issues: [],