From f3efa92856ad1434d24de69c37288dd602d74712 Mon Sep 17 00:00:00 2001 From: Matjaz Domen Pecan Date: Fri, 17 Apr 2026 15:10:28 +0200 Subject: [PATCH] refactor(sexp): table-drive ANSI-C escape dispatch process_ansi_c_content shrinks from ~140 lines to a 20-line loop that delegates to per-escape-kind helpers: - simple_escape(esc) -> Option<char>: const table of 1-char escapes (n/t/r/a/b/f/v/e/E/backslash/doublequote) - handle_escape: dispatcher returning bool (true on NUL truncation) - handle_hex: \xNN (up to 2 hex digits, 0x80+ -> U+FFFD, 0x01/0x7F CTLESC) - handle_unicode: \uNNNN (width=4) and \UNNNNNNNN (width=8) share one helper - handle_octal: octal escapes followed by up to 2 octal digits, CTLESC for 0x01/0x7F - handle_control: \cX -> chr(X & 0x1F); \c@ silently dropped - push_with_ctlesc: centralises 0x01/0x7F prefix logic (previously duped across hex and octal branches) - push_escaped_quote: inlines the 4-char output for escaped single quote; removes the unnecessary recursive process_ansi_c_continue (the recursion was semantically equivalent to letting the outer loop continue) Drops the #[allow(clippy::too_many_lines)] attribute. Decoding is byte-identical: all parable corpus tests (including the 325-line 24_ansi_c_quoting.tests) and both ansi_c oracle test files pass. Part of #61 (v0.2.0 cycle). Closes #56 Co-Authored-By: Claude Opus 4.7 (1M context) --- src/sexp/ansi_c.rs | 283 ++++++++++++++++++++++++--------------------- 1 file changed, 149 insertions(+), 134 deletions(-) diff --git a/src/sexp/ansi_c.rs b/src/sexp/ansi_c.rs index 990a191..cc36b02 100644 --- a/src/sexp/ansi_c.rs +++ b/src/sexp/ansi_c.rs @@ -11,154 +11,169 @@ /// `chars` is the full character array, `pos` points to the first char after `$'`. /// Returns the processed content (without surrounding quotes). /// Advances `pos` past the closing `'`. 
-#[allow(clippy::too_many_lines)] pub fn process_ansi_c_content(chars: &[char], pos: &mut usize) -> String { let mut out = String::new(); while *pos < chars.len() { let c = chars[*pos]; if c == '\'' { - *pos += 1; // skip closing ' + *pos += 1; return out; } - if c == '\\' && *pos + 1 < chars.len() { - *pos += 1; - let esc = chars[*pos]; - *pos += 1; - match esc { - 'n' => out.push('\n'), - 't' => out.push('\t'), - 'r' => out.push('\r'), - 'a' => out.push('\x07'), - 'b' => out.push('\x08'), - 'f' => out.push('\x0C'), - 'v' => out.push('\x0B'), - 'e' | 'E' => out.push('\x1B'), - '\\' => out.push('\\'), - 'c' => { - // Control character: \cX → chr(X & 0x1F) - if *pos < chars.len() { - let ctrl = chars[*pos]; - *pos += 1; - let val = (ctrl as u32) & 0x1F; - if val > 0 - && let Some(ch) = char::from_u32(val) - { - out.push(ch); - } - // \c@ or val==0 → NUL, which is dropped - } else { - // \c at end of string — output literal \c - out.push('\\'); - out.push('c'); - } - } - '\'' => { - // Escaped single quote: output as '\\'' - out.push('\''); - out.push('\\'); - out.push('\''); - out.push('\''); - return process_ansi_c_continue(chars, pos, out); - } - '"' => out.push('"'), - 'x' => { - // Hex escape: \xNN — if no valid hex digits, output literal \x - let before = *pos; - let hex = read_hex(chars, pos, 2); - if *pos == before { - // No hex digits consumed — output literal \x - out.push('\\'); - out.push('x'); - } else if hex == 0 { - // NUL byte truncates the string - skip_to_closing_quote(chars, pos); - return out; - } else if hex > 0x7F { - // High bytes are invalid standalone UTF-8 — replacement char - out.push('\u{FFFD}'); - } else if let Some(ch) = char::from_u32(hex) { - // Bash prefixes CTLESC (0x01) and CTLNUL (0x7F) with - // CTLESC in its internal representation - if ch == '\x01' || ch == '\x7F' { - out.push('\x01'); - } - out.push(ch); - } - } - 'u' => { - // Unicode: \uNNNN — if no hex digits, output literal \u - let before = *pos; - let val = 
read_hex(chars, pos, 4); - if *pos == before { - out.push('\\'); - out.push('u'); - } else if val > 0 - && let Some(ch) = char::from_u32(val) - { - out.push(ch); - } - // val==0 with digits → NUL, truncate - else if val == 0 && *pos > before { - skip_to_closing_quote(chars, pos); - return out; - } - } - 'U' => { - // Unicode long: \UNNNNNNNN — if no hex digits, output literal \U - let before = *pos; - let val = read_hex(chars, pos, 8); - if *pos == before { - out.push('\\'); - out.push('U'); - } else if val > 0 - && let Some(ch) = char::from_u32(val) - { - out.push(ch); - } - // val==0 with digits → NUL, truncate - else if val == 0 && *pos > before { - skip_to_closing_quote(chars, pos); - return out; - } - } - '0'..='7' => { - // Octal escape — NUL terminates the string (bash behavior) - let mut val = u32::from(esc as u8 - b'0'); - for _ in 0..2 { - if *pos < chars.len() && chars[*pos] >= '0' && chars[*pos] <= '7' { - val = val * 8 + u32::from(chars[*pos] as u8 - b'0'); - *pos += 1; - } - } - if val == 0 { - skip_to_closing_quote(chars, pos); - return out; - } - if let Some(ch) = char::from_u32(val) { - if ch == '\x01' || ch == '\x7F' { - out.push('\x01'); - } - out.push(ch); - } - } - _ => { - out.push('\\'); - out.push(esc); - } - } - } else { + if c != '\\' || *pos + 1 >= chars.len() { out.push(c); *pos += 1; + continue; + } + *pos += 1; + let esc = chars[*pos]; + *pos += 1; + if handle_escape(chars, pos, esc, &mut out) { + // NUL-valued escape truncated the string; skip_to_closing_quote + // was already called by the specific handler. + return out; } } out } -/// Continue processing after an escaped quote split. -fn process_ansi_c_continue(chars: &[char], pos: &mut usize, mut out: String) -> String { - // After \' we output '\\'' and need to continue in a new quote context - out.push_str(&process_ansi_c_content(chars, pos)); - out +/// Dispatch a single escape sequence. 
Returns `true` when the sequence +/// NUL-truncated the quoted string so the caller can exit the loop. +fn handle_escape(chars: &[char], pos: &mut usize, esc: char, out: &mut String) -> bool { + if let Some(ch) = simple_escape(esc) { + out.push(ch); + return false; + } + match esc { + 'c' => handle_control(chars, pos, out), + '\'' => push_escaped_quote(out), + 'x' => return handle_hex(chars, pos, out), + 'u' => return handle_unicode(chars, pos, out, 4), + 'U' => return handle_unicode(chars, pos, out, 8), + '0'..='7' => return handle_octal(chars, pos, esc, out), + _ => { + out.push('\\'); + out.push(esc); + } + } + false +} + +/// Table of single-character escapes. Returns `None` for escapes that +/// need context (hex/unicode/octal/control) or are unrecognised. +const fn simple_escape(esc: char) -> Option<char> { + Some(match esc { + 'n' => '\n', + 't' => '\t', + 'r' => '\r', + 'a' => '\x07', + 'b' => '\x08', + 'f' => '\x0C', + 'v' => '\x0B', + 'e' | 'E' => '\x1B', + '\\' => '\\', + '"' => '"', + _ => return None, + }) +} + +/// `\xNN` — up to 2 hex digits. Returns `true` on NUL truncation. +fn handle_hex(chars: &[char], pos: &mut usize, out: &mut String) -> bool { + let before = *pos; + let hex = read_hex(chars, pos, 2); + if *pos == before { + out.push('\\'); + out.push('x'); + return false; + } + if hex == 0 { + skip_to_closing_quote(chars, pos); + return true; + } + if hex > 0x7F { + // High bytes are invalid standalone UTF-8 — replacement char. + out.push('\u{FFFD}'); + } else if let Some(ch) = char::from_u32(hex) { + push_with_ctlesc(out, ch); + } + false +} + +/// `\uNNNN` (width=4) or `\UNNNNNNNN` (width=8). Returns `true` on NUL +/// truncation. 
+fn handle_unicode(chars: &[char], pos: &mut usize, out: &mut String, width: usize) -> bool { + let before = *pos; + let val = read_hex(chars, pos, width); + if *pos == before { + out.push('\\'); + out.push(if width == 4 { 'u' } else { 'U' }); + return false; + } + if val == 0 { + skip_to_closing_quote(chars, pos); + return true; + } + if let Some(ch) = char::from_u32(val) { + out.push(ch); + } + false +} + +/// `\0`–`\7` followed by up to 2 additional octal digits. Returns `true` +/// on NUL truncation. +fn handle_octal(chars: &[char], pos: &mut usize, first: char, out: &mut String) -> bool { + let mut val = u32::from(first as u8 - b'0'); + for _ in 0..2 { + if *pos < chars.len() && chars[*pos] >= '0' && chars[*pos] <= '7' { + val = val * 8 + u32::from(chars[*pos] as u8 - b'0'); + *pos += 1; + } + } + if val == 0 { + skip_to_closing_quote(chars, pos); + return true; + } + if let Some(ch) = char::from_u32(val) { + push_with_ctlesc(out, ch); + } + false +} + +/// `\cX` — emits `chr(X & 0x1F)`. `\c@` (value 0) is silently dropped, +/// matching the existing behavior (not a NUL-truncation case). +fn handle_control(chars: &[char], pos: &mut usize, out: &mut String) { + if *pos >= chars.len() { + // `\c` at end of input — output literal backslash + c. + out.push('\\'); + out.push('c'); + return; + } + let ctrl = chars[*pos]; + *pos += 1; + let val = (ctrl as u32) & 0x1F; + if val > 0 + && let Some(ch) = char::from_u32(val) + { + out.push(ch); + } +} + +/// Pushes `ch` to `out`, prefixing `0x01` (CTLESC) for bytes that bash +/// escapes internally (`0x01` and `0x7F`). +fn push_with_ctlesc(out: &mut String, ch: char) { + if ch == '\x01' || ch == '\x7F' { + out.push('\x01'); + } + out.push(ch); +} + +/// `\'` inside `$'...'` expands to the 4-character sequence `'\''` — close +/// the current quote, escape a single quote, reopen. The outer loop +/// continues from the next character; no recursive re-entry needed. 
+fn push_escaped_quote(out: &mut String) { + out.push('\''); + out.push('\\'); + out.push('\''); + out.push('\''); } /// Read up to `max` hex digits from chars at pos.