From f3efa92856ad1434d24de69c37288dd602d74712 Mon Sep 17 00:00:00 2001
From: Matjaz Domen Pecan <matjaz.pecan@gmail.com>
Date: Fri, 17 Apr 2026 15:10:28 +0200
Subject: [PATCH] refactor(sexp): table-drive ANSI-C escape dispatch

process_ansi_c_content shrinks from ~140 lines to a 20-line loop that
delegates to per-escape-kind helpers:

- simple_escape(esc) -> Option<char>: const fn match over 1-char escapes
  (n/t/r/a/b/f/v/e/E/backslash/doublequote)
- handle_escape: dispatcher returning bool (true on NUL truncation)
- handle_hex: \xNN (up to 2 hex digits, 0x80+ -> U+FFFD, 0x01/0x7F CTLESC)
- handle_unicode: \uNNNN (width=4) and \UNNNNNNNN (width=8) share one helper
- handle_octal: \N (first octal digit) plus up to 2 more octal digits;
  NUL truncates, CTLESC for 0x01/0x7F
- handle_control: \cX -> chr(X & 0x1F); \c@ silently dropped
- push_with_ctlesc: centralises 0x01/0x7F prefix logic (previously duped
  across hex and octal branches)
- push_escaped_quote: inlines the 4-char output for escaped single quote;
  removes the unnecessary recursive process_ansi_c_continue (the
  recursion was semantically equivalent to letting the outer loop
  continue)

Drops the #[allow(clippy::too_many_lines)] attribute. Decoding is
byte-identical: all parable corpus tests (including the 325-line
24_ansi_c_quoting.tests) and both ansi_c oracle test files pass.

Part of #61 (v0.2.0 cycle).

Closes #56

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/sexp/ansi_c.rs | 283 ++++++++++++++++++++++++---------------------
 1 file changed, 149 insertions(+), 134 deletions(-)
diff --git a/src/sexp/ansi_c.rs b/src/sexp/ansi_c.rs
index 990a191..cc36b02 100644
--- a/src/sexp/ansi_c.rs
+++ b/src/sexp/ansi_c.rs
@@ -11,154 +11,169 @@
 /// `chars` is the full character array, `pos` points to the first char after `$'`.
 /// Returns the processed content (without surrounding quotes).
 /// Advances `pos` past the closing `'`.
-#[allow(clippy::too_many_lines)]
 pub fn process_ansi_c_content(chars: &[char], pos: &mut usize) -> String {
     let mut out = String::new();
     while *pos < chars.len() {
         let c = chars[*pos];
         if c == '\'' {
-            *pos += 1; // skip closing '
+            *pos += 1;
             return out;
         }
-        if c == '\\' && *pos + 1 < chars.len() {
-            *pos += 1;
-            let esc = chars[*pos];
-            *pos += 1;
-            match esc {
-                'n' => out.push('\n'),
-                't' => out.push('\t'),
-                'r' => out.push('\r'),
-                'a' => out.push('\x07'),
-                'b' => out.push('\x08'),
-                'f' => out.push('\x0C'),
-                'v' => out.push('\x0B'),
-                'e' | 'E' => out.push('\x1B'),
-                '\\' => out.push('\\'),
-                'c' => {
-                    // Control character: \cX → chr(X & 0x1F)
-                    if *pos < chars.len() {
-                        let ctrl = chars[*pos];
-                        *pos += 1;
-                        let val = (ctrl as u32) & 0x1F;
-                        if val > 0
-                            && let Some(ch) = char::from_u32(val)
-                        {
-                            out.push(ch);
-                        }
-                        // \c@ or val==0 → NUL, which is dropped
-                    } else {
-                        // \c at end of string — output literal \c
-                        out.push('\\');
-                        out.push('c');
-                    }
-                }
-                '\'' => {
-                    // Escaped single quote: output as '\\''
-                    out.push('\'');
-                    out.push('\\');
-                    out.push('\'');
-                    out.push('\'');
-                    return process_ansi_c_continue(chars, pos, out);
-                }
-                '"' => out.push('"'),
-                'x' => {
-                    // Hex escape: \xNN — if no valid hex digits, output literal \x
-                    let before = *pos;
-                    let hex = read_hex(chars, pos, 2);
-                    if *pos == before {
-                        // No hex digits consumed — output literal \x
-                        out.push('\\');
-                        out.push('x');
-                    } else if hex == 0 {
-                        // NUL byte truncates the string
-                        skip_to_closing_quote(chars, pos);
-                        return out;
-                    } else if hex > 0x7F {
-                        // High bytes are invalid standalone UTF-8 — replacement char
-                        out.push('\u{FFFD}');
-                    } else if let Some(ch) = char::from_u32(hex) {
-                        // Bash prefixes CTLESC (0x01) and CTLNUL (0x7F) with
-                        // CTLESC in its internal representation
-                        if ch == '\x01' || ch == '\x7F' {
-                            out.push('\x01');
-                        }
-                        out.push(ch);
-                    }
-                }
-                'u' => {
-                    // Unicode: \uNNNN — if no hex digits, output literal \u
-                    let before = *pos;
-                    let val = read_hex(chars, pos, 4);
-                    if *pos == before {
-                        out.push('\\');
-                        out.push('u');
-                    } else if val > 0
-                        && let Some(ch) = char::from_u32(val)
-                    {
-                        out.push(ch);
-                    }
-                    // val==0 with digits → NUL, truncate
-                    else if val == 0 && *pos > before {
-                        skip_to_closing_quote(chars, pos);
-                        return out;
-                    }
-                }
-                'U' => {
-                    // Unicode long: \UNNNNNNNN — if no hex digits, output literal \U
-                    let before = *pos;
-                    let val = read_hex(chars, pos, 8);
-                    if *pos == before {
-                        out.push('\\');
-                        out.push('U');
-                    } else if val > 0
-                        && let Some(ch) = char::from_u32(val)
-                    {
-                        out.push(ch);
-                    }
-                    // val==0 with digits → NUL, truncate
-                    else if val == 0 && *pos > before {
-                        skip_to_closing_quote(chars, pos);
-                        return out;
-                    }
-                }
-                '0'..='7' => {
-                    // Octal escape — NUL terminates the string (bash behavior)
-                    let mut val = u32::from(esc as u8 - b'0');
-                    for _ in 0..2 {
-                        if *pos < chars.len() && chars[*pos] >= '0' && chars[*pos] <= '7' {
-                            val = val * 8 + u32::from(chars[*pos] as u8 - b'0');
-                            *pos += 1;
-                        }
-                    }
-                    if val == 0 {
-                        skip_to_closing_quote(chars, pos);
-                        return out;
-                    }
-                    if let Some(ch) = char::from_u32(val) {
-                        if ch == '\x01' || ch == '\x7F' {
-                            out.push('\x01');
-                        }
-                        out.push(ch);
-                    }
-                }
-                _ => {
-                    out.push('\\');
-                    out.push(esc);
-                }
-            }
-        } else {
+        if c != '\\' || *pos + 1 >= chars.len() {
             out.push(c);
             *pos += 1;
+            continue;
+        }
+        *pos += 1;
+        let esc = chars[*pos];
+        *pos += 1;
+        if handle_escape(chars, pos, esc, &mut out) {
+            // NUL-valued escape truncated the string; skip_to_closing_quote
+            // was already called by the specific handler.
+            return out;
         }
     }
     out
 }
 
-/// Continue processing after an escaped quote split.
-fn process_ansi_c_continue(chars: &[char], pos: &mut usize, mut out: String) -> String {
-    // After \' we output '\\'' and need to continue in a new quote context
-    out.push_str(&process_ansi_c_content(chars, pos));
-    out
+/// Dispatch a single escape sequence. Returns `true` when the sequence
+/// NUL-truncated the quoted string so the caller can exit the loop.
+fn handle_escape(chars: &[char], pos: &mut usize, esc: char, out: &mut String) -> bool {
+    if let Some(ch) = simple_escape(esc) {
+        out.push(ch);
+        return false;
+    }
+    match esc {
+        'c' => handle_control(chars, pos, out),
+        '\'' => push_escaped_quote(out),
+        'x' => return handle_hex(chars, pos, out),
+        'u' => return handle_unicode(chars, pos, out, 4),
+        'U' => return handle_unicode(chars, pos, out, 8),
+        '0'..='7' => return handle_octal(chars, pos, esc, out),
+        _ => {
+            out.push('\\');
+            out.push(esc);
+        }
+    }
+    false
+}
+
+/// Table of single-character escapes. Returns `None` for escapes that
+/// need context (hex/unicode/octal/control) or are unrecognised.
+const fn simple_escape(esc: char) -> Option<char> {
+    Some(match esc {
+        'n' => '\n',
+        't' => '\t',
+        'r' => '\r',
+        'a' => '\x07',
+        'b' => '\x08',
+        'f' => '\x0C',
+        'v' => '\x0B',
+        'e' | 'E' => '\x1B',
+        '\\' => '\\',
+        '"' => '"',
+        _ => return None,
+    })
+}
+
+/// `\xNN` — up to 2 hex digits. Returns `true` on NUL truncation.
+fn handle_hex(chars: &[char], pos: &mut usize, out: &mut String) -> bool {
+    let before = *pos;
+    let hex = read_hex(chars, pos, 2);
+    if *pos == before {
+        out.push('\\');
+        out.push('x');
+        return false;
+    }
+    if hex == 0 {
+        skip_to_closing_quote(chars, pos);
+        return true;
+    }
+    if hex > 0x7F {
+        // High bytes are invalid standalone UTF-8 — replacement char.
+        out.push('\u{FFFD}');
+    } else if let Some(ch) = char::from_u32(hex) {
+        push_with_ctlesc(out, ch);
+    }
+    false
+}
+
+/// `\uNNNN` (width=4) or `\UNNNNNNNN` (width=8). Returns `true` on NUL
+/// truncation.
+fn handle_unicode(chars: &[char], pos: &mut usize, out: &mut String, width: usize) -> bool {
+    let before = *pos;
+    let val = read_hex(chars, pos, width);
+    if *pos == before {
+        out.push('\\');
+        out.push(if width == 4 { 'u' } else { 'U' });
+        return false;
+    }
+    if val == 0 {
+        skip_to_closing_quote(chars, pos);
+        return true;
+    }
+    if let Some(ch) = char::from_u32(val) {
+        out.push(ch);
+    }
+    false
+}
+
+/// `\0`–`\7` followed by up to 2 additional octal digits. Returns `true`
+/// on NUL truncation.
+fn handle_octal(chars: &[char], pos: &mut usize, first: char, out: &mut String) -> bool {
+    let mut val = u32::from(first as u8 - b'0');
+    for _ in 0..2 {
+        if *pos < chars.len() && chars[*pos] >= '0' && chars[*pos] <= '7' {
+            val = val * 8 + u32::from(chars[*pos] as u8 - b'0');
+            *pos += 1;
+        }
+    }
+    if val == 0 {
+        skip_to_closing_quote(chars, pos);
+        return true;
+    }
+    if let Some(ch) = char::from_u32(val) {
+        push_with_ctlesc(out, ch);
+    }
+    false
+}
+
+/// `\cX` — emits `chr(X & 0x1F)`. `\c@` (value 0) is silently dropped,
+/// matching the existing behavior (not a NUL-truncation case).
+fn handle_control(chars: &[char], pos: &mut usize, out: &mut String) {
+    if *pos >= chars.len() {
+        // `\c` at end of input — output literal backslash + c.
+        out.push('\\');
+        out.push('c');
+        return;
+    }
+    let ctrl = chars[*pos];
+    *pos += 1;
+    let val = (ctrl as u32) & 0x1F;
+    if val > 0
+        && let Some(ch) = char::from_u32(val)
+    {
+        out.push(ch);
+    }
+}
+
+/// Pushes `ch` to `out`, prefixing `0x01` (CTLESC) for bytes that bash
+/// escapes internally (`0x01` and `0x7F`).
+fn push_with_ctlesc(out: &mut String, ch: char) {
+    if ch == '\x01' || ch == '\x7F' {
+        out.push('\x01');
+    }
+    out.push(ch);
+}
+
+/// `\'` inside `$'...'` expands to the 4-character sequence `'\''` — close
+/// the current quote, escape a single quote, reopen. The outer loop
+/// continues from the next character; no recursive re-entry needed.
+fn push_escaped_quote(out: &mut String) {
+    out.push('\'');
+    out.push('\\');
+    out.push('\'');
+    out.push('\'');
 }
 
 /// Read up to `max` hex digits from chars at pos.