Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -88,3 +88,4 @@ hypnoscript-docs/static/install.sh
*.dylib
*.wasm
*.wat
hypnoscript-compiler/hypnoscript_output*
Binary file removed hypnoscript-compiler/hypnoscript_output
Binary file not shown.
96 changes: 96 additions & 0 deletions hypnoscript-lexer-parser/src/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,8 @@ impl Lexer {
'r' => string.push('\r'),
'\\' => string.push('\\'),
'"' => string.push('"'),
'u' => string.push(self.read_hex_escape(4, "\\u")?),
'x' => string.push(self.read_hex_escape(2, "\\x")?),
_ => string.push(escaped),
}
}
Expand All @@ -451,6 +453,62 @@ impl Lexer {
Err(format!("Unterminated string at line {}", self.line))
}

/// Reads a fixed-width hexadecimal escape sequence from the current position.
///
/// `digits` is the number of hexadecimal digits consumed after the escape
/// prefix (the string-literal reader passes 4 for `\uXXXX` and 2 for `\xXX`).
/// `escape_prefix` is only used to label error messages.
///
/// # Errors
///
/// Returns a descriptive message when:
/// - the input ends before `digits` characters were read,
/// - a consumed character is not an ASCII hexadecimal digit,
/// - the digits decode to a UTF-16 surrogate (U+D800 to U+DFFF), or
/// - the digits do not form a valid Unicode scalar value at all, including
///   the case where they do not fit in a `u32`.
fn read_hex_escape(&mut self, digits: usize, escape_prefix: &str) -> Result<char, String> {
    let mut hex = String::with_capacity(digits);

    for _ in 0..digits {
        if self.is_at_end() {
            return Err(format!(
                "Unterminated {} escape at line {}, column {}",
                escape_prefix, self.line, self.column
            ));
        }

        let digit = self.advance();
        if !digit.is_ascii_hexdigit() {
            // NOTE(review): `column - 1` presumably points at the offending
            // character, assuming `advance` moves `column` forward by one.
            return Err(format!(
                "Invalid {} escape digit '{}' at line {}, column {}",
                escape_prefix,
                digit,
                self.line,
                self.column.saturating_sub(1)
            ));
        }

        hex.push(digit);
    }

    // Shared builder for every "not a scalar value" failure; `detail` is
    // appended verbatim after the position.
    let scalar_error = |detail: &str| {
        format!(
            "Invalid Unicode scalar value for {} escape '{}' at line {}, column {}{}",
            escape_prefix,
            hex,
            self.line,
            self.column.saturating_sub(digits),
            detail
        )
    };

    // Every character was validated as a hex digit, so parsing can only fail
    // when `hex` is empty or too wide for a `u32`. Report that as an error
    // instead of panicking.
    let value = u32::from_str_radix(&hex, 16).map_err(|_| scalar_error(""))?;

    if (0xD800..=0xDFFF).contains(&value) {
        return Err(scalar_error(
            ": surrogate code points (U+D800 to U+DFFF) are not valid scalar values",
        ));
    }

    char::from_u32(value).ok_or_else(|| scalar_error(""))
}

fn keyword_or_identifier(&self, s: &str) -> (TokenType, String) {
if let Some(definition) = TokenType::keyword_definition(s) {
(definition.token, definition.canonical_lexeme.to_string())
Expand Down Expand Up @@ -480,6 +538,44 @@ mod tests {
assert_eq!(tokens[0].lexeme, "Hello, World!");
}

#[test]
fn test_string_literal_unicode_escapes() {
    // Each case pairs a quoted source literal with the lexeme it must decode to:
    // first the `\uXXXX` form, then the `\xXX` form.
    let cases = [
        (r#""\u0041\u0042\u0043""#, "ABC"),
        (r#""Hello\x20World\x21""#, "Hello World!"),
    ];

    for (source, expected_lexeme) in cases {
        let mut lexer = Lexer::new(source);
        let tokens = lexer.lex().unwrap();
        let first = &tokens[0];

        assert_eq!(first.token_type, TokenType::StringLiteral);
        assert_eq!(first.lexeme, expected_lexeme);
    }
}

#[test]
fn test_string_literal_invalid_unicode_escape() {
    // `G` is not a hexadecimal digit, so lexing the `\u` escape must fail.
    let source = r#""\u12G4""#;
    let mut bad_escape_lexer = Lexer::new(source);

    let message = bad_escape_lexer.lex().unwrap_err();

    assert!(message.contains("Invalid \\u escape digit"));
}

#[test]
fn test_string_literal_unterminated_unicode_escape() {
    // Inputs that end in the middle of an escape, paired with the error text
    // the lexer is expected to report for them.
    let cases = [
        ("\"\\u12", "Unterminated \\u escape"),
        ("\"\\x4", "Unterminated \\x escape"),
    ];

    for (source, expected_fragment) in cases {
        let mut lexer = Lexer::new(source);
        let message = lexer.lex().unwrap_err();
        assert!(message.contains(expected_fragment));
    }
}

#[test]
fn test_string_literal_invalid_unicode_scalar_escape() {
    // U+D800 is a UTF-16 surrogate and therefore not a valid `char`.
    let source = r#""\uD800""#;
    let mut surrogate_lexer = Lexer::new(source);

    let message = surrogate_lexer.lex().unwrap_err();

    assert!(message.contains("Invalid Unicode scalar value"));
}

#[test]
fn test_operator_synonym_tokenization() {
let mut lexer = Lexer::new("if (a youAreFeelingVerySleepy b) { }");
Expand Down
14 changes: 12 additions & 2 deletions hypnoscript-runtime/src/string_builtins.rs
Original file line number Diff line number Diff line change
Expand Up @@ -199,13 +199,13 @@ impl StringBuiltins {

/// Pads `s` on the left with `pad_char` until it is `total_width` characters wide.
///
/// Width is measured in Unicode scalar values (`char`s), not bytes, so
/// multi-byte characters count as one column each. If `s` already has
/// `total_width` or more characters it is returned unchanged.
pub fn pad_left(s: &str, total_width: usize, pad_char: char) -> String {
    // `saturating_sub` yields zero padding when `s` is already wide enough.
    let padding = total_width.saturating_sub(s.chars().count());
    format!("{}{}", pad_char.to_string().repeat(padding), s)
}

/// Pads `s` on the right with `pad_char` until it is `total_width` characters wide.
///
/// Width is measured in Unicode scalar values (`char`s), not bytes, so
/// multi-byte characters count as one column each. If `s` already has
/// `total_width` or more characters it is returned unchanged.
pub fn pad_right(s: &str, total_width: usize, pad_char: char) -> String {
    // `saturating_sub` yields zero padding when `s` is already wide enough.
    let padding = total_width.saturating_sub(s.chars().count());
    format!("{}{}", s, pad_char.to_string().repeat(padding))
}

Expand Down Expand Up @@ -435,4 +435,14 @@ mod tests {
let lines = StringBuiltins::wrap_text(text, 20);
assert!(lines.iter().all(|line| line.chars().count() <= 20));
}

#[test]
fn test_padding_is_unicode_aware() {
    // ASCII baseline: five characters padded to ten columns.
    assert_eq!(StringBuiltins::pad_left("hello", 10, '-'), "-----hello");
    assert_eq!(StringBuiltins::pad_right("hello", 10, '-'), "hello-----");

    // A four-byte emoji is a single character, so it needs three pad characters.
    assert_eq!(StringBuiltins::pad_left("🎯", 4, '-'), "---🎯");
    assert_eq!(StringBuiltins::pad_right("🎯", 4, '-'), "🎯---");

    // "café" with a precomposed é (U+00E9) is four characters but five bytes,
    // so padding to six columns must add exactly two spaces. The escape keeps
    // the character count unambiguous regardless of source normalization.
    assert_eq!(StringBuiltins::pad_left("caf\u{e9}", 6, ' '), "  caf\u{e9}");
    assert_eq!(StringBuiltins::pad_right("caf\u{e9}", 6, ' '), "caf\u{e9}  ");
}
}
Loading