diff --git a/src/lib.rs b/src/lib.rs index a1c12b2..b3c6987 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -124,6 +124,7 @@ fn tn_normalize_for_lang(input: &str, lang: &str) -> String { let input = input.trim(); match lang { + "en" => tn_normalize(input), "fr" => tn_normalize_lang_fr(input), "es" => tn_normalize_lang_es(input), "de" => tn_normalize_lang_de(input), @@ -381,6 +382,9 @@ fn tn_parse_span_lang(span: &str, lang: &str) -> Option<(String, u8)> { } match lang { + "en" => { + try_lang_taggers!(tts::en); + } "fr" => { try_lang_taggers!(tts::fr); } @@ -569,34 +573,34 @@ pub fn normalize_sentence_with_max_span(input: &str, max_span_tokens: usize) -> pub fn tn_normalize(input: &str) -> String { let input = input.trim(); - if let Some(result) = tts::whitelist::parse(input) { + if let Some(result) = tts::en::whitelist::parse(input) { return result; } - if let Some(result) = tts::money::parse(input) { + if let Some(result) = tts::en::money::parse(input) { return result; } - if let Some(result) = tts::measure::parse(input) { + if let Some(result) = tts::en::measure::parse(input) { return result; } - if let Some(result) = tts::date::parse(input) { + if let Some(result) = tts::en::date::parse(input) { return result; } - if let Some(result) = tts::time::parse(input) { + if let Some(result) = tts::en::time::parse(input) { return result; } - if let Some(result) = tts::electronic::parse(input) { + if let Some(result) = tts::en::electronic::parse(input) { return result; } - if let Some(result) = tts::telephone::parse(input) { + if let Some(result) = tts::en::telephone::parse(input) { return result; } - if let Some(result) = tts::ordinal::parse(input) { + if let Some(result) = tts::en::ordinal::parse(input) { return result; } - if let Some(result) = tts::decimal::parse(input) { + if let Some(result) = tts::en::decimal::parse(input) { return result; } - if let Some(result) = tts::cardinal::parse(input) { + if let Some(result) = tts::en::cardinal::parse(input) { return result; } @@ -611,34 +615,34 @@ fn tn_parse_span(span: &str) -> Option<(String, u8)> { return None; } - if let Some(result) = tts::whitelist::parse(span) { + if let Some(result) = tts::en::whitelist::parse(span) { return Some((result, 100)); } - if let Some(result) = tts::money::parse(span) { + if let Some(result) = tts::en::money::parse(span) { return Some((result, 95)); } - if let Some(result) = tts::measure::parse(span) { + if let Some(result) = tts::en::measure::parse(span) { return Some((result, 90)); } - if let Some(result) = tts::date::parse(span) { + if let Some(result) = tts::en::date::parse(span) { return Some((result, 88)); } - if let Some(result) = tts::time::parse(span) { + if let Some(result) = tts::en::time::parse(span) { return Some((result, 85)); } - if let Some(result) = tts::electronic::parse(span) { + if let Some(result) = tts::en::electronic::parse(span) { return Some((result, 82)); } - if let Some(result) = tts::telephone::parse(span) { + if let Some(result) = tts::en::telephone::parse(span) { return Some((result, 78)); } - if let Some(result) = tts::ordinal::parse(span) { + if let Some(result) = tts::en::ordinal::parse(span) { return Some((result, 75)); } - if let Some(result) = tts::decimal::parse(span) { + if let Some(result) = tts::en::decimal::parse(span) { return Some((result, 73)); } - if let Some(result) = tts::cardinal::parse(span) { + if let Some(result) = tts::en::cardinal::parse(span) { return Some((result, 70)); } diff --git a/src/tts/de/decimal.rs b/src/tts/de/decimal.rs index 3d8d940..983e252 100644 --- a/src/tts/de/decimal.rs +++ b/src/tts/de/decimal.rs @@ -3,7 +3,10 @@ //! Converts written decimal numbers to spoken German: //! - "3,14" → "drei komma eins vier" //! - "0,5" → "null komma fuenf" -//! - "3.14" → "drei komma eins vier" +//! - "-3,14" → "minus drei komma eins vier" +//! +//! German uses comma (,) as the decimal separator. +//! Period (.) is used as thousands separator in cardinal numbers. use super::{number_to_words, spell_digits}; @@ -30,14 +33,11 @@ pub fn parse(input: &str) -> Option { // Check for quantity suffix: "1,5 milliarden" let (number_part, suffix) = extract_suffix(trimmed); - // German uses comma as decimal separator, but also accept period - let sep = if number_part.contains(',') && !number_part.contains('.') { - ',' - } else if number_part.contains('.') { - '.' - } else { + // German uses comma as decimal separator (period is thousands separator) + if !number_part.contains(',') { return None; - }; + } + let sep = ','; let parts: Vec<&str> = number_part.splitn(2, sep).collect(); if parts.len() != 2 { @@ -106,11 +106,6 @@ mod tests { assert_eq!(parse("0,5"), Some("null komma fuenf".to_string())); } - #[test] - fn test_period_decimal() { - assert_eq!(parse("3.14"), Some("drei komma eins vier".to_string())); - } - #[test] fn test_negative() { assert_eq!( diff --git a/src/tts/de/money.rs b/src/tts/de/money.rs index ba5d657..091736e 100644 --- a/src/tts/de/money.rs +++ b/src/tts/de/money.rs @@ -169,12 +169,10 @@ fn parse_amount(amount_str: &str, currency: &Currency) -> Option { return None; } - // Determine decimal separator: German uses comma - let sep = if amount_str.contains(',') { ',' } else { '.' }; - - if amount_str.contains(sep) && sep != '.' || amount_str.contains('.') { - let actual_sep = if amount_str.contains(',') { ',' } else { '.' }; - let parts: Vec<&str> = amount_str.splitn(2, actual_sep).collect(); + // Check for decimal separator (comma or period) + if amount_str.contains(',') || amount_str.contains('.') { + let sep = if amount_str.contains(',') { ',' } else { '.' }; + let parts: Vec<&str> = amount_str.splitn(2, sep).collect(); if parts.len() == 2 { let int_clean: String = parts[0].chars().filter(|c| c.is_ascii_digit()).collect(); let dollars: i64 = if int_clean.is_empty() { diff --git a/src/tts/cardinal.rs b/src/tts/en/cardinal.rs similarity index 100% rename from src/tts/cardinal.rs rename to src/tts/en/cardinal.rs diff --git a/src/tts/date.rs b/src/tts/en/date.rs similarity index 100% rename from src/tts/date.rs rename to src/tts/en/date.rs diff --git a/src/tts/decimal.rs b/src/tts/en/decimal.rs similarity index 100% rename from src/tts/decimal.rs rename to src/tts/en/decimal.rs diff --git a/src/tts/electronic.rs b/src/tts/en/electronic.rs similarity index 100% rename from src/tts/electronic.rs rename to src/tts/en/electronic.rs diff --git a/src/tts/measure.rs b/src/tts/en/measure.rs similarity index 100% rename from src/tts/measure.rs rename to src/tts/en/measure.rs diff --git a/src/tts/en/mod.rs b/src/tts/en/mod.rs new file mode 100644 index 0000000..e8258a9 --- /dev/null +++ b/src/tts/en/mod.rs @@ -0,0 +1,212 @@ +//! Text Normalization taggers for English. +//! +//! Converts written-form text to spoken English: +//! - "200" → "two hundred" +//! - "$5.50" → "five dollars and fifty cents" +//! - "January 5, 2025" → "january fifth twenty twenty five" + +pub mod cardinal; +pub mod date; +pub mod decimal; +pub mod electronic; +pub mod measure; +pub mod money; +pub mod ordinal; +pub mod telephone; +pub mod time; +pub mod whitelist; + +/// Ones words indexed by value (0..20). +const ONES: [&str; 20] = [ + "zero", + "one", + "two", + "three", + "four", + "five", + "six", + "seven", + "eight", + "nine", + "ten", + "eleven", + "twelve", + "thirteen", + "fourteen", + "fifteen", + "sixteen", + "seventeen", + "eighteen", + "nineteen", +]; + +/// Tens words indexed by tens digit (2..10 → index 0..8). +const TENS: [&str; 8] = [ + "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety", +]; + +/// Convert an integer to English words. +/// +/// Examples: +/// - `0` → `"zero"` +/// - `21` → `"twenty one"` +/// - `123` → `"one hundred twenty three"` +/// - `1000` → `"one thousand"` +/// - `-42` → `"minus forty two"` +pub fn number_to_words(n: i64) -> String { + if n == 0 { + return "zero".to_string(); + } + + if n < 0 { + // Use wrapping negation and cast to u64 to handle i64::MIN safely, + // since -i64::MIN overflows i64 but fits in u64. + let abs_val = (n as u64).wrapping_neg(); + let mut parts: Vec = Vec::new(); + let mut remaining = abs_val; + + let scales: &[(u64, &str)] = &[ + (1_000_000_000_000_000_000, "quintillion"), + (1_000_000_000_000_000, "quadrillion"), + (1_000_000_000_000, "trillion"), + (1_000_000_000, "billion"), + (1_000_000, "million"), + (1_000, "thousand"), + ]; + + for &(scale_value, scale_name) in scales { + if remaining >= scale_value { + let chunk = remaining / scale_value; + remaining %= scale_value; + parts.push(format!("{} {}", chunk_to_words(chunk as u32), scale_name)); + } + } + + if remaining > 0 { + parts.push(chunk_to_words(remaining as u32)); + } + + return format!("minus {}", parts.join(" ")); + } + + let mut parts: Vec = Vec::new(); + let mut remaining = n as u64; + + // Process scale groups from largest to smallest + let scales: &[(u64, &str)] = &[ + (1_000_000_000_000_000_000, "quintillion"), + (1_000_000_000_000_000, "quadrillion"), + (1_000_000_000_000, "trillion"), + (1_000_000_000, "billion"), + (1_000_000, "million"), + (1_000, "thousand"), + ]; + + for &(scale_value, scale_name) in scales { + if remaining >= scale_value { + let chunk = remaining / scale_value; + remaining %= scale_value; + parts.push(format!("{} {}", chunk_to_words(chunk as u32), scale_name)); + } + } + + // Remainder (0..999) + if remaining > 0 { + parts.push(chunk_to_words(remaining as u32)); + } + + parts.join(" ") +} + +/// Convert a number 1..999 to words. +fn chunk_to_words(n: u32) -> String { + debug_assert!(n > 0 && n < 1000); + let mut parts: Vec<&str> = Vec::new(); + + let hundreds = n / 100; + let rest = n % 100; + + if hundreds > 0 { + parts.push(ONES[hundreds as usize]); + parts.push("hundred"); + } + + if rest >= 20 { + let tens_idx = (rest / 10 - 2) as usize; + parts.push(TENS[tens_idx]); + let ones = rest % 10; + if ones > 0 { + parts.push(ONES[ones as usize]); + } + } else if rest > 0 { + parts.push(ONES[rest as usize]); + } + + parts.join(" ") +} + +/// Spell each digit of a string individually. +/// +/// "14" → "one four" +pub fn spell_digits(s: &str) -> String { + s.chars() + .filter_map(|c| c.to_digit(10).map(|d| ONES[d as usize])) + .collect::>() + .join(" ") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_number_to_words_basic() { + assert_eq!(number_to_words(0), "zero"); + assert_eq!(number_to_words(1), "one"); + assert_eq!(number_to_words(10), "ten"); + assert_eq!(number_to_words(11), "eleven"); + assert_eq!(number_to_words(19), "nineteen"); + assert_eq!(number_to_words(20), "twenty"); + assert_eq!(number_to_words(21), "twenty one"); + assert_eq!(number_to_words(99), "ninety nine"); + } + + #[test] + fn test_number_to_words_hundreds() { + assert_eq!(number_to_words(100), "one hundred"); + assert_eq!(number_to_words(101), "one hundred one"); + assert_eq!(number_to_words(123), "one hundred twenty three"); + assert_eq!(number_to_words(999), "nine hundred ninety nine"); + } + + #[test] + fn test_number_to_words_thousands() { + assert_eq!(number_to_words(1000), "one thousand"); + assert_eq!(number_to_words(1001), "one thousand one"); + assert_eq!( + number_to_words(1234), + "one thousand two hundred thirty four" + ); + assert_eq!(number_to_words(10000), "ten thousand"); + assert_eq!(number_to_words(100000), "one hundred thousand"); + } + + #[test] + fn test_number_to_words_millions() { + assert_eq!(number_to_words(1000000), "one million"); + assert_eq!(number_to_words(2000003), "two million three"); + } + + #[test] + fn test_number_to_words_negative() { + assert_eq!(number_to_words(-42), "minus forty two"); + assert_eq!(number_to_words(-1000), "minus one thousand"); + } + + #[test] + fn test_spell_digits() { + assert_eq!(spell_digits("14"), "one four"); + assert_eq!(spell_digits("0"), "zero"); + assert_eq!(spell_digits("987"), "nine eight seven"); + } +} diff --git a/src/tts/money.rs b/src/tts/en/money.rs similarity index 100% rename from src/tts/money.rs rename to src/tts/en/money.rs diff --git a/src/tts/ordinal.rs b/src/tts/en/ordinal.rs similarity index 100% rename from src/tts/ordinal.rs rename to src/tts/en/ordinal.rs diff --git a/src/tts/telephone.rs b/src/tts/en/telephone.rs similarity index 100% rename from src/tts/telephone.rs rename to src/tts/en/telephone.rs diff --git a/src/tts/time.rs b/src/tts/en/time.rs similarity index 100% rename from src/tts/time.rs rename to src/tts/en/time.rs diff --git a/src/tts/whitelist.rs b/src/tts/en/whitelist.rs similarity index 100% rename from src/tts/whitelist.rs rename to src/tts/en/whitelist.rs diff --git a/src/tts/es/money.rs b/src/tts/es/money.rs index 3d8605f..1e8bb4a 100644 --- a/src/tts/es/money.rs +++ b/src/tts/es/money.rs @@ -172,12 +172,10 @@ fn parse_amount(amount_str: &str, currency: &Currency) -> Option { return None; } - // Determine decimal separator: Spanish uses comma, but accept period too - let sep = if amount_str.contains(',') { ',' } else { '.' }; - - if amount_str.contains(sep) && sep != '.' || amount_str.contains('.') { - let actual_sep = if amount_str.contains(',') { ',' } else { '.' }; - let parts: Vec<&str> = amount_str.splitn(2, actual_sep).collect(); + // Check for decimal separator (comma or period) + if amount_str.contains(',') || amount_str.contains('.') { + let sep = if amount_str.contains(',') { ',' } else { '.' }; + let parts: Vec<&str> = amount_str.splitn(2, sep).collect(); if parts.len() == 2 { let int_clean: String = parts[0].chars().filter(|c| c.is_ascii_digit()).collect(); let dollars: i64 = if int_clean.is_empty() { diff --git a/src/tts/fr/money.rs b/src/tts/fr/money.rs index 13b2ec8..4007b03 100644 --- a/src/tts/fr/money.rs +++ b/src/tts/fr/money.rs @@ -173,12 +173,10 @@ fn parse_amount(amount_str: &str, currency: &Currency) -> Option { return None; } - // Determine decimal separator: French uses comma - let sep = if amount_str.contains(',') { ',' } else { '.' }; - - if amount_str.contains(sep) && sep != '.' || amount_str.contains('.') { - let actual_sep = if amount_str.contains(',') { ',' } else { '.' }; - let parts: Vec<&str> = amount_str.splitn(2, actual_sep).collect(); + // Check for decimal separator (comma or period) + if amount_str.contains(',') || amount_str.contains('.') { + let sep = if amount_str.contains(',') { ',' } else { '.' }; + let parts: Vec<&str> = amount_str.splitn(2, sep).collect(); if parts.len() == 2 { let int_clean: String = parts[0].chars().filter(|c| c.is_ascii_digit()).collect(); let dollars: i64 = if int_clean.is_empty() { diff --git a/src/tts/mod.rs b/src/tts/mod.rs index 41fea87..46c2848 100644 --- a/src/tts/mod.rs +++ b/src/tts/mod.rs @@ -7,217 +7,11 @@ //! //! Supports multiple languages via submodules. -// English (default) -pub mod cardinal; -pub mod date; -pub mod decimal; -pub mod electronic; -pub mod measure; -pub mod money; -pub mod ordinal; -pub mod telephone; -pub mod time; -pub mod whitelist; - -// Additional languages +// Languages pub mod de; +pub mod en; pub mod es; pub mod fr; pub mod hi; pub mod ja; pub mod zh; - -/// Ones words indexed by value (0..20). -const ONES: [&str; 20] = [ - "zero", - "one", - "two", - "three", - "four", - "five", - "six", - "seven", - "eight", - "nine", - "ten", - "eleven", - "twelve", - "thirteen", - "fourteen", - "fifteen", - "sixteen", - "seventeen", - "eighteen", - "nineteen", -]; - -/// Tens words indexed by tens digit (2..10 → index 0..8). -const TENS: [&str; 8] = [ - "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety", -]; - -/// Convert an integer to English words. -/// -/// Examples: -/// - `0` → `"zero"` -/// - `21` → `"twenty one"` -/// - `123` → `"one hundred twenty three"` -/// - `1000` → `"one thousand"` -/// - `-42` → `"minus forty two"` -pub fn number_to_words(n: i64) -> String { - if n == 0 { - return "zero".to_string(); - } - - if n < 0 { - // Use wrapping negation and cast to u64 to handle i64::MIN safely, - // since -i64::MIN overflows i64 but fits in u64. - let abs_val = (n as u64).wrapping_neg(); - let mut parts: Vec = Vec::new(); - let mut remaining = abs_val; - - let scales: &[(u64, &str)] = &[ - (1_000_000_000_000_000_000, "quintillion"), - (1_000_000_000_000_000, "quadrillion"), - (1_000_000_000_000, "trillion"), - (1_000_000_000, "billion"), - (1_000_000, "million"), - (1_000, "thousand"), - ]; - - for &(scale_value, scale_name) in scales { - if remaining >= scale_value { - let chunk = remaining / scale_value; - remaining %= scale_value; - parts.push(format!("{} {}", chunk_to_words(chunk as u32), scale_name)); - } - } - - if remaining > 0 { - parts.push(chunk_to_words(remaining as u32)); - } - - return format!("minus {}", parts.join(" ")); - } - - let mut parts: Vec = Vec::new(); - let mut remaining = n as u64; - - // Process scale groups from largest to smallest - let scales: &[(u64, &str)] = &[ - (1_000_000_000_000_000_000, "quintillion"), - (1_000_000_000_000_000, "quadrillion"), - (1_000_000_000_000, "trillion"), - (1_000_000_000, "billion"), - (1_000_000, "million"), - (1_000, "thousand"), - ]; - - for &(scale_value, scale_name) in scales { - if remaining >= scale_value { - let chunk = remaining / scale_value; - remaining %= scale_value; - parts.push(format!("{} {}", chunk_to_words(chunk as u32), scale_name)); - } - } - - // Remainder (0..999) - if remaining > 0 { - parts.push(chunk_to_words(remaining as u32)); - } - - parts.join(" ") -} - -/// Convert a number 1..999 to words. -fn chunk_to_words(n: u32) -> String { - debug_assert!(n > 0 && n < 1000); - let mut parts: Vec<&str> = Vec::new(); - - let hundreds = n / 100; - let rest = n % 100; - - if hundreds > 0 { - parts.push(ONES[hundreds as usize]); - parts.push("hundred"); - } - - if rest >= 20 { - let tens_idx = (rest / 10 - 2) as usize; - parts.push(TENS[tens_idx]); - let ones = rest % 10; - if ones > 0 { - parts.push(ONES[ones as usize]); - } - } else if rest > 0 { - parts.push(ONES[rest as usize]); - } - - parts.join(" ") -} - -/// Spell each digit of a string individually. -/// -/// "14" → "one four" -pub fn spell_digits(s: &str) -> String { - s.chars() - .filter_map(|c| c.to_digit(10).map(|d| ONES[d as usize])) - .collect::>() - .join(" ") -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_number_to_words_basic() { - assert_eq!(number_to_words(0), "zero"); - assert_eq!(number_to_words(1), "one"); - assert_eq!(number_to_words(10), "ten"); - assert_eq!(number_to_words(11), "eleven"); - assert_eq!(number_to_words(19), "nineteen"); - assert_eq!(number_to_words(20), "twenty"); - assert_eq!(number_to_words(21), "twenty one"); - assert_eq!(number_to_words(99), "ninety nine"); - } - - #[test] - fn test_number_to_words_hundreds() { - assert_eq!(number_to_words(100), "one hundred"); - assert_eq!(number_to_words(101), "one hundred one"); - assert_eq!(number_to_words(123), "one hundred twenty three"); - assert_eq!(number_to_words(999), "nine hundred ninety nine"); - } - - #[test] - fn test_number_to_words_thousands() { - assert_eq!(number_to_words(1000), "one thousand"); - assert_eq!(number_to_words(1001), "one thousand one"); - assert_eq!( - number_to_words(1234), - "one thousand two hundred thirty four" - ); - assert_eq!(number_to_words(10000), "ten thousand"); - assert_eq!(number_to_words(100000), "one hundred thousand"); - } - - #[test] - fn test_number_to_words_millions() { - assert_eq!(number_to_words(1000000), "one million"); - assert_eq!(number_to_words(2000003), "two million three"); - } - - #[test] - fn test_number_to_words_negative() { - assert_eq!(number_to_words(-42), "minus forty two"); - assert_eq!(number_to_words(-1000), "minus one thousand"); - } - - #[test] - fn test_spell_digits() { - assert_eq!(spell_digits("14"), "one four"); - assert_eq!(spell_digits("0"), "zero"); - assert_eq!(spell_digits("987"), "nine eight seven"); - } -} diff --git a/tests/extensive_tests.rs b/tests/extensive_tests.rs index e03c5a1..fb89620 100644 --- a/tests/extensive_tests.rs +++ b/tests/extensive_tests.rs @@ -1453,7 +1453,7 @@ fn test_number_to_words_i64_min() { // through the telephone tagger (the "-" is treated as a separator). // i64::MIN = -9223372036854775808: negating overflows i64 but our fix // uses wrapping_neg + u64 to handle it safely. - use text_processing_rs::tts::number_to_words; + use text_processing_rs::tts::en::number_to_words; let result = number_to_words(i64::MIN); assert!(