diff --git a/src/ffi.rs b/src/ffi.rs index 772f774..137b389 100644 --- a/src/ffi.rs +++ b/src/ffi.rs @@ -5,7 +5,8 @@ use std::ptr; use crate::{ custom_rules, normalize, normalize_sentence, normalize_sentence_with_max_span, tn_normalize, - tn_normalize_sentence, tn_normalize_sentence_with_max_span, + tn_normalize_lang, tn_normalize_sentence, tn_normalize_sentence_lang, + tn_normalize_sentence_with_max_span, tn_normalize_sentence_with_max_span_lang, }; /// Normalize spoken-form text to written form. @@ -248,6 +249,106 @@ pub unsafe extern "C" fn nemo_tn_normalize_sentence_with_max_span( } } +// ── Language-aware TN FFI ────────────────────────────────────────────── + +/// Normalize written-form text to spoken form for a specific language. +/// +/// Supported language codes: "en", "fr", "es", "de", "zh", "hi", "ja". +/// Falls back to English for unrecognized codes. +/// +/// # Safety +/// - `input` and `lang` must be valid null-terminated UTF-8 strings +/// - Returns a newly allocated string that must be freed with `nemo_free_string` +#[no_mangle] +pub unsafe extern "C" fn nemo_tn_normalize_lang( + input: *const c_char, + lang: *const c_char, +) -> *mut c_char { + if input.is_null() || lang.is_null() { + return ptr::null_mut(); + } + + let input_str = match CStr::from_ptr(input).to_str() { + Ok(s) => s, + Err(_) => return ptr::null_mut(), + }; + let lang_str = match CStr::from_ptr(lang).to_str() { + Ok(s) => s, + Err(_) => return ptr::null_mut(), + }; + + let result = tn_normalize_lang(input_str, lang_str); + + match CString::new(result) { + Ok(c_string) => c_string.into_raw(), + Err(_) => ptr::null_mut(), + } +} + +/// Normalize a full sentence (TN) for a specific language. 
+/// +/// # Safety +/// - `input` and `lang` must be valid null-terminated UTF-8 strings +/// - Returns a newly allocated string that must be freed with `nemo_free_string` +#[no_mangle] +pub unsafe extern "C" fn nemo_tn_normalize_sentence_lang( + input: *const c_char, + lang: *const c_char, +) -> *mut c_char { + if input.is_null() || lang.is_null() { + return ptr::null_mut(); + } + + let input_str = match CStr::from_ptr(input).to_str() { + Ok(s) => s, + Err(_) => return ptr::null_mut(), + }; + let lang_str = match CStr::from_ptr(lang).to_str() { + Ok(s) => s, + Err(_) => return ptr::null_mut(), + }; + + let result = tn_normalize_sentence_lang(input_str, lang_str); + + match CString::new(result) { + Ok(c_string) => c_string.into_raw(), + Err(_) => ptr::null_mut(), + } +} + +/// Normalize a full sentence (TN) for a specific language with configurable max span. +/// +/// # Safety +/// - `input` and `lang` must be valid null-terminated UTF-8 strings +/// - Returns a newly allocated string that must be freed with `nemo_free_string` +#[no_mangle] +pub unsafe extern "C" fn nemo_tn_normalize_sentence_with_max_span_lang( + input: *const c_char, + lang: *const c_char, + max_span_tokens: u32, +) -> *mut c_char { + if input.is_null() || lang.is_null() { + return ptr::null_mut(); + } + + let input_str = match CStr::from_ptr(input).to_str() { + Ok(s) => s, + Err(_) => return ptr::null_mut(), + }; + let lang_str = match CStr::from_ptr(lang).to_str() { + Ok(s) => s, + Err(_) => return ptr::null_mut(), + }; + + let result = + tn_normalize_sentence_with_max_span_lang(input_str, lang_str, max_span_tokens as usize); + + match CString::new(result) { + Ok(c_string) => c_string.into_raw(), + Err(_) => ptr::null_mut(), + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/lib.rs b/src/lib.rs index dd867b4..a1c12b2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -111,10 +111,302 @@ pub fn normalize(input: &str) -> String { /// Normalize with language selection (future use). 
pub fn normalize_with_lang(input: &str, _lang: &str) -> String { - // TODO: Language-specific taggers + // TODO: Language-specific ITN taggers normalize(input) } +// ── Multi-language TN helpers ────────────────────────────────────────── + +/// Try TN taggers for a specific language. +/// +/// Each language module provides: money, measure, date, time, ordinal, decimal, cardinal. +fn tn_normalize_for_lang(input: &str, lang: &str) -> String { + let input = input.trim(); + + match lang { + "fr" => tn_normalize_lang_fr(input), + "es" => tn_normalize_lang_es(input), + "de" => tn_normalize_lang_de(input), + "zh" => tn_normalize_lang_zh(input), + "hi" => tn_normalize_lang_hi(input), + "ja" => tn_normalize_lang_ja(input), + _ => tn_normalize(input), + } +} + +fn tn_normalize_lang_fr(input: &str) -> String { + if let Some(r) = tts::fr::whitelist::parse(input) { + return r; + } + if let Some(r) = tts::fr::money::parse(input) { + return r; + } + if let Some(r) = tts::fr::measure::parse(input) { + return r; + } + if let Some(r) = tts::fr::date::parse(input) { + return r; + } + if let Some(r) = tts::fr::time::parse(input) { + return r; + } + if let Some(r) = tts::fr::electronic::parse(input) { + return r; + } + if let Some(r) = tts::fr::telephone::parse(input) { + return r; + } + if let Some(r) = tts::fr::ordinal::parse(input) { + return r; + } + if let Some(r) = tts::fr::decimal::parse(input) { + return r; + } + if let Some(r) = tts::fr::cardinal::parse(input) { + return r; + } + input.to_string() +} + +fn tn_normalize_lang_es(input: &str) -> String { + if let Some(r) = tts::es::whitelist::parse(input) { + return r; + } + if let Some(r) = tts::es::money::parse(input) { + return r; + } + if let Some(r) = tts::es::measure::parse(input) { + return r; + } + if let Some(r) = tts::es::date::parse(input) { + return r; + } + if let Some(r) = tts::es::time::parse(input) { + return r; + } + if let Some(r) = tts::es::electronic::parse(input) { + return r; + } + if let Some(r) = 
tts::es::telephone::parse(input) { + return r; + } + if let Some(r) = tts::es::ordinal::parse(input) { + return r; + } + if let Some(r) = tts::es::decimal::parse(input) { + return r; + } + if let Some(r) = tts::es::cardinal::parse(input) { + return r; + } + input.to_string() +} + +fn tn_normalize_lang_de(input: &str) -> String { + if let Some(r) = tts::de::whitelist::parse(input) { + return r; + } + if let Some(r) = tts::de::money::parse(input) { + return r; + } + if let Some(r) = tts::de::measure::parse(input) { + return r; + } + if let Some(r) = tts::de::date::parse(input) { + return r; + } + if let Some(r) = tts::de::time::parse(input) { + return r; + } + if let Some(r) = tts::de::electronic::parse(input) { + return r; + } + if let Some(r) = tts::de::telephone::parse(input) { + return r; + } + if let Some(r) = tts::de::ordinal::parse(input) { + return r; + } + if let Some(r) = tts::de::decimal::parse(input) { + return r; + } + if let Some(r) = tts::de::cardinal::parse(input) { + return r; + } + input.to_string() +} + +fn tn_normalize_lang_zh(input: &str) -> String { + if let Some(r) = tts::zh::whitelist::parse(input) { + return r; + } + if let Some(r) = tts::zh::money::parse(input) { + return r; + } + if let Some(r) = tts::zh::measure::parse(input) { + return r; + } + if let Some(r) = tts::zh::date::parse(input) { + return r; + } + if let Some(r) = tts::zh::time::parse(input) { + return r; + } + if let Some(r) = tts::zh::electronic::parse(input) { + return r; + } + if let Some(r) = tts::zh::telephone::parse(input) { + return r; + } + if let Some(r) = tts::zh::ordinal::parse(input) { + return r; + } + if let Some(r) = tts::zh::decimal::parse(input) { + return r; + } + if let Some(r) = tts::zh::cardinal::parse(input) { + return r; + } + input.to_string() +} + +fn tn_normalize_lang_hi(input: &str) -> String { + if let Some(r) = tts::hi::whitelist::parse(input) { + return r; + } + if let Some(r) = tts::hi::money::parse(input) { + return r; + } + if let Some(r) = 
tts::hi::measure::parse(input) { + return r; + } + if let Some(r) = tts::hi::date::parse(input) { + return r; + } + if let Some(r) = tts::hi::time::parse(input) { + return r; + } + if let Some(r) = tts::hi::electronic::parse(input) { + return r; + } + if let Some(r) = tts::hi::telephone::parse(input) { + return r; + } + if let Some(r) = tts::hi::ordinal::parse(input) { + return r; + } + if let Some(r) = tts::hi::decimal::parse(input) { + return r; + } + if let Some(r) = tts::hi::cardinal::parse(input) { + return r; + } + input.to_string() +} + +fn tn_normalize_lang_ja(input: &str) -> String { + if let Some(r) = tts::ja::whitelist::parse(input) { + return r; + } + if let Some(r) = tts::ja::money::parse(input) { + return r; + } + if let Some(r) = tts::ja::measure::parse(input) { + return r; + } + if let Some(r) = tts::ja::date::parse(input) { + return r; + } + if let Some(r) = tts::ja::time::parse(input) { + return r; + } + if let Some(r) = tts::ja::electronic::parse(input) { + return r; + } + if let Some(r) = tts::ja::telephone::parse(input) { + return r; + } + if let Some(r) = tts::ja::ordinal::parse(input) { + return r; + } + if let Some(r) = tts::ja::decimal::parse(input) { + return r; + } + if let Some(r) = tts::ja::cardinal::parse(input) { + return r; + } + input.to_string() +} + +/// TN parse span for a specific language. +fn tn_parse_span_lang(span: &str, lang: &str) -> Option<(String, u8)> { + if span.is_empty() { + return None; + } + + macro_rules! 
try_lang_taggers { + ($mod:path) => {{ + use $mod as lang; + if let Some(r) = lang::whitelist::parse(span) { + return Some((r, 100)); + } + if let Some(r) = lang::money::parse(span) { + return Some((r, 95)); + } + if let Some(r) = lang::measure::parse(span) { + return Some((r, 90)); + } + if let Some(r) = lang::date::parse(span) { + return Some((r, 88)); + } + if let Some(r) = lang::time::parse(span) { + return Some((r, 85)); + } + if let Some(r) = lang::electronic::parse(span) { + return Some((r, 82)); + } + if let Some(r) = lang::telephone::parse(span) { + return Some((r, 78)); + } + if let Some(r) = lang::ordinal::parse(span) { + return Some((r, 75)); + } + if let Some(r) = lang::decimal::parse(span) { + return Some((r, 73)); + } + if let Some(r) = lang::cardinal::parse(span) { + return Some((r, 70)); + } + }}; + } + + match lang { + "fr" => { + try_lang_taggers!(tts::fr); + } + "es" => { + try_lang_taggers!(tts::es); + } + "de" => { + try_lang_taggers!(tts::de); + } + "zh" => { + try_lang_taggers!(tts::zh); + } + "hi" => { + try_lang_taggers!(tts::hi); + } + "ja" => { + try_lang_taggers!(tts::ja); + } + _ => { + return tn_parse_span(span); + } + } + + None +} + /// Default maximum token span to consider when scanning a sentence. const DEFAULT_MAX_SPAN_TOKENS: usize = 16; @@ -367,6 +659,97 @@ pub fn tn_normalize_sentence(input: &str) -> String { tn_normalize_sentence_with_max_span(input, DEFAULT_MAX_SPAN_TOKENS) } +/// Normalize written-form text to spoken form for a specific language. +/// +/// Supported languages: "en", "fr", "es", "de", "zh", "hi", "ja". +/// Falls back to English for unrecognized language codes. 
+/// +/// ``` +/// use text_processing_rs::tn_normalize_lang; +/// +/// assert_eq!(tn_normalize_lang("123", "fr"), "cent vingt-trois"); +/// assert_eq!(tn_normalize_lang("123", "en"), "one hundred twenty three"); +/// ``` +pub fn tn_normalize_lang(input: &str, lang: &str) -> String { + tn_normalize_for_lang(input, lang) +} + +/// Normalize a full sentence (TN) for a specific language. +/// +/// Supported languages: "en", "fr", "es", "de", "zh", "hi", "ja". +/// Falls back to English for unrecognized language codes. +pub fn tn_normalize_sentence_lang(input: &str, lang: &str) -> String { + tn_normalize_sentence_with_max_span_lang(input, lang, DEFAULT_MAX_SPAN_TOKENS) +} + +/// Normalize a full sentence (TN) for a specific language with configurable max span. +pub fn tn_normalize_sentence_with_max_span_lang( + input: &str, + lang: &str, + max_span_tokens: usize, +) -> String { + match lang { + "en" | "" => tn_normalize_sentence_with_max_span(input, max_span_tokens), + _ => { + let trimmed = input.trim(); + if trimmed.is_empty() { + return trimmed.to_string(); + } + + let max_span = if max_span_tokens == 0 { + 1 + } else { + max_span_tokens + }; + let tokens: Vec<&str> = trimmed.split_whitespace().collect(); + let mut out: Vec = Vec::with_capacity(tokens.len()); + let mut i = 0usize; + + while i < tokens.len() { + let max_end = usize::min(tokens.len(), i + max_span); + let mut best: Option<(usize, String, u8)> = None; + + for end in (i + 1..=max_end).rev() { + let span = tokens[i..end].join(" "); + let Some((candidate, score)) = tn_parse_span_lang(&span, lang) else { + continue; + }; + + let candidate_trimmed = candidate.trim(); + if candidate_trimmed.is_empty() || candidate_trimmed == span { + continue; + } + + let candidate_len = end - i; + match &best { + None => { + best = Some((end, candidate, score)); + } + Some((best_end, _, best_score)) => { + let best_len = *best_end - i; + if candidate_len > best_len + || (candidate_len == best_len && score > *best_score) + { 
+ best = Some((end, candidate, score)); + } + } + } + } + + if let Some((end, replacement, _)) = best { + out.push(replacement); + i = end; + } else { + out.push(tokens[i].to_string()); + i += 1; + } + } + + out.join(" ") + } + } +} + /// Normalize a full sentence (TN) with a configurable max span size. pub fn tn_normalize_sentence_with_max_span(input: &str, max_span_tokens: usize) -> String { let trimmed = input.trim(); diff --git a/src/tts/de/cardinal.rs b/src/tts/de/cardinal.rs new file mode 100644 index 0000000..5aa3ab1 --- /dev/null +++ b/src/tts/de/cardinal.rs @@ -0,0 +1,82 @@ +//! Cardinal TN tagger for German. +//! +//! Converts written cardinal numbers to spoken German: +//! - "123" → "einhundertdreiundzwanzig" +//! - "-42" → "minus zweiundvierzig" +//! - "1.000" → "eintausend" (dot as thousands separator) + +use super::number_to_words; + +/// Parse a written cardinal number to spoken German words. +pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + if trimmed.is_empty() { + return None; + } + + let (is_negative, digits_part) = if let Some(rest) = trimmed.strip_prefix('-') { + (true, rest) + } else { + (false, trimmed) + }; + + // Must be digits (with optional dots, spaces, or non-breaking spaces as thousands separators) + // German uses dot or space as thousands separator (e.g. "1.000" or "1 000") + if !digits_part + .chars() + .all(|c| c.is_ascii_digit() || c == '.' 
|| c == ' ' || c == '\u{a0}') + { + return None; + } + + if !digits_part.chars().any(|c| c.is_ascii_digit()) { + return None; + } + + // Strip thousands separators + let clean: String = digits_part.chars().filter(|c| c.is_ascii_digit()).collect(); + let n: i64 = clean.parse().ok()?; + + if is_negative { + Some(format!("minus {}", number_to_words(n))) + } else { + Some(number_to_words(n)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + assert_eq!(parse("0"), Some("null".to_string())); + assert_eq!(parse("1"), Some("eins".to_string())); + assert_eq!(parse("21"), Some("einundzwanzig".to_string())); + assert_eq!(parse("100"), Some("einhundert".to_string())); + assert_eq!(parse("123"), Some("einhundertdreiundzwanzig".to_string())); + } + + #[test] + fn test_negative() { + assert_eq!(parse("-42"), Some("minus zweiundvierzig".to_string())); + assert_eq!(parse("-1"), Some("minus eins".to_string())); + assert_eq!(parse("-1000"), Some("minus eintausend".to_string())); + } + + #[test] + fn test_thousands_separator() { + assert_eq!(parse("1.000"), Some("eintausend".to_string())); + assert_eq!( + parse("2.025"), + Some("zweitausendfuenfundzwanzig".to_string()) + ); + } + + #[test] + fn test_non_numbers() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("12abc"), None); + assert_eq!(parse(""), None); + } +} diff --git a/src/tts/de/date.rs b/src/tts/de/date.rs new file mode 100644 index 0000000..b62debc --- /dev/null +++ b/src/tts/de/date.rs @@ -0,0 +1,347 @@ +//! Date TN tagger for German. +//! +//! Converts written date expressions to spoken German: +//! - "5. Januar 2025" → "fuenfter januar zweitausendfuenfundzwanzig" +//! - "05.01.2025" → "fuenfter erster zweitausendfuenfundzwanzig" +//! 
- "January 5, 2025" → "fuenfter januar zweitausendfuenfundzwanzig" + +use super::number_to_words; +use super::ordinal::ordinal_word_ter; + +const MONTHS_DE: &[(&str, &str)] = &[ + ("januar", "januar"), + ("februar", "februar"), + ("maerz", "maerz"), + ("april", "april"), + ("mai", "mai"), + ("juni", "juni"), + ("juli", "juli"), + ("august", "august"), + ("september", "september"), + ("oktober", "oktober"), + ("november", "november"), + ("dezember", "dezember"), +]; + +const MONTHS_EN: &[(&str, u32)] = &[ + ("january", 1), + ("february", 2), + ("march", 3), + ("april", 4), + ("may", 5), + ("june", 6), + ("july", 7), + ("august", 8), + ("september", 9), + ("october", 10), + ("november", 11), + ("december", 12), +]; + +const MONTH_NAMES: &[&str] = &[ + "", + "januar", + "februar", + "maerz", + "april", + "mai", + "juni", + "juli", + "august", + "september", + "oktober", + "november", + "dezember", +]; + +/// Parse a written date to spoken German. +pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + + // Try decade: "1980s" → "die achtziger jahre" + if let Some(result) = parse_decade(trimmed) { + return Some(result); + } + + // Try German format: "5. 
Januar 2025" + if let Some(result) = parse_german_date(trimmed) { + return Some(result); + } + + // Try English month format: "January 5, 2025" + if let Some(result) = parse_english_month_date(trimmed) { + return Some(result); + } + + // Try numeric DD.MM.YYYY (German uses dots) + if let Some(result) = parse_numeric_date(trimmed) { + return Some(result); + } + + None +} + +/// Parse decade: "1980s" → "die achtziger jahre" +fn parse_decade(input: &str) -> Option { + let s = input.strip_suffix('s')?; + if s.len() != 4 || !s.chars().all(|c| c.is_ascii_digit()) { + return None; + } + + let year: u32 = s.parse().ok()?; + if year < 1000 { + return None; + } + + // Must be a round decade (ends in 0) + if year % 10 != 0 { + return None; + } + + // German: "die [year] jahre" + let year_words = number_to_words(year as i64); + Some(format!("die {}er jahre", year_words)) +} + +fn parse_german_date(input: &str) -> Option { + let lower = input.to_lowercase(); + let tokens: Vec<&str> = lower.split_whitespace().collect(); + if tokens.len() < 2 { + return None; + } + + // "5. Januar" or "5. Januar 2025" + // Day must end with "." for German ordinal + let day_str = tokens[0].strip_suffix('.')?; + if !day_str.chars().all(|c| c.is_ascii_digit()) || day_str.is_empty() { + return None; + } + + let day: u32 = day_str.parse().ok()?; + if day == 0 || day > 31 { + return None; + } + + // Find month + let month_name = MONTHS_DE.iter().find(|(name, _)| *name == tokens[1]); + let month_spoken = month_name?.1; + + let day_word = ordinal_word_ter(day); + + if tokens.len() >= 3 { + let year_str = + tokens[2].trim_end_matches(|c: char| c == '.' || c == ',' || c == '!' 
|| c == '?'); + if year_str.chars().all(|c| c.is_ascii_digit()) && year_str.len() == 4 { + let year: u32 = year_str.parse().ok()?; + let year_words = verbalize_year(year)?; + return Some(format!("{} {} {}", day_word, month_spoken, year_words)); + } + } + + Some(format!("{} {}", day_word, month_spoken)) +} + +fn parse_english_month_date(input: &str) -> Option { + let lower = input.to_lowercase(); + + let mut month_num = None; + let mut rest = ""; + for &(name, num) in MONTHS_EN { + if let Some(r) = lower.strip_prefix(name) { + if r.is_empty() || r.starts_with(' ') || r.starts_with(',') { + month_num = Some(num); + rest = r.trim_start_matches(|c: char| c == ' ' || c == ','); + break; + } + } + } + + let month_num = month_num?; + if rest.is_empty() { + return None; + } + + let month_name = MONTH_NAMES[month_num as usize]; + + // Parse day + let (day_str, year_part) = if let Some(comma_pos) = rest.find(',') { + (&rest[..comma_pos], Some(rest[comma_pos + 1..].trim())) + } else { + let parts: Vec<&str> = rest.splitn(2, ' ').collect(); + if parts.len() == 2 + && parts[0] + .trim_end_matches("st") + .trim_end_matches("nd") + .trim_end_matches("rd") + .trim_end_matches("th") + .chars() + .all(|c| c.is_ascii_digit()) + { + let year_clean = + parts[1].trim_end_matches(|c: char| c == '.' || c == ',' || c == '!' 
|| c == '?'); + if year_clean.chars().all(|c| c.is_ascii_digit()) && year_clean.len() == 4 { + (parts[0], Some(year_clean)) + } else { + (rest, None) + } + } else { + (rest, None) + } + }; + + let day_digits = day_str + .trim() + .trim_end_matches("st") + .trim_end_matches("nd") + .trim_end_matches("rd") + .trim_end_matches("th"); + + if !day_digits.chars().all(|c| c.is_ascii_digit()) || day_digits.is_empty() { + return None; + } + + let day: u32 = day_digits.parse().ok()?; + if day == 0 || day > 31 { + return None; + } + + let day_word = ordinal_word_ter(day); + + if let Some(year_str) = year_part { + let year_str = year_str + .trim() + .trim_end_matches(|c: char| c == '.' || c == ',' || c == '!' || c == '?'); + if !year_str.is_empty() && year_str.chars().all(|c| c.is_ascii_digit()) { + let year: u32 = year_str.parse().ok()?; + let year_words = verbalize_year(year)?; + return Some(format!("{} {} {}", day_word, month_name, year_words)); + } + } + + Some(format!("{} {}", day_word, month_name)) +} + +/// Parse numeric date DD.MM.YYYY (German convention uses dots). +fn parse_numeric_date(input: &str) -> Option { + let sep = if input.contains('.') && input.chars().filter(|c| *c == '.').count() == 2 { + '.' 
+ } else if input.contains('/') { + '/' + } else if input.contains('-') && input.chars().filter(|c| *c == '-').count() == 2 { + '-' + } else { + return None; + }; + + let parts: Vec<&str> = input.splitn(3, sep).collect(); + if parts.len() != 3 { + return None; + } + + if !parts + .iter() + .all(|p| !p.is_empty() && p.chars().all(|c| c.is_ascii_digit())) + { + return None; + } + + let day: u32 = parts[0].parse().ok()?; + let month_num: u32 = parts[1].parse().ok()?; + let year: u32 = parts[2].parse().ok()?; + + if month_num == 0 || month_num > 12 || day == 0 || day > 31 { + return None; + } + + let month_name = MONTH_NAMES[month_num as usize]; + let day_word = ordinal_word_ter(day); + let year_words = verbalize_year(year)?; + + Some(format!("{} {} {}", day_word, month_name, year_words)) +} + +/// Verbalize a year in German. +/// - 2025 → "zweitausendfuenfundzwanzig" +/// - 2000 → "zweitausend" +/// - 1990 → "neunzehnhundertneunzig" +fn verbalize_year(year: u32) -> Option { + if year == 0 { + return Some("null".to_string()); + } + // German typically says the full number for years + Some(number_to_words(year as i64)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_german_date() { + assert_eq!( + parse("5. Januar 2025"), + Some("fuenfter januar zweitausendfuenfundzwanzig".to_string()) + ); + assert_eq!(parse("1. Mai"), Some("erster mai".to_string())); + assert_eq!( + parse("3. 
Dezember 2000"), + Some("dritter dezember zweitausend".to_string()) + ); + } + + #[test] + fn test_english_month() { + assert_eq!( + parse("January 5, 2025"), + Some("fuenfter januar zweitausendfuenfundzwanzig".to_string()) + ); + } + + #[test] + fn test_numeric_date() { + assert_eq!( + parse("05.01.2025"), + Some("fuenfter januar zweitausendfuenfundzwanzig".to_string()) + ); + assert_eq!( + parse("31.12.1999"), + Some("einunddreissigster dezember eintausend neunhundertneunundneunzig".to_string()) + ); + } + + #[test] + fn test_decade() { + assert_eq!( + parse("1980s"), + Some("die eintausend neunhundertachtziger jahre".to_string()) + ); + assert_eq!(parse("2000s"), Some("die zweitausender jahre".to_string())); + assert_eq!( + parse("1990s"), + Some("die eintausend neunhundertneunziger jahre".to_string()) + ); + } + + #[test] + fn test_year_verbalization() { + assert_eq!( + verbalize_year(2025), + Some("zweitausendfuenfundzwanzig".to_string()) + ); + assert_eq!(verbalize_year(2000), Some("zweitausend".to_string())); + assert_eq!( + verbalize_year(1990), + Some("eintausend neunhundertneunzig".to_string()) + ); + assert_eq!( + verbalize_year(1900), + Some("eintausend neunhundert".to_string()) + ); + } + + #[test] + fn test_invalid() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("123"), None); + } +} diff --git a/src/tts/de/decimal.rs b/src/tts/de/decimal.rs new file mode 100644 index 0000000..3d8d940 --- /dev/null +++ b/src/tts/de/decimal.rs @@ -0,0 +1,139 @@ +//! Decimal TN tagger for German. +//! +//! Converts written decimal numbers to spoken German: +//! - "3,14" → "drei komma eins vier" +//! - "0,5" → "null komma fuenf" +//! - "3.14" → "drei komma eins vier" + +use super::{number_to_words, spell_digits}; + +/// German quantity suffixes recognized after a decimal number. 
+const QUANTITY_SUFFIXES: &[&str] = &[ + "billiarden", + "billiarde", + "billionen", + "billion", + "milliarden", + "milliarde", + "millionen", + "million", + "tausend", +]; + +/// Parse a written decimal number to spoken German. +pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + if trimmed.is_empty() { + return None; + } + + // Check for quantity suffix: "1,5 milliarden" + let (number_part, suffix) = extract_suffix(trimmed); + + // German uses comma as decimal separator, but also accept period + let sep = if number_part.contains(',') && !number_part.contains('.') { + ',' + } else if number_part.contains('.') { + '.' + } else { + return None; + }; + + let parts: Vec<&str> = number_part.splitn(2, sep).collect(); + if parts.len() != 2 { + return None; + } + + let int_str = parts[0]; + let frac_str = parts[1]; + + let (is_negative, int_digits) = if let Some(rest) = int_str.strip_prefix('-') { + (true, rest) + } else { + (false, int_str) + }; + + if !int_digits.chars().all(|c| c.is_ascii_digit()) { + return None; + } + if frac_str.is_empty() || !frac_str.chars().all(|c| c.is_ascii_digit()) { + return None; + } + + let int_val: i64 = if int_digits.is_empty() { + 0 + } else { + int_digits.parse().ok()? + }; + + let int_words = number_to_words(int_val); + let frac_words = spell_digits(frac_str); + + let mut result = if is_negative { + format!("minus {} komma {}", int_words, frac_words) + } else { + format!("{} komma {}", int_words, frac_words) + }; + + if let Some(suf) = suffix { + result.push(' '); + result.push_str(suf); + } + + Some(result) +} + +/// Extract a quantity suffix from the end if present. 
+fn extract_suffix(input: &str) -> (&str, Option<&str>) { + for &suf in QUANTITY_SUFFIXES { + if let Some(before) = input.strip_suffix(suf) { + let before = before.trim_end(); + if !before.is_empty() { + return (before, Some(suf)); + } + } + } + (input, None) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_comma_decimal() { + assert_eq!(parse("3,14"), Some("drei komma eins vier".to_string())); + assert_eq!(parse("0,5"), Some("null komma fuenf".to_string())); + } + + #[test] + fn test_period_decimal() { + assert_eq!(parse("3.14"), Some("drei komma eins vier".to_string())); + } + + #[test] + fn test_negative() { + assert_eq!( + parse("-3,14"), + Some("minus drei komma eins vier".to_string()) + ); + } + + #[test] + fn test_with_quantity() { + assert_eq!( + parse("1,5 milliarden"), + Some("eins komma fuenf milliarden".to_string()) + ); + assert_eq!( + parse("4,85 millionen"), + Some("vier komma acht fuenf millionen".to_string()) + ); + } + + #[test] + fn test_non_decimal() { + assert_eq!(parse("123"), None); + assert_eq!(parse("hello"), None); + } +} diff --git a/src/tts/de/electronic.rs b/src/tts/de/electronic.rs new file mode 100644 index 0000000..896ea53 --- /dev/null +++ b/src/tts/de/electronic.rs @@ -0,0 +1,159 @@ +//! Electronic TN tagger for German. +//! +//! Converts written emails and URLs to spoken German form: +//! - "test@gmail.com" -> "t e s t at g m a i l punkt c o m" +//! - "http://www.example.com" -> "h t t p doppelpunkt schraegstrich schraegstrich w w w punkt e x a m p l e punkt c o m" + +/// Parse an email or URL to spoken German form. +pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + if trimmed.is_empty() { + return None; + } + + // Email detection: contains @ with text on both sides + if trimmed.contains('@') { + return parse_email(trimmed); + } + + // URL detection: starts with http://, https://, or www. 
+ let lower = trimmed.to_lowercase(); + if lower.starts_with("http://") || lower.starts_with("https://") || lower.starts_with("www.") { + return parse_url(trimmed); + } + + None +} + +/// Parse an email address to spoken German form. +fn parse_email(input: &str) -> Option { + let parts: Vec<&str> = input.splitn(2, '@').collect(); + if parts.len() != 2 || parts[0].is_empty() || parts[1].is_empty() { + return None; + } + + let local = spell_domain(parts[0]); + let domain = spell_domain(parts[1]); + + Some(format!("{} at {}", local, domain)) +} + +/// Parse a URL to spoken German form. +fn parse_url(input: &str) -> Option { + let mut result = String::new(); + let lower = input.to_lowercase(); + + let rest = if lower.starts_with("https://") { + result.push_str("h t t p s doppelpunkt schraegstrich schraegstrich"); + &input["https://".len()..] + } else if lower.starts_with("http://") { + result.push_str("h t t p doppelpunkt schraegstrich schraegstrich"); + &input["http://".len()..] + } else { + input + }; + + if !result.is_empty() && !rest.is_empty() { + result.push(' '); + } + + result.push_str(&spell_domain(rest)); + + Some(result) +} + +/// Spell out a domain name, using "punkt" for periods. +fn spell_domain(domain: &str) -> String { + let parts: Vec<&str> = domain.split('.').collect(); + let spelled: Vec = parts.iter().map(|p| spell_electronic(p)).collect(); + spelled.join(" punkt ") +} + +/// Spell out an electronic string in German. +/// +/// Letters are spelled individually with spaces. +/// Digit runs are spelled individually using German digit words. +/// Special characters are mapped to German words. 
+fn spell_electronic(s: &str) -> String { + let mut parts: Vec = Vec::new(); + + for c in s.chars() { + match c { + '-' => parts.push("bindestrich".to_string()), + '_' => parts.push("unterstrich".to_string()), + '/' => parts.push("schraegstrich".to_string()), + '~' => parts.push("tilde".to_string()), + ':' => parts.push("doppelpunkt".to_string()), + c if c.is_ascii_alphabetic() => { + parts.push(c.to_lowercase().to_string()); + } + c if c.is_ascii_digit() => { + parts.push(digit_word_de(c)); + } + _ => { + // Skip unknown characters + } + } + } + + parts.join(" ") +} + +fn digit_word_de(c: char) -> String { + match c { + '0' => "null", + '1' => "eins", + '2' => "zwei", + '3' => "drei", + '4' => "vier", + '5' => "fuenf", + '6' => "sechs", + '7' => "sieben", + '8' => "acht", + '9' => "neun", + _ => "", + } + .to_string() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_email() { + assert_eq!( + parse("test@gmail.com"), + Some("t e s t at g m a i l punkt c o m".to_string()) + ); + assert_eq!( + parse("info@example.de"), + Some("i n f o at e x a m p l e punkt d e".to_string()) + ); + } + + #[test] + fn test_url_http() { + assert_eq!( + parse("http://www.example.com"), + Some( + "h t t p doppelpunkt schraegstrich schraegstrich w w w punkt e x a m p l e punkt c o m" + .to_string() + ) + ); + } + + #[test] + fn test_url_www() { + assert_eq!( + parse("www.example.de"), + Some("w w w punkt e x a m p l e punkt d e".to_string()) + ); + } + + #[test] + fn test_non_electronic() { + assert_eq!(parse("hallo"), None); + assert_eq!(parse("12345"), None); + } +} diff --git a/src/tts/de/measure.rs b/src/tts/de/measure.rs new file mode 100644 index 0000000..aa4a0dd --- /dev/null +++ b/src/tts/de/measure.rs @@ -0,0 +1,238 @@ +//! Measure TN tagger for German. +//! +//! Converts written measurements to spoken German: +//! - "200 km/h" → "zweihundert kilometer pro stunde" +//! - "1 kg" → "ein kilogramm" +//! 
- "72°C" → "zweiundsiebzig grad celsius" + +use super::number_to_words; + +use lazy_static::lazy_static; +use std::collections::HashMap; + +struct UnitInfo { + singular: &'static str, + plural: &'static str, +} + +lazy_static! { + static ref UNITS: HashMap<&'static str, UnitInfo> = { + let mut m = HashMap::new(); + + // Length + m.insert("mm", UnitInfo { singular: "millimeter", plural: "millimeter" }); + m.insert("cm", UnitInfo { singular: "zentimeter", plural: "zentimeter" }); + m.insert("m", UnitInfo { singular: "meter", plural: "meter" }); + m.insert("km", UnitInfo { singular: "kilometer", plural: "kilometer" }); + m.insert("in", UnitInfo { singular: "zoll", plural: "zoll" }); + m.insert("ft", UnitInfo { singular: "fuss", plural: "fuss" }); + m.insert("mi", UnitInfo { singular: "meile", plural: "meilen" }); + + // Weight + m.insert("mg", UnitInfo { singular: "milligramm", plural: "milligramm" }); + m.insert("g", UnitInfo { singular: "gramm", plural: "gramm" }); + m.insert("kg", UnitInfo { singular: "kilogramm", plural: "kilogramm" }); + m.insert("lb", UnitInfo { singular: "pfund", plural: "pfund" }); + m.insert("oz", UnitInfo { singular: "unze", plural: "unzen" }); + m.insert("t", UnitInfo { singular: "tonne", plural: "tonnen" }); + + // Volume + m.insert("ml", UnitInfo { singular: "milliliter", plural: "milliliter" }); + m.insert("l", UnitInfo { singular: "liter", plural: "liter" }); + m.insert("L", UnitInfo { singular: "liter", plural: "liter" }); + + // Speed — "pro" instead of "per" for rates + m.insert("km/h", UnitInfo { singular: "kilometer pro stunde", plural: "kilometer pro stunde" }); + m.insert("mph", UnitInfo { singular: "meile pro stunde", plural: "meilen pro stunde" }); + m.insert("m/s", UnitInfo { singular: "meter pro sekunde", plural: "meter pro sekunde" }); + + // Time + m.insert("s", UnitInfo { singular: "sekunde", plural: "sekunden" }); + m.insert("sec", UnitInfo { singular: "sekunde", plural: "sekunden" }); + m.insert("min", UnitInfo { 
singular: "minute", plural: "minuten" }); + m.insert("h", UnitInfo { singular: "stunde", plural: "stunden" }); + m.insert("hr", UnitInfo { singular: "stunde", plural: "stunden" }); + + // Temperature + m.insert("°C", UnitInfo { singular: "grad celsius", plural: "grad celsius" }); + m.insert("°F", UnitInfo { singular: "grad fahrenheit", plural: "grad fahrenheit" }); + + // Data + m.insert("KB", UnitInfo { singular: "kilobyte", plural: "kilobyte" }); + m.insert("MB", UnitInfo { singular: "megabyte", plural: "megabyte" }); + m.insert("GB", UnitInfo { singular: "gigabyte", plural: "gigabyte" }); + m.insert("TB", UnitInfo { singular: "terabyte", plural: "terabyte" }); + + // Percentage — "prozent" + m.insert("%", UnitInfo { singular: "prozent", plural: "prozent" }); + + // Frequency + m.insert("Hz", UnitInfo { singular: "hertz", plural: "hertz" }); + m.insert("kHz", UnitInfo { singular: "kilohertz", plural: "kilohertz" }); + m.insert("MHz", UnitInfo { singular: "megahertz", plural: "megahertz" }); + m.insert("GHz", UnitInfo { singular: "gigahertz", plural: "gigahertz" }); + + m + }; +} + +/// Parse a written measurement to spoken German. 
+pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + if trimmed.is_empty() { + return None; + } + + let mut unit_matches: Vec<(&str, &UnitInfo)> = UNITS + .iter() + .filter(|(unit, _)| { + trimmed.ends_with(*unit) + && (trimmed.len() == unit.len() || { + let before = &trimmed[..trimmed.len() - unit.len()]; + if unit.len() == 1 && unit.chars().all(|c| c.is_ascii_alphabetic()) { + before.ends_with(' ') + } else { + before.ends_with(' ') || before.ends_with(|c: char| c.is_ascii_digit()) + } + }) + }) + .map(|(k, v)| (*k, v)) + .collect(); + + unit_matches.sort_by(|a, b| b.0.len().cmp(&a.0.len())); + + for (unit_str, unit_info) in unit_matches { + let num_part = trimmed[..trimmed.len() - unit_str.len()].trim(); + if num_part.is_empty() { + continue; + } + + let (is_negative, digits) = if let Some(rest) = num_part.strip_prefix('-') { + (true, rest.trim()) + } else { + (false, num_part) + }; + + let clean: String = digits + .chars() + .filter(|c| c.is_ascii_digit() || *c == '.' || *c == ',') + .collect(); + + if clean.is_empty() + || !clean + .chars() + .all(|c| c.is_ascii_digit() || c == '.' || c == ',') + { + continue; + } + + // Handle decimals + let decimal_sep = if clean.contains(',') { ',' } else { '.' 
}; + if clean.contains(decimal_sep) { + let parts: Vec<&str> = clean.splitn(2, decimal_sep).collect(); + if parts.len() == 2 { + let int_val: i64 = if parts[0].is_empty() { + 0 + } else { + let Ok(v) = parts[0].parse::() else { + continue; + }; + v + }; + let int_words = number_to_words(int_val); + let frac_words = super::spell_digits(parts[1]); + let unit_word = unit_info.plural; + let num_words = if is_negative { + format!("minus {} komma {}", int_words, frac_words) + } else { + format!("{} komma {}", int_words, frac_words) + }; + return Some(format!("{} {}", num_words, unit_word)); + } + continue; + } + + let Ok(n) = clean.parse::() else { + continue; + }; + + // Use "ein" instead of "eins" when before a unit + let num_words = if n == 1 && !is_negative { + "ein".to_string() + } else if n == 1 && is_negative { + "minus ein".to_string() + } else if is_negative { + format!("minus {}", number_to_words(n)) + } else { + number_to_words(n) + }; + + let abs_n = n.unsigned_abs(); + let unit_word = if abs_n == 1 { + unit_info.singular + } else { + unit_info.plural + }; + + return Some(format!("{} {}", num_words, unit_word)); + } + + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + assert_eq!( + parse("200 km/h"), + Some("zweihundert kilometer pro stunde".to_string()) + ); + assert_eq!(parse("1 kg"), Some("ein kilogramm".to_string())); + assert_eq!(parse("2 kg"), Some("zwei kilogramm".to_string())); + } + + #[test] + fn test_temperature() { + assert_eq!( + parse("72°C"), + Some("zweiundsiebzig grad celsius".to_string()) + ); + } + + #[test] + fn test_percentage() { + assert_eq!(parse("50%"), Some("fuenfzig prozent".to_string())); + assert_eq!(parse("100%"), Some("einhundert prozent".to_string())); + } + + #[test] + fn test_negative() { + assert_eq!( + parse("-66 kg"), + Some("minus sechsundsechzig kilogramm".to_string()) + ); + } + + #[test] + fn test_data() { + assert_eq!(parse("500 MB"), Some("fuenfhundert megabyte".to_string())); + 
assert_eq!(parse("1 GB"), Some("ein gigabyte".to_string())); + } + + #[test] + fn test_decimal_with_empty_integer() { + assert_eq!( + parse(".5 kg"), + Some("null komma fuenf kilogramm".to_string()) + ); + } + + #[test] + fn test_non_measure() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("123"), None); + } +} diff --git a/src/tts/de/mod.rs b/src/tts/de/mod.rs new file mode 100644 index 0000000..c4cbdbc --- /dev/null +++ b/src/tts/de/mod.rs @@ -0,0 +1,275 @@ +//! Text Normalization taggers for German. +//! +//! Converts written-form text to spoken German: +//! - "200" → "zweihundert" +//! - "5,50 €" → "fuenf euro und fuenfzig cent" +//! - "5. Januar 2025" → "fuenfter januar zweitausendfuenfundzwanzig" + +pub mod cardinal; +pub mod date; +pub mod decimal; +pub mod electronic; +pub mod measure; +pub mod money; +pub mod ordinal; +pub mod telephone; +pub mod time; +pub mod whitelist; + +/// Ones words indexed by value (0..13). +/// "eins" is the standalone form; "ein" is used in compounds. +const ONES: [&str; 13] = [ + "null", "eins", "zwei", "drei", "vier", "fuenf", "sechs", "sieben", "acht", "neun", "zehn", + "elf", "zwoelf", +]; + +/// Ones words used inside compounds (einundzwanzig, einhundert). +const ONES_COMPOUND: [&str; 10] = [ + "", "ein", "zwei", "drei", "vier", "fuenf", "sechs", "sieben", "acht", "neun", +]; + +/// Teens 13..19. +const TEENS: [&str; 7] = [ + "dreizehn", + "vierzehn", + "fuenfzehn", + "sechzehn", + "siebzehn", + "achtzehn", + "neunzehn", +]; + +/// Tens words indexed by tens digit (2..9 → index 0..7). +const TENS: [&str; 8] = [ + "zwanzig", "dreissig", "vierzig", "fuenfzig", "sechzig", "siebzig", "achtzig", "neunzig", +]; + +/// Convert an integer to German words. 
+/// +/// Examples: +/// - `0` → `"null"` +/// - `21` → `"einundzwanzig"` +/// - `200` → `"zweihundert"` +/// - `1000` → `"eintausend"` +/// - `-42` → `"minus zweiundvierzig"` +pub fn number_to_words(n: i64) -> String { + if n == 0 { + return "null".to_string(); + } + + if n < 0 { + let abs_val = (n as u64).wrapping_neg(); + return format!("minus {}", unsigned_to_words(abs_val)); + } + + unsigned_to_words(n as u64) +} + +fn unsigned_to_words(n: u64) -> String { + if n == 0 { + return "null".to_string(); + } + + let mut parts: Vec = Vec::new(); + let mut remaining = n; + + // German uses long scale: Billion = 10^12, Milliarde = 10^9 + // "eine Million", "zwei Millionen" etc. + let scales: &[(u64, &str, &str)] = &[ + (1_000_000_000_000, "billion", "billionen"), + (1_000_000_000, "milliarde", "milliarden"), + (1_000_000, "million", "millionen"), + (1_000, "tausend", "tausend"), + ]; + + for &(scale_value, singular, plural) in scales { + if remaining >= scale_value { + let chunk = remaining / scale_value; + remaining %= scale_value; + + if scale_value == 1_000 { + let thousands_word = if chunk == 1 { + "eintausend".to_string() + } else { + format!("{}tausend", chunk_to_words(chunk as u32)) + }; + + // In German, remainder < 100 is concatenated directly to thousands: + // "zweitausendfuenfundzwanzig" (2025) but + // "eintausend zweihundertvierunddreissig" (1234) + if remaining > 0 && remaining < 100 { + let rest_words = two_digit_to_words(remaining as u32); + parts.push(format!("{}{}", thousands_word, rest_words)); + remaining = 0; + } else { + parts.push(thousands_word); + } + } else { + // Million, Milliarde, etc. 
are separate words + let chunk_words = if chunk == 1 { + "eine".to_string() + } else { + chunk_to_words(chunk as u32) + }; + let scale_word = if chunk == 1 { singular } else { plural }; + parts.push(format!("{} {}", chunk_words, scale_word)); + } + } + } + + if remaining > 0 { + parts.push(chunk_to_words(remaining as u32)); + } + + parts.join(" ") +} + +/// Convert a number 1..999 to German words. +fn chunk_to_words(n: u32) -> String { + debug_assert!(n > 0 && n < 1000); + let hundreds = n / 100; + let rest = n % 100; + + let mut result = String::new(); + + if hundreds > 0 { + result.push_str(ONES_COMPOUND[hundreds as usize]); + result.push_str("hundert"); + } + + if rest > 0 { + result.push_str(&two_digit_to_words(rest)); + } + + result +} + +/// Convert 1..99 to German words. +fn two_digit_to_words(n: u32) -> String { + debug_assert!(n > 0 && n < 100); + + if n <= 12 { + // Use standalone form for 1 in compound context + if n == 1 { + return "eins".to_string(); + } + return ONES[n as usize].to_string(); + } + + if n < 20 { + return TEENS[(n - 13) as usize].to_string(); + } + + let tens_idx = (n / 10 - 2) as usize; + let ones = n % 10; + + if ones == 0 { + TENS[tens_idx].to_string() + } else { + // German: ones-und-tens (reversed order) + format!("{}und{}", ONES_COMPOUND[ones as usize], TENS[tens_idx]) + } +} + +/// Spell each digit of a string individually in German. +pub fn spell_digits(s: &str) -> String { + s.chars() + .filter_map(|c| c.to_digit(10).map(|d| ONES[d as usize])) + .collect::>() + .join(" ") +} + +/// Convert 1..99 to German words using compound form for 1 (ein instead of eins). +/// Used internally by other modules (e.g., money for "ein euro"). 
+#[allow(dead_code)] +pub(crate) fn two_digit_compound(n: u32) -> String { + if n == 0 { + return "null".to_string(); + } + if n == 1 { + return "ein".to_string(); + } + if n <= 12 { + return ONES[n as usize].to_string(); + } + if n < 20 { + return TEENS[(n - 13) as usize].to_string(); + } + let tens_idx = (n / 10 - 2) as usize; + let ones = n % 10; + if ones == 0 { + TENS[tens_idx].to_string() + } else { + format!("{}und{}", ONES_COMPOUND[ones as usize], TENS[tens_idx]) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + assert_eq!(number_to_words(0), "null"); + assert_eq!(number_to_words(1), "eins"); + assert_eq!(number_to_words(10), "zehn"); + assert_eq!(number_to_words(11), "elf"); + assert_eq!(number_to_words(12), "zwoelf"); + assert_eq!(number_to_words(13), "dreizehn"); + assert_eq!(number_to_words(17), "siebzehn"); + assert_eq!(number_to_words(20), "zwanzig"); + assert_eq!(number_to_words(21), "einundzwanzig"); + assert_eq!(number_to_words(32), "zweiunddreissig"); + } + + #[test] + fn test_tens() { + assert_eq!(number_to_words(30), "dreissig"); + assert_eq!(number_to_words(40), "vierzig"); + assert_eq!(number_to_words(50), "fuenfzig"); + assert_eq!(number_to_words(60), "sechzig"); + assert_eq!(number_to_words(70), "siebzig"); + assert_eq!(number_to_words(80), "achtzig"); + assert_eq!(number_to_words(90), "neunzig"); + assert_eq!(number_to_words(99), "neunundneunzig"); + } + + #[test] + fn test_hundreds() { + assert_eq!(number_to_words(100), "einhundert"); + assert_eq!(number_to_words(200), "zweihundert"); + assert_eq!(number_to_words(201), "zweihunderteins"); + assert_eq!(number_to_words(123), "einhundertdreiundzwanzig"); + assert_eq!(number_to_words(999), "neunhundertneunundneunzig"); + } + + #[test] + fn test_thousands() { + assert_eq!(number_to_words(1000), "eintausend"); + assert_eq!(number_to_words(2000), "zweitausend"); + assert_eq!(number_to_words(2025), "zweitausendfuenfundzwanzig"); + assert_eq!( + 
number_to_words(1234), + "eintausend zweihundertvierunddreissig" + ); + } + + #[test] + fn test_millions() { + assert_eq!(number_to_words(1000000), "eine million"); + assert_eq!(number_to_words(2000000), "zwei millionen"); + assert_eq!(number_to_words(2000003), "zwei millionen drei"); + } + + #[test] + fn test_negative() { + assert_eq!(number_to_words(-42), "minus zweiundvierzig"); + } + + #[test] + fn test_spell_digits() { + assert_eq!(spell_digits("14"), "eins vier"); + assert_eq!(spell_digits("0"), "null"); + assert_eq!(spell_digits("987"), "neun acht sieben"); + } +} diff --git a/src/tts/de/money.rs b/src/tts/de/money.rs new file mode 100644 index 0000000..ba5d657 --- /dev/null +++ b/src/tts/de/money.rs @@ -0,0 +1,315 @@ +//! Money TN tagger for German. +//! +//! Converts written currency expressions to spoken German: +//! - "5,50 €" → "fuenf euro und fuenfzig cent" +//! - "€5.50" → "fuenf euro und fuenfzig cent" +//! - "$100" → "hundert dollar" +//! - "£1" → "ein pfund" + +use super::number_to_words; + +struct Currency { + singular: &'static str, + plural: &'static str, + cent_singular: &'static str, + cent_plural: &'static str, +} + +const EURO: Currency = Currency { + singular: "euro", + plural: "euro", + cent_singular: "cent", + cent_plural: "cent", +}; + +const DOLLAR: Currency = Currency { + singular: "dollar", + plural: "dollar", + cent_singular: "cent", + cent_plural: "cent", +}; + +const POUND: Currency = Currency { + singular: "pfund", + plural: "pfund", + cent_singular: "penny", + cent_plural: "pence", +}; + +const YEN: Currency = Currency { + singular: "yen", + plural: "yen", + cent_singular: "sen", + cent_plural: "sen", +}; + +/// Scale suffixes recognized after a currency amount. +const SCALE_SUFFIXES: &[&str] = &[ + "billiarden", + "billiarde", + "billionen", + "billion", + "milliarden", + "milliarde", + "millionen", + "million", + "tausend", +]; + +/// Parse a written money expression to spoken German. 
+pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + if trimmed.is_empty() { + return None; + } + + // Try suffix symbol: "5,50 €", "100 €" + if let Some(result) = parse_suffix_currency(trimmed) { + return Some(result); + } + + // Try prefix symbol: "$5.50", "€100", "£1" + if let Some(result) = parse_prefix_currency(trimmed) { + return Some(result); + } + + None +} + +fn parse_suffix_currency(input: &str) -> Option { + let (amount_str, currency) = if let Some(s) = input.strip_suffix('€') { + (s.trim(), &EURO) + } else if let Some(s) = input.strip_suffix("EUR") { + (s.trim(), &EURO) + } else { + return None; + }; + + parse_amount(amount_str, currency) +} + +fn parse_prefix_currency(input: &str) -> Option { + let (currency, rest) = if let Some(r) = input.strip_prefix('$') { + (&DOLLAR, r) + } else if let Some(r) = input.strip_prefix('€') { + (&EURO, r) + } else if let Some(r) = input.strip_prefix('£') { + (&POUND, r) + } else if let Some(r) = input.strip_prefix('¥') { + (&YEN, r) + } else { + return None; + }; + + let rest = rest.trim(); + if rest.is_empty() { + return None; + } + + // Check for scale suffix: "$2,5 milliarden" + let (amount_str, scale) = extract_scale(rest); + + // Without a scale suffix, the amount must be purely numeric + if scale.is_none() + && !amount_str + .chars() + .all(|c| c.is_ascii_digit() || c == '.' || c == ',' || c == ' ') + { + return None; + } + + if let Some(scale_word) = scale { + // With scale: "$2,5 milliarden" → "zwei komma fuenf milliarden dollar" + let decimal_sep = if amount_str.contains(',') { + ',' + } else if amount_str.contains('.') { + '.' 
+ } else { + // No decimal: "$50 millionen" → "fuenfzig millionen dollar" + let clean: String = amount_str.chars().filter(|c| c.is_ascii_digit()).collect(); + let n: i64 = clean.parse().ok()?; + let words = number_to_words(n); + return Some(format!("{} {} {}", words, scale_word, currency.plural)); + }; + + let parts: Vec<&str> = amount_str.splitn(2, decimal_sep).collect(); + if parts.len() == 2 { + let int_val: i64 = parts[0].parse().ok()?; + let int_words = number_to_words(int_val); + let frac_words = super::spell_digits(parts[1]); + return Some(format!( + "{} komma {} {} {}", + int_words, frac_words, scale_word, currency.plural + )); + } + } + + parse_amount(amount_str, currency) +} + +/// Extract scale suffix from the amount string. +fn extract_scale(input: &str) -> (&str, Option<&str>) { + for &scale in SCALE_SUFFIXES { + if let Some(before) = input.strip_suffix(scale) { + let before = before.trim_end(); + if !before.is_empty() + && before + .chars() + .all(|c| c.is_ascii_digit() || c == '.' || c == ',' || c == ' ') + { + return (before, Some(scale)); + } + } + } + (input, None) +} + +fn parse_amount(amount_str: &str, currency: &Currency) -> Option { + if amount_str.is_empty() { + return None; + } + + // Determine decimal separator: German uses comma + let sep = if amount_str.contains(',') { ',' } else { '.' }; + + if amount_str.contains(sep) && sep != '.' || amount_str.contains('.') { + let actual_sep = if amount_str.contains(',') { ',' } else { '.' }; + let parts: Vec<&str> = amount_str.splitn(2, actual_sep).collect(); + if parts.len() == 2 { + let int_clean: String = parts[0].chars().filter(|c| c.is_ascii_digit()).collect(); + let dollars: i64 = if int_clean.is_empty() { + 0 + } else { + int_clean.parse().ok()? + }; + + let cents_str = parts[1].trim(); + let cents: i64 = if cents_str.is_empty() { + 0 + } else if cents_str.len() == 1 { + cents_str.parse::().ok()? * 10 + } else if cents_str.len() == 2 { + cents_str.parse().ok()? 
+ } else { + cents_str[..2].parse().ok()? + }; + + return Some(format_currency(dollars, cents, currency)); + } + } + + let clean: String = amount_str.chars().filter(|c| c.is_ascii_digit()).collect(); + let n: i64 = clean.parse().ok()?; + Some(format_currency(n, 0, currency)) +} + +fn format_currency(dollars: i64, cents: i64, currency: &Currency) -> String { + // Use "ein" instead of "eins" for currency amounts + let dollar_words = if dollars == 1 { + "ein".to_string() + } else { + number_to_words(dollars) + }; + + if dollars == 0 && cents == 0 { + return format!("null {}", currency.plural); + } + + if dollars == 0 { + let cents_words = number_to_words(cents); + let unit = if cents == 1 { + currency.cent_singular + } else { + currency.cent_plural + }; + return format!("{} {}", cents_words, unit); + } + + if cents == 0 { + let unit = if dollars == 1 { + currency.singular + } else { + currency.plural + }; + return format!("{} {}", dollar_words, unit); + } + + let dollar_unit = if dollars == 1 { + currency.singular + } else { + currency.plural + }; + let cents_words = number_to_words(cents); + let cent_unit = if cents == 1 { + currency.cent_singular + } else { + currency.cent_plural + }; + + format!( + "{} {} und {} {}", + dollar_words, dollar_unit, cents_words, cent_unit + ) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_euro_suffix() { + assert_eq!(parse("5 €"), Some("fuenf euro".to_string())); + assert_eq!(parse("1 €"), Some("ein euro".to_string())); + assert_eq!( + parse("5,50 €"), + Some("fuenf euro und fuenfzig cent".to_string()) + ); + } + + #[test] + fn test_prefix_currencies() { + assert_eq!(parse("$100"), Some("einhundert dollar".to_string())); + assert_eq!(parse("£1"), Some("ein pfund".to_string())); + assert_eq!(parse("€100"), Some("einhundert euro".to_string())); + } + + #[test] + fn test_cents_only() { + assert_eq!(parse("€0.50"), Some("fuenfzig cent".to_string())); + assert_eq!(parse("$0.01"), Some("eins cent".to_string())); + } + 
+ #[test] + fn test_dollars_and_cents() { + assert_eq!( + parse("$5.50"), + Some("fuenf dollar und fuenfzig cent".to_string()) + ); + assert_eq!(parse("$1.01"), Some("ein dollar und eins cent".to_string())); + assert_eq!(parse("$0.99"), Some("neunundneunzig cent".to_string())); + } + + #[test] + fn test_large_amounts() { + assert_eq!( + parse("$2,5 milliarden"), + Some("zwei komma fuenf milliarden dollar".to_string()) + ); + assert_eq!( + parse("$50 millionen"), + Some("fuenfzig millionen dollar".to_string()) + ); + } + + #[test] + fn test_trailing_dot() { + assert_eq!(parse("$5."), Some("fuenf dollar".to_string())); + assert_eq!(parse("$1."), Some("ein dollar".to_string())); + } + + #[test] + fn test_non_money() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("123"), None); + } +} diff --git a/src/tts/de/ordinal.rs b/src/tts/de/ordinal.rs new file mode 100644 index 0000000..99933b1 --- /dev/null +++ b/src/tts/de/ordinal.rs @@ -0,0 +1,126 @@ +//! Ordinal TN tagger for German. +//! +//! Converts written ordinal numbers to spoken German: +//! - "1." → "erste" +//! - "2." → "zweite" +//! - "3." → "dritte" +//! - "20." → "zwanzigste" + +use super::number_to_words; + +/// Special ordinal forms for 1-19 (irregular). +const SPECIAL_ORDINALS: &[(u32, &str)] = &[ + (1, "erste"), + (2, "zweite"), + (3, "dritte"), + (4, "vierte"), + (5, "fuenfte"), + (6, "sechste"), + (7, "siebte"), + (8, "achte"), +]; + +/// Parse a written ordinal to spoken German words. +/// +/// German ordinals are formed by adding a period after the number: "1.", "2.", "3." +pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + + // German ordinals end with a period: "1.", "2.", "3." + let num_str = trimmed.strip_suffix('.')?; + + if num_str.is_empty() || !num_str.chars().all(|c| c.is_ascii_digit()) { + return None; + } + + let n: u32 = num_str.parse().ok()?; + if n == 0 { + return None; + } + + Some(ordinal_word(n)) +} + +/// Convert a number to its German ordinal word form. 
+fn ordinal_word(n: u32) -> String { + // Check special forms first + for &(num, word) in SPECIAL_ORDINALS { + if n == num { + return word.to_string(); + } + } + + let cardinal = number_to_words(n as i64); + + if n < 20 { + // 1-19: add "-te" suffix (special cases handled above) + format!("{}te", cardinal) + } else { + // 20+: add "-ste" suffix + format!("{}ste", cardinal) + } +} + +/// Convert a number to its German ordinal with "-ter" ending (for dates). +pub(crate) fn ordinal_word_ter(n: u32) -> String { + if n == 0 { + return "nullter".to_string(); + } + + // Check special forms + let base = match n { + 1 => "erster".to_string(), + 2 => "zweiter".to_string(), + 3 => "dritter".to_string(), + 4 => "vierter".to_string(), + 5 => "fuenfter".to_string(), + 6 => "sechster".to_string(), + 7 => "siebter".to_string(), + 8 => "achter".to_string(), + _ => { + let cardinal = number_to_words(n as i64); + if n < 20 { + format!("{}ter", cardinal) + } else { + format!("{}ster", cardinal) + } + } + }; + base +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_special_ordinals() { + assert_eq!(parse("1."), Some("erste".to_string())); + assert_eq!(parse("2."), Some("zweite".to_string())); + assert_eq!(parse("3."), Some("dritte".to_string())); + assert_eq!(parse("7."), Some("siebte".to_string())); + assert_eq!(parse("8."), Some("achte".to_string())); + } + + #[test] + fn test_regular_ordinals() { + assert_eq!(parse("9."), Some("neunte".to_string())); + assert_eq!(parse("10."), Some("zehnte".to_string())); + assert_eq!(parse("12."), Some("zwoelfte".to_string())); + assert_eq!(parse("15."), Some("fuenfzehnte".to_string())); + } + + #[test] + fn test_ordinals_20_plus() { + assert_eq!(parse("20."), Some("zwanzigste".to_string())); + assert_eq!(parse("21."), Some("einundzwanzigste".to_string())); + assert_eq!(parse("100."), Some("einhundertste".to_string())); + } + + #[test] + fn test_non_ordinals() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("0."), None); 
+ assert_eq!(parse("."), None); + } +} diff --git a/src/tts/de/telephone.rs b/src/tts/de/telephone.rs new file mode 100644 index 0000000..fc87a49 --- /dev/null +++ b/src/tts/de/telephone.rs @@ -0,0 +1,173 @@ +//! Telephone TN tagger for German. +//! +//! Converts written phone numbers to spoken German form: +//! - "030-1234-5678" -> "null drei null, eins zwei drei vier, fuenf sechs sieben acht" +//! - "+49-30-1234-5678" -> "plus vier neun, drei null, eins zwei drei vier, fuenf sechs sieben acht" +//! - "(030) 123-4567" -> "null drei null, eins zwei drei, vier fuenf sechs sieben" + +/// Parse a written phone number to spoken German form. +pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + if trimmed.is_empty() { + return None; + } + + // Phone numbers contain digits and separators (-, ., space, parens) + // Must have mostly digits + let digit_count = trimmed.chars().filter(|c| c.is_ascii_digit()).count(); + let non_digit_non_sep = trimmed + .chars() + .filter(|c| { + !c.is_ascii_digit() + && *c != '-' + && *c != '.' + && *c != ' ' + && *c != '(' + && *c != ')' + && *c != '+' + }) + .count(); + + // Must have at least 7 digits and no unexpected characters + if digit_count < 7 || non_digit_non_sep > 0 { + return None; + } + + // Must contain at least one separator (-, ., space, parens) to distinguish + // from plain numbers like "1000000" + let has_separator = trimmed + .chars() + .any(|c| c == '-' || c == '.' 
|| c == ' ' || c == '(' || c == ')'); + if !has_separator { + return None; + } + + let mut parts: Vec = Vec::new(); + let mut has_plus = false; + + // Handle leading + + let rest = if let Some(r) = trimmed.strip_prefix('+') { + has_plus = true; + r.trim_start() + } else { + trimmed + }; + + // Split by common separators + let groups = split_phone_groups(rest); + + if has_plus && !groups.is_empty() { + // The first group after + is the country code + let mut first = String::from("plus "); + first.push_str(&spell_digit_group(&groups[0])); + parts.push(first); + for g in &groups[1..] { + parts.push(spell_digit_group(g)); + } + } else { + for g in &groups { + parts.push(spell_digit_group(g)); + } + } + + if parts.is_empty() { + return None; + } + + Some(parts.join(", ")) +} + +/// Split phone number into groups by separators. +fn split_phone_groups(input: &str) -> Vec { + let mut groups: Vec = Vec::new(); + let mut current = String::new(); + + for c in input.chars() { + match c { + '0'..='9' => current.push(c), + '-' | '.' | ' ' | '(' | ')' => { + if !current.is_empty() { + groups.push(current.clone()); + current.clear(); + } + } + _ => {} + } + } + + if !current.is_empty() { + groups.push(current); + } + + groups +} + +/// Spell each digit in a group using German words. 
/// Characters that are not decimal digits are skipped; the digit words are
/// separated by single spaces.
fn spell_digit_group(group: &str) -> String {
    const DIGIT_WORDS: [&str; 10] = [
        "null", "eins", "zwei", "drei", "vier", "fuenf", "sechs", "sieben", "acht", "neun",
    ];

    let mut spoken = String::new();
    // `to_digit(10)` yields 0..=9 only, so the table index is always valid.
    for digit in group.chars().filter_map(|c| c.to_digit(10)) {
        if !spoken.is_empty() {
            spoken.push(' ');
        }
        spoken.push_str(DIGIT_WORDS[digit as usize]);
    }
    spoken
}
+pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + + // Try "14:30" format + if let Some(result) = parse_colon_format(trimmed) { + return Some(result); + } + + // Try "14 Uhr 30" or "14 Uhr" format (German convention) + if let Some(result) = parse_uhr_format(trimmed) { + return Some(result); + } + + None +} + +fn parse_colon_format(input: &str) -> Option { + if !input.contains(':') { + return None; + } + + let parts: Vec<&str> = input.splitn(2, ':').collect(); + if parts.len() != 2 { + return None; + } + + let hour_str = parts[0].trim(); + let min_str = parts[1].trim(); + + if !hour_str.chars().all(|c| c.is_ascii_digit()) || hour_str.is_empty() { + return None; + } + if !min_str.chars().all(|c| c.is_ascii_digit()) || min_str.is_empty() { + return None; + } + + let hour: u32 = hour_str.parse().ok()?; + let minute: u32 = min_str.parse().ok()?; + + if hour > 23 || minute > 59 { + return None; + } + + Some(format_time(hour, minute)) +} + +fn parse_uhr_format(input: &str) -> Option { + let lower = input.to_lowercase(); + + // Match "14 uhr 30" or "14 uhr" + let uhr_pos = lower.find(" uhr")?; + let hour_str = &lower[..uhr_pos].trim(); + let after_uhr = &lower[uhr_pos + 4..].trim(); + + if !hour_str.chars().all(|c| c.is_ascii_digit()) || hour_str.is_empty() { + return None; + } + + let hour: u32 = hour_str.parse().ok()?; + if hour > 23 { + return None; + } + + let minute: u32 = if after_uhr.is_empty() { + 0 + } else { + if !after_uhr.chars().all(|c| c.is_ascii_digit()) { + return None; + } + let m: u32 = after_uhr.parse().ok()?; + if m > 59 { + return None; + } + m + }; + + Some(format_time(hour, minute)) +} + +fn format_time(hour: u32, minute: u32) -> String { + // Special cases + if hour == 0 && minute == 0 { + return "mitternacht".to_string(); + } + if hour == 12 && minute == 0 { + return "mittag".to_string(); + } + + let hour_words = number_to_words(hour as i64); + + if minute == 0 { + format!("{} uhr", hour_words) + } else { + format!("{} uhr {}", 
hour_words, number_to_words(minute as i64)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_colon_format() { + assert_eq!(parse("14:30"), Some("vierzehn uhr dreissig".to_string())); + assert_eq!(parse("2:00"), Some("zwei uhr".to_string())); + assert_eq!(parse("8:15"), Some("acht uhr fuenfzehn".to_string())); + } + + #[test] + fn test_uhr_format() { + assert_eq!( + parse("14 Uhr 30"), + Some("vierzehn uhr dreissig".to_string()) + ); + assert_eq!(parse("8 Uhr"), Some("acht uhr".to_string())); + } + + #[test] + fn test_special_hours() { + assert_eq!(parse("0:00"), Some("mitternacht".to_string())); + assert_eq!(parse("12:00"), Some("mittag".to_string())); + assert_eq!(parse("0:30"), Some("null uhr dreissig".to_string())); + } + + #[test] + fn test_24h() { + assert_eq!(parse("14:00"), Some("vierzehn uhr".to_string())); + assert_eq!( + parse("23:59"), + Some("dreiundzwanzig uhr neunundfuenfzig".to_string()) + ); + } + + #[test] + fn test_invalid() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("25:00"), None); + assert_eq!(parse("12:60"), None); + } +} diff --git a/src/tts/de/whitelist.rs b/src/tts/de/whitelist.rs new file mode 100644 index 0000000..631edd5 --- /dev/null +++ b/src/tts/de/whitelist.rs @@ -0,0 +1,90 @@ +//! Whitelist TN tagger for German. +//! +//! Lookup table for common German abbreviations and special terms: +//! - "Dr." -> "doktor" +//! - "Hr." -> "herr" +//! - "z.B." -> "zum beispiel" +//! - "GmbH" -> "gesellschaft mit beschraenkter haftung" + +use lazy_static::lazy_static; +use std::collections::HashMap; + +lazy_static! 
{ + static ref WHITELIST: HashMap<&'static str, &'static str> = { + let mut m = HashMap::new(); + // Titles + m.insert("Dr.", "doktor"); + m.insert("Dr", "doktor"); + m.insert("Hr.", "herr"); + m.insert("Hr", "herr"); + m.insert("Fr.", "frau"); + m.insert("Fr", "frau"); + m.insert("Prof.", "professor"); + m.insert("Prof", "professor"); + m.insert("St.", "sankt"); + m.insert("St", "sankt"); + m.insert("Jr.", "junior"); + m.insert("Sr.", "senior"); + + // Common abbreviations + m.insert("z.B.", "zum beispiel"); + m.insert("d.h.", "das heisst"); + m.insert("usw.", "und so weiter"); + m.insert("etc.", "et cetera"); + m.insert("bzw.", "beziehungsweise"); + m.insert("evtl.", "eventuell"); + m.insert("ca.", "circa"); + + // Organizational + m.insert("Nr.", "nummer"); + m.insert("Str.", "strasse"); + m.insert("GmbH", "gesellschaft mit beschraenkter haftung"); + m.insert("AG", "aktiengesellschaft"); + m.insert("Abt.", "abteilung"); + m.insert("Tel.", "telefon"); + + m + }; +} + +/// Parse a German whitelist abbreviation to its spoken form. 
+pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + + // Direct lookup (case-sensitive) + if let Some(&spoken) = WHITELIST.get(trimmed) { + return Some(spoken.to_string()); + } + + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_titles() { + assert_eq!(parse("Dr."), Some("doktor".to_string())); + assert_eq!(parse("Hr."), Some("herr".to_string())); + assert_eq!(parse("Fr."), Some("frau".to_string())); + assert_eq!(parse("Prof."), Some("professor".to_string())); + } + + #[test] + fn test_abbreviations() { + assert_eq!(parse("z.B."), Some("zum beispiel".to_string())); + assert_eq!(parse("d.h."), Some("das heisst".to_string())); + assert_eq!(parse("usw."), Some("und so weiter".to_string())); + assert_eq!( + parse("GmbH"), + Some("gesellschaft mit beschraenkter haftung".to_string()) + ); + } + + #[test] + fn test_no_match() { + assert_eq!(parse("hallo"), None); + assert_eq!(parse("welt"), None); + } +} diff --git a/src/tts/es/cardinal.rs b/src/tts/es/cardinal.rs new file mode 100644 index 0000000..4a1877e --- /dev/null +++ b/src/tts/es/cardinal.rs @@ -0,0 +1,80 @@ +//! Cardinal TN tagger for Spanish. +//! +//! Converts written cardinal numbers to spoken Spanish: +//! - "123" → "ciento veintitres" +//! - "-42" → "menos cuarenta y dos" +//! - "1 000" → "mil" + +use super::number_to_words; + +/// Parse a written cardinal number to spoken Spanish words. +pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + if trimmed.is_empty() { + return None; + } + + let (is_negative, digits_part) = if let Some(rest) = trimmed.strip_prefix('-') { + (true, rest) + } else { + (false, trimmed) + }; + + // Must be digits (with optional dots, commas, or spaces as thousands separators) + if !digits_part + .chars() + .all(|c| c.is_ascii_digit() || c == ',' || c == '.' 
|| c == ' ' || c == '\u{a0}') + { + return None; + } + + if !digits_part.chars().any(|c| c.is_ascii_digit()) { + return None; + } + + // Strip thousands separators (spaces, dots used as thousands sep in Spanish) + let clean: String = digits_part.chars().filter(|c| c.is_ascii_digit()).collect(); + let n: i64 = clean.parse().ok()?; + + if is_negative { + Some(format!("menos {}", number_to_words(n))) + } else { + Some(number_to_words(n)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + assert_eq!(parse("0"), Some("cero".to_string())); + assert_eq!(parse("1"), Some("uno".to_string())); + assert_eq!(parse("21"), Some("veintiuno".to_string())); + assert_eq!(parse("100"), Some("cien".to_string())); + assert_eq!(parse("123"), Some("ciento veintitres".to_string())); + } + + #[test] + fn test_thousands_separators() { + assert_eq!(parse("1 000"), Some("mil".to_string())); + assert_eq!(parse("1.000"), Some("mil".to_string())); + assert_eq!(parse("1,000"), Some("mil".to_string())); + assert_eq!(parse("1 000 000"), Some("un millon".to_string())); + } + + #[test] + fn test_negative() { + assert_eq!(parse("-42"), Some("menos cuarenta y dos".to_string())); + assert_eq!(parse("-1"), Some("menos uno".to_string())); + assert_eq!(parse("-1000"), Some("menos mil".to_string())); + } + + #[test] + fn test_non_numbers() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("12abc"), None); + assert_eq!(parse(""), None); + } +} diff --git a/src/tts/es/date.rs b/src/tts/es/date.rs new file mode 100644 index 0000000..c824a92 --- /dev/null +++ b/src/tts/es/date.rs @@ -0,0 +1,343 @@ +//! Date TN tagger for Spanish. +//! +//! Converts written date expressions to spoken Spanish: +//! - "5 de enero de 2025" → "cinco de enero de dos mil veinticinco" +//! - "January 5, 2025" → "cinco de enero de dos mil veinticinco" +//! 
- "05/01/2025" → "cinco de enero de dos mil veinticinco" (DD/MM/YYYY) + +use super::number_to_words; + +const MONTHS_ES: &[(&str, &str)] = &[ + ("enero", "enero"), + ("febrero", "febrero"), + ("marzo", "marzo"), + ("abril", "abril"), + ("mayo", "mayo"), + ("junio", "junio"), + ("julio", "julio"), + ("agosto", "agosto"), + ("septiembre", "septiembre"), + ("octubre", "octubre"), + ("noviembre", "noviembre"), + ("diciembre", "diciembre"), +]; + +const MONTHS_EN: &[(&str, u32)] = &[ + ("january", 1), + ("february", 2), + ("march", 3), + ("april", 4), + ("may", 5), + ("june", 6), + ("july", 7), + ("august", 8), + ("september", 9), + ("october", 10), + ("november", 11), + ("december", 12), +]; + +const MONTH_NAMES: &[&str] = &[ + "", + "enero", + "febrero", + "marzo", + "abril", + "mayo", + "junio", + "julio", + "agosto", + "septiembre", + "octubre", + "noviembre", + "diciembre", +]; + +/// Parse a written date to spoken Spanish. +pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + + // Try decade: "1980s" → "los anos mil novecientos ochenta" + if let Some(result) = parse_decade(trimmed) { + return Some(result); + } + + // Try Spanish format: "5 de enero de 2025" + if let Some(result) = parse_spanish_date(trimmed) { + return Some(result); + } + + // Try English month format: "January 5, 2025" + if let Some(result) = parse_english_month_date(trimmed) { + return Some(result); + } + + // Try numeric DD/MM/YYYY + if let Some(result) = parse_numeric_date(trimmed) { + return Some(result); + } + + None +} + +/// Parse decade: "1980s" → "los anos mil novecientos ochenta" +fn parse_decade(input: &str) -> Option { + let s = input.strip_suffix('s')?; + if s.len() != 4 || !s.chars().all(|c| c.is_ascii_digit()) { + return None; + } + + let year: u32 = s.parse().ok()?; + if year < 1000 { + return None; + } + + // Must be a round decade (ends in 0) + if year % 10 != 0 { + return None; + } + + // Spanish: "los anos" + the year number + let year_words = 
number_to_words(year as i64); + Some(format!("los anos {}", year_words)) +} + +fn parse_spanish_date(input: &str) -> Option { + let lower = input.to_lowercase(); + let tokens: Vec<&str> = lower.split_whitespace().collect(); + + // "5 de enero" or "5 de enero de 2025" + if tokens.len() < 3 { + return None; + } + + // First token is the day + let day_str = tokens[0]; + if !day_str.chars().all(|c| c.is_ascii_digit()) || day_str.is_empty() { + return None; + } + + let day: u32 = day_str.parse().ok()?; + if day == 0 || day > 31 { + return None; + } + + // Second token must be "de" + if tokens[1] != "de" { + return None; + } + + // Third token is month name + let month_name = MONTHS_ES.iter().find(|(name, _)| *name == tokens[2]); + let month_spoken = month_name?.1; + + let day_word = number_to_words(day as i64); + + // Check for "de YYYY" after month + if tokens.len() >= 5 && tokens[3] == "de" { + let year_str = + tokens[4].trim_end_matches(|c: char| c == '.' || c == ',' || c == '!' || c == '?'); + if year_str.chars().all(|c| c.is_ascii_digit()) && year_str.len() == 4 { + let year: u32 = year_str.parse().ok()?; + let year_words = verbalize_year(year)?; + return Some(format!( + "{} de {} de {}", + day_word, month_spoken, year_words + )); + } + } + + Some(format!("{} de {}", day_word, month_spoken)) +} + +fn parse_english_month_date(input: &str) -> Option { + let lower = input.to_lowercase(); + + let mut month_num = None; + let mut rest = ""; + for &(name, num) in MONTHS_EN { + if let Some(r) = lower.strip_prefix(name) { + if r.is_empty() || r.starts_with(' ') || r.starts_with(',') { + month_num = Some(num); + rest = r.trim_start_matches(|c: char| c == ' ' || c == ','); + break; + } + } + } + + let month_num = month_num?; + if rest.is_empty() { + return None; + } + + let month_name = MONTH_NAMES[month_num as usize]; + + // Parse day + let (day_str, year_part) = if let Some(comma_pos) = rest.find(',') { + (&rest[..comma_pos], Some(rest[comma_pos + 1..].trim())) + } else { + 
let parts: Vec<&str> = rest.splitn(2, ' ').collect(); + if parts.len() == 2 + && parts[0] + .trim_end_matches("st") + .trim_end_matches("nd") + .trim_end_matches("rd") + .trim_end_matches("th") + .chars() + .all(|c| c.is_ascii_digit()) + { + let year_clean = + parts[1].trim_end_matches(|c: char| c == '.' || c == ',' || c == '!' || c == '?'); + if year_clean.chars().all(|c| c.is_ascii_digit()) && year_clean.len() == 4 { + (parts[0], Some(year_clean)) + } else { + (rest, None) + } + } else { + (rest, None) + } + }; + + let day_digits = day_str + .trim() + .trim_end_matches("st") + .trim_end_matches("nd") + .trim_end_matches("rd") + .trim_end_matches("th"); + + if !day_digits.chars().all(|c| c.is_ascii_digit()) || day_digits.is_empty() { + return None; + } + + let day: u32 = day_digits.parse().ok()?; + if day == 0 || day > 31 { + return None; + } + + let day_word = number_to_words(day as i64); + + if let Some(year_str) = year_part { + let year_str = year_str + .trim() + .trim_end_matches(|c: char| c == '.' || c == ',' || c == '!' || c == '?'); + if !year_str.is_empty() && year_str.chars().all(|c| c.is_ascii_digit()) { + let year: u32 = year_str.parse().ok()?; + let year_words = verbalize_year(year)?; + return Some(format!("{} de {} de {}", day_word, month_name, year_words)); + } + } + + Some(format!("{} de {}", day_word, month_name)) +} + +/// Parse numeric date DD/MM/YYYY (European convention, same as French). 
+fn parse_numeric_date(input: &str) -> Option { + let sep = if input.contains('/') { + '/' + } else if input.contains('-') && input.chars().filter(|c| *c == '-').count() == 2 { + '-' + } else { + return None; + }; + + let parts: Vec<&str> = input.splitn(3, sep).collect(); + if parts.len() != 3 { + return None; + } + + if !parts + .iter() + .all(|p| !p.is_empty() && p.chars().all(|c| c.is_ascii_digit())) + { + return None; + } + + let day: u32 = parts[0].parse().ok()?; + let month_num: u32 = parts[1].parse().ok()?; + let year: u32 = parts[2].parse().ok()?; + + if month_num == 0 || month_num > 12 || day == 0 || day > 31 { + return None; + } + + let month_name = MONTH_NAMES[month_num as usize]; + let day_word = number_to_words(day as i64); + let year_words = verbalize_year(year)?; + + Some(format!("{} de {} de {}", day_word, month_name, year_words)) +} + +/// Verbalize a year in Spanish. +/// - 2025 → "dos mil veinticinco" +/// - 2000 → "dos mil" +/// - 1990 → "mil novecientos noventa" +fn verbalize_year(year: u32) -> Option { + if year == 0 { + return Some("cero".to_string()); + } + // Spanish reads years as full numbers + Some(number_to_words(year as i64)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_spanish_date() { + assert_eq!( + parse("5 de enero de 2025"), + Some("cinco de enero de dos mil veinticinco".to_string()) + ); + assert_eq!(parse("1 de marzo"), Some("uno de marzo".to_string())); + } + + #[test] + fn test_english_month() { + assert_eq!( + parse("January 5, 2025"), + Some("cinco de enero de dos mil veinticinco".to_string()) + ); + } + + #[test] + fn test_numeric_date() { + assert_eq!( + parse("05/01/2025"), + Some("cinco de enero de dos mil veinticinco".to_string()) + ); + } + + #[test] + fn test_decade() { + assert_eq!( + parse("1980s"), + Some("los anos mil novecientos ochenta".to_string()) + ); + assert_eq!(parse("2000s"), Some("los anos dos mil".to_string())); + assert_eq!( + parse("1990s"), + Some("los anos mil novecientos 
noventa".to_string()) + ); + } + + #[test] + fn test_year_verbalization() { + assert_eq!( + verbalize_year(2025), + Some("dos mil veinticinco".to_string()) + ); + assert_eq!(verbalize_year(2000), Some("dos mil".to_string())); + assert_eq!( + verbalize_year(1990), + Some("mil novecientos noventa".to_string()) + ); + assert_eq!(verbalize_year(1900), Some("mil novecientos".to_string())); + } + + #[test] + fn test_invalid() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("123"), None); + } +} diff --git a/src/tts/es/decimal.rs b/src/tts/es/decimal.rs new file mode 100644 index 0000000..d4df9b1 --- /dev/null +++ b/src/tts/es/decimal.rs @@ -0,0 +1,136 @@ +//! Decimal TN tagger for Spanish. +//! +//! Converts written decimal numbers to spoken Spanish: +//! - "3,14" → "tres coma uno cuatro" +//! - "3.14" → "tres coma uno cuatro" +//! - "0,5" → "cero coma cinco" + +use super::{number_to_words, spell_digits}; + +/// Spanish quantity suffixes recognized after a decimal number. +const QUANTITY_SUFFIXES: &[&str] = &[ + "billones", + "billon", + "mil millones", + "millones", + "millon", + "mil", +]; + +/// Parse a written decimal number to spoken Spanish. +pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + if trimmed.is_empty() { + return None; + } + + // Check for quantity suffix: "1,5 millones" + let (number_part, suffix) = extract_suffix(trimmed); + + // Spanish uses comma as decimal separator, but also accept period + let sep = if number_part.contains(',') && !number_part.contains('.') { + ',' + } else if number_part.contains('.') { + '.' 
+ } else { + return None; + }; + + let parts: Vec<&str> = number_part.splitn(2, sep).collect(); + if parts.len() != 2 { + return None; + } + + let int_str = parts[0]; + let frac_str = parts[1]; + + let (is_negative, int_digits) = if let Some(rest) = int_str.strip_prefix('-') { + (true, rest) + } else { + (false, int_str) + }; + + if !int_digits.chars().all(|c| c.is_ascii_digit()) { + return None; + } + if frac_str.is_empty() || !frac_str.chars().all(|c| c.is_ascii_digit()) { + return None; + } + + let int_val: i64 = if int_digits.is_empty() { + 0 + } else { + int_digits.parse().ok()? + }; + + let int_words = number_to_words(int_val); + let frac_words = spell_digits(frac_str); + + let mut result = if is_negative { + format!("menos {} coma {}", int_words, frac_words) + } else { + format!("{} coma {}", int_words, frac_words) + }; + + if let Some(suf) = suffix { + result.push(' '); + result.push_str(suf); + } + + Some(result) +} + +/// Extract a quantity suffix from the end if present. +fn extract_suffix(input: &str) -> (&str, Option<&str>) { + for &suf in QUANTITY_SUFFIXES { + if let Some(before) = input.strip_suffix(suf) { + let before = before.trim_end(); + if !before.is_empty() { + return (before, Some(suf)); + } + } + } + (input, None) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_comma_decimal() { + assert_eq!(parse("3,14"), Some("tres coma uno cuatro".to_string())); + assert_eq!(parse("0,5"), Some("cero coma cinco".to_string())); + } + + #[test] + fn test_period_decimal() { + assert_eq!(parse("3.14"), Some("tres coma uno cuatro".to_string())); + } + + #[test] + fn test_negative() { + assert_eq!( + parse("-3,14"), + Some("menos tres coma uno cuatro".to_string()) + ); + } + + #[test] + fn test_with_quantity() { + assert_eq!( + parse("1,5 millones"), + Some("uno coma cinco millones".to_string()) + ); + assert_eq!( + parse("4,85 mil millones"), + Some("cuatro coma ocho cinco mil millones".to_string()) + ); + } + + #[test] + fn 
test_non_decimal() { + assert_eq!(parse("123"), None); + assert_eq!(parse("hello"), None); + } +} diff --git a/src/tts/es/electronic.rs b/src/tts/es/electronic.rs new file mode 100644 index 0000000..fe4f368 --- /dev/null +++ b/src/tts/es/electronic.rs @@ -0,0 +1,162 @@ +//! Electronic TN tagger for Spanish. +//! +//! Converts written emails and URLs to spoken Spanish form: +//! - "test@gmail.com" → "t e s t arroba g m a i l punto c o m" +//! - "https://www.example.com" → "h t t p s dos puntos barra barra w w w punto e x a m p l e punto c o m" + +/// Parse an email or URL to spoken Spanish form. +pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + if trimmed.is_empty() { + return None; + } + + // Email detection: contains @ with text on both sides + if trimmed.contains('@') { + return parse_email(trimmed); + } + + // URL detection: starts with http://, https://, or www. + let lower = trimmed.to_lowercase(); + if lower.starts_with("http://") || lower.starts_with("https://") || lower.starts_with("www.") { + return parse_url(trimmed); + } + + None +} + +/// Parse an email address to spoken Spanish form. +fn parse_email(input: &str) -> Option { + let parts: Vec<&str> = input.splitn(2, '@').collect(); + if parts.len() != 2 || parts[0].is_empty() || parts[1].is_empty() { + return None; + } + + let local = spell_domain(parts[0]); + let domain = spell_domain(parts[1]); + + Some(format!("{} arroba {}", local, domain)) +} + +/// Parse a URL to spoken Spanish form. +fn parse_url(input: &str) -> Option { + let mut result = String::new(); + let lower = input.to_lowercase(); + + let rest = if lower.starts_with("https://") { + result.push_str("h t t p s dos puntos barra barra"); + &input["https://".len()..] + } else if lower.starts_with("http://") { + result.push_str("h t t p dos puntos barra barra"); + &input["http://".len()..] 
+ } else { + input + }; + + if !result.is_empty() && !rest.is_empty() { + result.push(' '); + } + + result.push_str(&spell_domain(rest)); + + Some(result) +} + +/// Spell out a domain name, using "punto" for periods. +fn spell_domain(domain: &str) -> String { + let parts: Vec<&str> = domain.split('.').collect(); + let spelled: Vec = parts.iter().map(|p| spell_electronic(p)).collect(); + spelled.join(" punto ") +} + +/// Spell out an electronic string in Spanish. +/// +/// Letters are spelled individually with spaces. +/// Digit runs are spelled individually in Spanish. +/// Special characters use Spanish names. +fn spell_electronic(s: &str) -> String { + let mut parts: Vec = Vec::new(); + + for c in s.chars() { + match c { + '-' => parts.push("guion".to_string()), + '_' => parts.push("guion bajo".to_string()), + '/' => parts.push("barra".to_string()), + '~' => parts.push("tilde".to_string()), + ':' => parts.push("dos puntos".to_string()), + c if c.is_ascii_alphabetic() => { + parts.push(c.to_lowercase().to_string()); + } + c if c.is_ascii_digit() => { + parts.push(digit_word_es(c)); + } + _ => { + // Skip unknown characters + } + } + } + + parts.join(" ") +} + +fn digit_word_es(c: char) -> String { + match c { + '0' => "cero", + '1' => "uno", + '2' => "dos", + '3' => "tres", + '4' => "cuatro", + '5' => "cinco", + '6' => "seis", + '7' => "siete", + '8' => "ocho", + '9' => "nueve", + _ => "", + } + .to_string() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_email() { + assert_eq!( + parse("test@gmail.com"), + Some("t e s t arroba g m a i l punto c o m".to_string()) + ); + assert_eq!( + parse("juan.perez@ejemplo.com"), + Some("j u a n punto p e r e z arroba e j e m p l o punto c o m".to_string()) + ); + } + + #[test] + fn test_url_http() { + assert_eq!( + parse("http://www.ejemplo.com"), + Some( + "h t t p dos puntos barra barra w w w punto e j e m p l o punto c o m".to_string() + ) + ); + assert_eq!( + parse("https://google.com"), + Some("h t t 
p s dos puntos barra barra g o o g l e punto c o m".to_string()) + ); + } + + #[test] + fn test_www_url() { + assert_eq!( + parse("www.ejemplo.com"), + Some("w w w punto e j e m p l o punto c o m".to_string()) + ); + } + + #[test] + fn test_non_electronic() { + assert_eq!(parse("hola"), None); + assert_eq!(parse("123"), None); + } +} diff --git a/src/tts/es/measure.rs b/src/tts/es/measure.rs new file mode 100644 index 0000000..a34eb47 --- /dev/null +++ b/src/tts/es/measure.rs @@ -0,0 +1,250 @@ +//! Measure TN tagger for Spanish. +//! +//! Converts written measurements to spoken Spanish: +//! - "200 km/h" → "doscientos kilometros por hora" +//! - "1 kg" → "un kilogramo" +//! - "72°C" → "setenta y dos grados celsius" + +use super::number_to_words; + +use lazy_static::lazy_static; +use std::collections::HashMap; + +struct UnitInfo { + singular: &'static str, + plural: &'static str, +} + +lazy_static! { + static ref UNITS: HashMap<&'static str, UnitInfo> = { + let mut m = HashMap::new(); + + // Length + m.insert("mm", UnitInfo { singular: "milimetro", plural: "milimetros" }); + m.insert("cm", UnitInfo { singular: "centimetro", plural: "centimetros" }); + m.insert("m", UnitInfo { singular: "metro", plural: "metros" }); + m.insert("km", UnitInfo { singular: "kilometro", plural: "kilometros" }); + m.insert("in", UnitInfo { singular: "pulgada", plural: "pulgadas" }); + m.insert("ft", UnitInfo { singular: "pie", plural: "pies" }); + m.insert("mi", UnitInfo { singular: "milla", plural: "millas" }); + + // Weight + m.insert("mg", UnitInfo { singular: "miligramo", plural: "miligramos" }); + m.insert("g", UnitInfo { singular: "gramo", plural: "gramos" }); + m.insert("kg", UnitInfo { singular: "kilogramo", plural: "kilogramos" }); + m.insert("lb", UnitInfo { singular: "libra", plural: "libras" }); + m.insert("oz", UnitInfo { singular: "onza", plural: "onzas" }); + m.insert("t", UnitInfo { singular: "tonelada", plural: "toneladas" }); + + // Volume + m.insert("ml", UnitInfo { 
singular: "mililitro", plural: "mililitros" }); + m.insert("l", UnitInfo { singular: "litro", plural: "litros" }); + m.insert("L", UnitInfo { singular: "litro", plural: "litros" }); + + // Speed + m.insert("km/h", UnitInfo { singular: "kilometro por hora", plural: "kilometros por hora" }); + m.insert("mph", UnitInfo { singular: "milla por hora", plural: "millas por hora" }); + m.insert("m/s", UnitInfo { singular: "metro por segundo", plural: "metros por segundo" }); + + // Time + m.insert("s", UnitInfo { singular: "segundo", plural: "segundos" }); + m.insert("sec", UnitInfo { singular: "segundo", plural: "segundos" }); + m.insert("min", UnitInfo { singular: "minuto", plural: "minutos" }); + m.insert("h", UnitInfo { singular: "hora", plural: "horas" }); + m.insert("hr", UnitInfo { singular: "hora", plural: "horas" }); + + // Temperature + m.insert("°C", UnitInfo { singular: "grado celsius", plural: "grados celsius" }); + m.insert("°F", UnitInfo { singular: "grado fahrenheit", plural: "grados fahrenheit" }); + + // Data + m.insert("KB", UnitInfo { singular: "kilobyte", plural: "kilobytes" }); + m.insert("MB", UnitInfo { singular: "megabyte", plural: "megabytes" }); + m.insert("GB", UnitInfo { singular: "gigabyte", plural: "gigabytes" }); + m.insert("TB", UnitInfo { singular: "terabyte", plural: "terabytes" }); + + // Percentage + m.insert("%", UnitInfo { singular: "por ciento", plural: "por ciento" }); + + // Frequency + m.insert("Hz", UnitInfo { singular: "hercio", plural: "hercios" }); + m.insert("kHz", UnitInfo { singular: "kilohercio", plural: "kilohercios" }); + m.insert("MHz", UnitInfo { singular: "megahercio", plural: "megahercios" }); + m.insert("GHz", UnitInfo { singular: "gigahercio", plural: "gigahercios" }); + + m + }; +} + +/// Convert trailing "uno" to "un" for use before masculine nouns. 
+/// "uno" → "un", "veintiuno" → "veintiun", "treinta y uno" → "treinta y un" +fn apocope_uno(s: &str) -> String { + if s == "uno" { + return "un".to_string(); + } + if let Some(prefix) = s.strip_suffix(" uno") { + return format!("{} un", prefix); + } + if s.ends_with("iuno") { + // "veintiuno" → "veintiun" + return format!("{}un", &s[..s.len() - 3]); + } + s.to_string() +} + +/// Parse a written measurement to spoken Spanish. +pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + if trimmed.is_empty() { + return None; + } + + let mut unit_matches: Vec<(&str, &UnitInfo)> = UNITS + .iter() + .filter(|(unit, _)| { + trimmed.ends_with(*unit) + && (trimmed.len() == unit.len() || { + let before = &trimmed[..trimmed.len() - unit.len()]; + if unit.len() == 1 && unit.chars().all(|c| c.is_ascii_alphabetic()) { + before.ends_with(' ') + } else { + before.ends_with(' ') || before.ends_with(|c: char| c.is_ascii_digit()) + } + }) + }) + .map(|(k, v)| (*k, v)) + .collect(); + + unit_matches.sort_by(|a, b| b.0.len().cmp(&a.0.len())); + + for (unit_str, unit_info) in unit_matches { + let num_part = trimmed[..trimmed.len() - unit_str.len()].trim(); + if num_part.is_empty() { + continue; + } + + let (is_negative, digits) = if let Some(rest) = num_part.strip_prefix('-') { + (true, rest.trim()) + } else { + (false, num_part) + }; + + let clean: String = digits + .chars() + .filter(|c| c.is_ascii_digit() || *c == '.' || *c == ',') + .collect(); + + if clean.is_empty() + || !clean + .chars() + .all(|c| c.is_ascii_digit() || c == '.' || c == ',') + { + continue; + } + + // Handle decimals + let decimal_sep = if clean.contains(',') { ',' } else { '.' 
}; + if clean.contains(decimal_sep) { + let parts: Vec<&str> = clean.splitn(2, decimal_sep).collect(); + if parts.len() == 2 { + let int_val: i64 = if parts[0].is_empty() { + 0 + } else { + let Ok(v) = parts[0].parse::() else { + continue; + }; + v + }; + let int_words = number_to_words(int_val); + let frac_words = super::spell_digits(parts[1]); + let unit_word = unit_info.plural; + let num_words = if is_negative { + format!("menos {} coma {}", int_words, frac_words) + } else { + format!("{} coma {}", int_words, frac_words) + }; + return Some(format!("{} {}", num_words, unit_word)); + } + continue; + } + + let Ok(n) = clean.parse::() else { + continue; + }; + let raw_words = if is_negative { + format!("menos {}", number_to_words(n)) + } else { + number_to_words(n) + }; + // In Spanish, "uno" becomes "un" before a masculine noun (unit). + // Also "veintiuno" → "veintiun", etc. + let num_words = apocope_uno(&raw_words); + + let abs_n = n.unsigned_abs(); + let unit_word = if abs_n == 1 { + unit_info.singular + } else { + unit_info.plural + }; + + return Some(format!("{} {}", num_words, unit_word)); + } + + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + assert_eq!( + parse("200 km/h"), + Some("doscientos kilometros por hora".to_string()) + ); + assert_eq!(parse("1 kg"), Some("un kilogramo".to_string())); + assert_eq!(parse("2 kg"), Some("dos kilogramos".to_string())); + } + + #[test] + fn test_temperature() { + assert_eq!( + parse("72°C"), + Some("setenta y dos grados celsius".to_string()) + ); + } + + #[test] + fn test_percentage() { + assert_eq!(parse("50%"), Some("cincuenta por ciento".to_string())); + } + + #[test] + fn test_negative() { + assert_eq!( + parse("-66 kg"), + Some("menos sesenta y seis kilogramos".to_string()) + ); + } + + #[test] + fn test_data() { + assert_eq!(parse("500 MB"), Some("quinientos megabytes".to_string())); + assert_eq!(parse("1 GB"), Some("un gigabyte".to_string())); + } + + #[test] + fn 
test_decimal_with_empty_integer() { + assert_eq!( + parse(".5 kg"), + Some("cero coma cinco kilogramos".to_string()) + ); + } + + #[test] + fn test_non_measure() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("123"), None); + } +} diff --git a/src/tts/es/mod.rs b/src/tts/es/mod.rs new file mode 100644 index 0000000..00d1aa9 --- /dev/null +++ b/src/tts/es/mod.rs @@ -0,0 +1,268 @@ +//! Text Normalization taggers for Spanish. +//! +//! Converts written-form text to spoken Spanish: +//! - "200" → "doscientos" +//! - "5,50 €" → "cinco euros con cincuenta centimos" +//! - "5 de enero de 2025" → "cinco de enero de dos mil veinticinco" + +pub mod cardinal; +pub mod date; +pub mod decimal; +pub mod electronic; +pub mod measure; +pub mod money; +pub mod ordinal; +pub mod telephone; +pub mod time; +pub mod whitelist; + +/// Ones words indexed by value (0..30). +/// Spanish has unique words for 0-15, and single-word forms for 16-29. +const ONES: [&str; 30] = [ + "cero", + "uno", + "dos", + "tres", + "cuatro", + "cinco", + "seis", + "siete", + "ocho", + "nueve", + "diez", + "once", + "doce", + "trece", + "catorce", + "quince", + "dieciseis", + "diecisiete", + "dieciocho", + "diecinueve", + "veinte", + "veintiuno", + "veintidos", + "veintitres", + "veinticuatro", + "veinticinco", + "veintiseis", + "veintisiete", + "veintiocho", + "veintinueve", +]; + +/// Tens words indexed by tens digit (3..9 → index 0..6). +/// 20 and below are handled by ONES. 30-90 use these base words. +const TENS: [&str; 7] = [ + "treinta", + "cuarenta", + "cincuenta", + "sesenta", + "setenta", + "ochenta", + "noventa", +]; + +/// Hundreds words indexed by value (1..9 → index 0..8). +const HUNDREDS: [&str; 9] = [ + "ciento", + "doscientos", + "trescientos", + "cuatrocientos", + "quinientos", + "seiscientos", + "setecientos", + "ochocientos", + "novecientos", +]; + +/// Convert an integer to Spanish words. 
+/// +/// Examples: +/// - `0` → `"cero"` +/// - `21` → `"veintiuno"` +/// - `31` → `"treinta y uno"` +/// - `100` → `"cien"` +/// - `200` → `"doscientos"` +/// - `1000` → `"mil"` +/// - `-42` → `"menos cuarenta y dos"` +pub fn number_to_words(n: i64) -> String { + if n == 0 { + return "cero".to_string(); + } + + if n < 0 { + let abs_val = (n as u64).wrapping_neg(); + return format!("menos {}", unsigned_to_words(abs_val)); + } + + unsigned_to_words(n as u64) +} + +fn unsigned_to_words(n: u64) -> String { + if n == 0 { + return "cero".to_string(); + } + + let mut parts: Vec = Vec::new(); + let mut remaining = n; + + // Spanish uses long scale: billion = 10^12 in some contexts, + // but we follow the standard RAE convention: + // millon (10^6), mil millones (10^9 - "billon" is 10^12 in Spanish) + let scales: &[(u64, &str, &str)] = &[ + (1_000_000_000_000, "billon", "billones"), + (1_000_000, "millon", "millones"), + (1_000, "mil", "mil"), + ]; + + for &(scale_value, singular, plural) in scales { + if remaining >= scale_value { + let chunk = remaining / scale_value; + remaining %= scale_value; + + if singular == "mil" { + // "mil" never takes "un" prefix: 1000 = "mil", not "un mil" + if chunk == 1 { + parts.push("mil".to_string()); + } else { + parts.push(format!("{} mil", chunk_to_words(chunk as u32))); + } + } else { + // millon/billon: "un millon", "dos millones" + let chunk_words = chunk_to_words(chunk as u32); + if chunk == 1 { + parts.push(format!("un {}", singular)); + } else { + parts.push(format!("{} {}", chunk_words, plural)); + } + } + } + } + + if remaining > 0 { + parts.push(chunk_to_words(remaining as u32)); + } + + parts.join(" ") +} + +/// Convert a number 1..999 to Spanish words. 
+fn chunk_to_words(n: u32) -> String {
+    debug_assert!(n > 0 && n < 1000);
+    let hundreds = n / 100;
+    let rest = n % 100;
+
+    let mut result = String::new();
+
+    if hundreds > 0 {
+        if hundreds == 1 && rest == 0 {
+            // Exactly 100 is "cien"; every other value uses the HUNDREDS table entry
+            return "cien".to_string();
+        }
+        result.push_str(HUNDREDS[(hundreds - 1) as usize]);
+    }
+
+    if rest > 0 {
+        if !result.is_empty() {
+            result.push(' ');
+        }
+        result.push_str(&two_digit_to_words(rest));
+    }
+
+    result
+}
+
+/// Convert a value in 1..=99 to Spanish words.
+fn two_digit_to_words(n: u32) -> String {
+    debug_assert!(n > 0 && n < 100);
+
+    // Values below 30 are looked up directly in ONES
+    if n < 30 {
+        return ONES[n as usize].to_string();
+    }
+
+    // 30-99: tens + " y " + ones
+    let tens_idx = (n / 10 - 3) as usize;
+    let ones = n % 10;
+    if ones == 0 {
+        TENS[tens_idx].to_string()
+    } else {
+        format!("{} y {}", TENS[tens_idx], ONES[ones as usize])
+    }
+}
+
+/// Spell each decimal digit of `s` individually in Spanish; non-digit characters are skipped.
+pub fn spell_digits(s: &str) -> String {
+    s.chars()
+        .filter_map(|c| c.to_digit(10).map(|d| ONES[d as usize]))
+        .collect::<Vec<_>>()
+        .join(" ")
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_basic() {
+        assert_eq!(number_to_words(0), "cero");
+        assert_eq!(number_to_words(1), "uno");
+        assert_eq!(number_to_words(10), "diez");
+        assert_eq!(number_to_words(15), "quince");
+        assert_eq!(number_to_words(16), "dieciseis");
+        assert_eq!(number_to_words(19), "diecinueve");
+        assert_eq!(number_to_words(20), "veinte");
+        assert_eq!(number_to_words(21), "veintiuno");
+        assert_eq!(number_to_words(25), "veinticinco");
+        assert_eq!(number_to_words(29), "veintinueve");
+    }
+
+    #[test]
+    fn test_tens_with_y() {
+        assert_eq!(number_to_words(30), "treinta");
+        assert_eq!(number_to_words(31), "treinta y uno");
+        assert_eq!(number_to_words(42), "cuarenta y dos");
+        assert_eq!(number_to_words(55), "cincuenta y cinco");
+        assert_eq!(number_to_words(67), "sesenta y siete");
+        assert_eq!(number_to_words(78), "setenta y ocho");
+        assert_eq!(number_to_words(89), "ochenta y nueve");
+        assert_eq!(number_to_words(99), "noventa y nueve");
+    }
+
+    #[test]
+    fn test_hundreds() {
+        assert_eq!(number_to_words(100), "cien");
+        assert_eq!(number_to_words(101), "ciento uno");
+        assert_eq!(number_to_words(200), "doscientos");
+        assert_eq!(number_to_words(500), "quinientos");
+        assert_eq!(number_to_words(999), "novecientos noventa y nueve");
+    }
+
+    #[test]
+    fn test_thousands() {
+        assert_eq!(number_to_words(1000), "mil");
+        assert_eq!(number_to_words(2000), "dos mil");
+        assert_eq!(number_to_words(2025), "dos mil veinticinco");
+        assert_eq!(number_to_words(10000), "diez mil");
+    }
+
+    #[test]
+    fn test_millions() {
+        assert_eq!(number_to_words(1000000), "un millon");
+        assert_eq!(number_to_words(2000000), "dos millones");
+        assert_eq!(number_to_words(2000003), "dos millones tres");
+    }
+
+    #[test]
+    fn test_negative() {
+        assert_eq!(number_to_words(-42), "menos cuarenta y dos");
+    }
+
+    #[test]
+    fn test_spell_digits() {
+        assert_eq!(spell_digits("14"), "uno cuatro");
+        assert_eq!(spell_digits("0"), "cero");
+        assert_eq!(spell_digits("987"), "nueve ocho siete");
+    }
+}
diff --git a/src/tts/es/money.rs b/src/tts/es/money.rs
new file mode 100644
index 0000000..3d8605f
--- /dev/null
+++ b/src/tts/es/money.rs
@@ -0,0 +1,336 @@
+//! Money TN tagger for Spanish.
+//!
+//! Converts written currency expressions to spoken Spanish:
+//! - "5,50 €" → "cinco euros con cincuenta centimos"
+//! - "€5.50" → "cinco euros con cincuenta centimos"
+//! - "$100" → "cien dolares"
+//! - "£1" → "una libra"
+
+use super::number_to_words;
+
+struct Currency {
+    singular: &'static str,
+    plural: &'static str,
+    cent_singular: &'static str,
+    cent_plural: &'static str,
+    /// Whether "uno" becomes "una" for this currency (feminine)
+    feminine: bool,
+}
+
+const EURO: Currency = Currency {
+    singular: "euro",
+    plural: "euros",
+    cent_singular: "centimo",
+    cent_plural: "centimos",
+    feminine: false,
+};
+
+const DOLLAR: Currency = Currency {
+    singular: "dolar",
+    plural: "dolares",
+    cent_singular: "centavo",
+    cent_plural: "centavos",
+    feminine: false,
+};
+
+const POUND: Currency = Currency {
+    singular: "libra",
+    plural: "libras",
+    cent_singular: "penique",
+    cent_plural: "peniques",
+    feminine: true,
+};
+
+const YEN: Currency = Currency {
+    singular: "yen",
+    plural: "yenes",
+    cent_singular: "sen",
+    cent_plural: "sen",
+    feminine: false,
+};
+
+/// Scale suffixes recognized after a currency amount (longer forms listed before their prefixes).
+const SCALE_SUFFIXES: &[&str] = &[
+    "billones",
+    "billon",
+    "mil millones",
+    "millones",
+    "millon",
+    "mil",
+];
+
+/// Parse a written money expression to spoken Spanish.
+pub fn parse(input: &str) -> Option<String> {
+    let trimmed = input.trim();
+    if trimmed.is_empty() {
+        return None;
+    }
+
+    // Try suffix symbol: "5,50 €", "100 €"
+    if let Some(result) = parse_suffix_currency(trimmed) {
+        return Some(result);
+    }
+
+    // Try prefix symbol: "$5.50", "€100", "£1"
+    if let Some(result) = parse_prefix_currency(trimmed) {
+        return Some(result);
+    }
+
+    None
+}
+
+fn parse_suffix_currency(input: &str) -> Option<String> {
+    let (amount_str, currency) = if let Some(s) = input.strip_suffix('€') {
+        (s.trim(), &EURO)
+    } else if let Some(s) = input.strip_suffix("EUR") {
+        (s.trim(), &EURO)
+    } else {
+        return None;
+    };
+
+    parse_amount(amount_str, currency)
+}
+
+fn parse_prefix_currency(input: &str) -> Option<String> {
+    let (currency, rest) = if let Some(r) = input.strip_prefix('$') {
+        (&DOLLAR, r)
+    } else if let Some(r) = input.strip_prefix('€') {
+        (&EURO, r)
+    } else if let Some(r) = input.strip_prefix('£') {
+        (&POUND, r)
+    } else if let Some(r) = input.strip_prefix('¥') {
+        (&YEN, r)
+    } else {
+        return None;
+    };
+
+    let rest = rest.trim();
+    if rest.is_empty() {
+        return None;
+    }
+
+    // Check for scale suffix: "$2,5 millones"
+    let (amount_str, scale) = extract_scale(rest);
+
+    // Without a scale suffix, the amount must be purely numeric
+    if scale.is_none()
+        && !amount_str
+            .chars()
+            .all(|c| c.is_ascii_digit() || c == '.' || c == ',' || c == ' ')
+    {
+        return None;
+    }
+
+    if let Some(scale_word) = scale {
+        // With scale: "$2,5 millones" → "dos coma cinco millones de dolares"
+        let decimal_sep = if amount_str.contains(',') {
+            ','
+        } else if amount_str.contains('.') {
+            '.'
+        } else {
+            // No decimal: "$50 millones" → "cincuenta millones de dolares"
+            let clean: String = amount_str.chars().filter(|c| c.is_ascii_digit()).collect();
+            let n: i64 = clean.parse().ok()?;
+            let words = apocope_uno(&number_to_words(n)); // "$1 millon" → "un millon", not "uno"
+            return Some(format!("{} {} de {}", words, scale_word, currency.plural));
+        };
+
+        let parts: Vec<&str> = amount_str.splitn(2, decimal_sep).collect();
+        if parts.len() == 2 {
+            let int_val: i64 = parts[0].parse().ok()?;
+            let int_words = number_to_words(int_val);
+            let frac_words = super::spell_digits(parts[1]);
+            return Some(format!(
+                "{} coma {} {} de {}",
+                int_words, frac_words, scale_word, currency.plural
+            ));
+        }
+    }
+
+    parse_amount(amount_str, currency)
+}
+
+/// Split a trailing scale word off `input`; the part before it must be purely numeric.
+fn extract_scale(input: &str) -> (&str, Option<&str>) {
+    for &scale in SCALE_SUFFIXES {
+        if let Some(before) = input.strip_suffix(scale) {
+            let before = before.trim_end();
+            if !before.is_empty()
+                && before
+                    .chars()
+                    .all(|c| c.is_ascii_digit() || c == '.' || c == ',' || c == ' ')
+            {
+                return (before, Some(scale));
+            }
+        }
+    }
+    (input, None)
+}
+
+fn parse_amount(amount_str: &str, currency: &Currency) -> Option<String> {
+    if amount_str.is_empty() {
+        return None;
+    }
+
+    // Decimal separator: the comma when one is present (Spanish style), otherwise the period
+    let sep = if amount_str.contains(',') { ',' } else { '.' };
+
+    if amount_str.contains(sep) {
+        // Everything after the first separator is the fractional (cents) part
+        let parts: Vec<&str> = amount_str.splitn(2, sep).collect();
+        if parts.len() == 2 {
+            let int_clean: String = parts[0].chars().filter(|c| c.is_ascii_digit()).collect();
+            let dollars: i64 = if int_clean.is_empty() {
+                0
+            } else {
+                int_clean.parse().ok()?
+            };
+
+            let cents_str = parts[1].trim();
+            let cents: i64 = if cents_str.is_empty() {
+                0
+            } else if cents_str.len() == 1 {
+                cents_str.parse::<i64>().ok()? * 10
+            } else if cents_str.len() == 2 {
+                cents_str.parse().ok()?
+            } else {
+                cents_str.get(..2)?.parse().ok()? // `get` cannot panic on a multi-byte char
+            };
+
+            return Some(format_currency(dollars, cents, currency));
+        }
+    }
+
+    let clean: String = amount_str.chars().filter(|c| c.is_ascii_digit()).collect();
+    let n: i64 = clean.parse().ok()?;
+    Some(format_currency(n, 0, currency))
+}
+
+/// Convert trailing "uno" to "un" for use before masculine nouns.
+fn apocope_uno(s: &str) -> String {
+    if s == "uno" {
+        return "un".to_string();
+    }
+    if let Some(prefix) = s.strip_suffix(" uno") {
+        return format!("{} un", prefix);
+    }
+    if s.ends_with("iuno") {
+        return format!("{}un", &s[..s.len() - 3]);
+    }
+    s.to_string()
+}
+
+fn format_currency(dollars: i64, cents: i64, currency: &Currency) -> String {
+    let dollar_words = if dollars == 1 && currency.feminine {
+        "una".to_string()
+    } else if currency.feminine {
+        number_to_words(dollars)
+    } else {
+        // Masculine: "uno" → "un" before noun
+        apocope_uno(&number_to_words(dollars))
+    };
+
+    if dollars == 0 && cents == 0 {
+        return format!("cero {}", currency.plural);
+    }
+
+    if dollars == 0 {
+        let cents_words = apocope_uno(&number_to_words(cents));
+        let unit = if cents == 1 {
+            currency.cent_singular
+        } else {
+            currency.cent_plural
+        };
+        return format!("{} {}", cents_words, unit);
+    }
+
+    if cents == 0 {
+        let unit = if dollars == 1 {
+            currency.singular
+        } else {
+            currency.plural
+        };
+        return format!("{} {}", dollar_words, unit);
+    }
+
+    let dollar_unit = if dollars == 1 {
+        currency.singular
+    } else {
+        currency.plural
+    };
+    let cents_words = apocope_uno(&number_to_words(cents));
+    let cent_unit = if cents == 1 {
+        currency.cent_singular
+    } else {
+        currency.cent_plural
+    };
+
+    format!(
+        "{} {} con {} {}",
+        dollar_words, dollar_unit, cents_words, cent_unit
+    )
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_euro_suffix() {
+        assert_eq!(parse("5 €"), Some("cinco euros".to_string()));
+        assert_eq!(parse("1 €"), Some("un euro".to_string()));
+        assert_eq!(
+            parse("5,50 €"),
+            Some("cinco euros con cincuenta centimos".to_string())
+        );
+    }
+
+    #[test]
+    fn test_prefix_currencies() {
+        assert_eq!(parse("$100"), Some("cien dolares".to_string()));
+        assert_eq!(parse("£1"), Some("una libra".to_string()));
+        assert_eq!(parse("€100"), Some("cien euros".to_string()));
+    }
+
+    #[test]
+    fn test_scale_apocope_and_non_ascii_cents() {
+        assert_eq!(parse("$1 millon"), Some("un millon de dolares".to_string()));
+        assert_eq!(parse("$21 millones"), Some("veintiun millones de dolares".to_string()));
+        // A multi-byte character in the cents must be rejected, not panic
+        assert_eq!(parse("1,5é0 €"), None);
+    }
+
+    #[test]
+    fn test_dollars_and_cents() {
+        assert_eq!(
+            parse("$5.50"),
+            Some("cinco dolares con cincuenta centavos".to_string())
+        );
+        assert_eq!(parse("$1.01"), Some("un dolar con un centavo".to_string()));
+        assert_eq!(parse("$0.99"), Some("noventa y nueve centavos".to_string()));
+    }
+
+    #[test]
+    fn test_large_amounts() {
+        assert_eq!(
+            parse("$2,5 millones"),
+            Some("dos coma cinco millones de dolares".to_string())
+        );
+        assert_eq!(
+            parse("$50 millones"),
+            Some("cincuenta millones de dolares".to_string())
+        );
+    }
+
+    #[test]
+    fn test_trailing_dot() {
+        assert_eq!(parse("$5."), Some("cinco dolares".to_string()));
+        assert_eq!(parse("$1."), Some("un dolar".to_string()));
+    }
+
+    #[test]
+    fn test_non_money() {
+        assert_eq!(parse("hello"), None);
+        assert_eq!(parse("123"), None);
+    }
+}
diff --git a/src/tts/es/ordinal.rs b/src/tts/es/ordinal.rs
new file mode 100644
index 0000000..1bce002
--- /dev/null
+++ b/src/tts/es/ordinal.rs
@@ -0,0 +1,181 @@
+//! Ordinal TN tagger for Spanish.
+//!
+//! Converts written ordinal numbers to spoken Spanish:
+//! - "1.o" → "primero"
+//! - "2.o" → "segundo"
+//! - "3.o" → "tercero"
+//! - "1.a" → "primera"
+//! - "10.o" → "decimo"
+
+use super::number_to_words;
+
+/// Ordinal words for 1-10 (masculine).
+const ORDINALS_M: [&str; 11] = [
+    "", "primero", "segundo", "tercero", "cuarto", "quinto", "sexto", "septimo", "octavo",
+    "noveno", "decimo",
+];
+
+/// Ordinal words for 1-10 (feminine).
+const ORDINALS_F: [&str; 11] = [
+    "", "primera", "segunda", "tercera", "cuarta", "quinta", "sexta", "septima", "octava",
+    "novena", "decima",
+];
+
+/// Higher ordinals 11-20 (masculine).
+const ORDINALS_HIGH_M: [&str; 10] = [
+    "undecimo",
+    "duodecimo",
+    "decimotercero",
+    "decimocuarto",
+    "decimoquinto",
+    "decimosexto",
+    "decimoseptimo",
+    "decimoctavo",
+    "decimonoveno",
+    "vigesimo",
+];
+
+/// Higher ordinals 11-20 (feminine).
+const ORDINALS_HIGH_F: [&str; 10] = [
+    "undecima",
+    "duodecima",
+    "decimotercera",
+    "decimocuarta",
+    "decimoquinta",
+    "decimosexta",
+    "decimoseptima",
+    "decimoctava",
+    "decimonovena",
+    "vigesima",
+];
+
+/// Parse a written ordinal to spoken Spanish words; returns `None` for anything else.
+pub fn parse(input: &str) -> Option<String> {
+    let trimmed = input.trim();
+
+    // Detect Spanish ordinal suffixes: .o, .a, .er, .os, .as
+    // Also handle without dot: 1o, 1a, 2o, etc.
+    let (num_str, feminine) = if let Some(s) = trimmed.strip_suffix(".a") {
+        (s, true)
+    } else if let Some(s) = trimmed.strip_suffix(".as") {
+        (s, true)
+    } else if let Some(s) = trimmed.strip_suffix(".o") {
+        (s, false)
+    } else if let Some(s) = trimmed.strip_suffix(".os") {
+        (s, false)
+    } else if let Some(s) = trimmed.strip_suffix(".er") {
+        (s, false)
+    } else if trimmed.len() >= 2 {
+        // Try without dot: "1o", "2a", "3er"
+        if let Some(s) = trimmed.strip_suffix("er") {
+            if s.chars().all(|c| c.is_ascii_digit()) && !s.is_empty() {
+                (s, false)
+            } else {
+                return None;
+            }
+        } else if let Some(s) = trimmed.strip_suffix("os") {
+            if s.chars().all(|c| c.is_ascii_digit()) && !s.is_empty() {
+                (s, false)
+            } else {
+                return None;
+            }
+        } else if let Some(s) = trimmed.strip_suffix("as") {
+            if s.chars().all(|c| c.is_ascii_digit()) && !s.is_empty() {
+                (s, true)
+            } else {
+                return None;
+            }
+        } else {
+            let last = trimmed.chars().last()?;
+            let rest = &trimmed[..trimmed.len() - last.len_utf8()];
+            if last == 'a' && rest.chars().all(|c| c.is_ascii_digit()) && !rest.is_empty() {
+                (rest, true)
+            } else if last == 'o' && rest.chars().all(|c| c.is_ascii_digit()) && !rest.is_empty() {
+                (rest, false)
+            } else {
+                return None;
+            }
+        }
+    } else {
+        return None;
+    };
+
+    if num_str.is_empty() || !num_str.chars().all(|c| c.is_ascii_digit()) {
+        return None;
+    }
+
+    let n: i64 = num_str.parse().ok()?;
+    if n <= 0 {
+        return None;
+    }
+
+    // Ordinals 1-10: use dedicated words
+    if n <= 10 {
+        let ordinal = if feminine {
+            ORDINALS_F[n as usize]
+        } else {
+            ORDINALS_M[n as usize]
+        };
+        return Some(ordinal.to_string());
+    }
+
+    // Ordinals 11-20: use dedicated higher ordinal words
+    if n <= 20 {
+        let idx = (n - 11) as usize;
+        let ordinal = if feminine {
+            ORDINALS_HIGH_F[idx]
+        } else {
+            ORDINALS_HIGH_M[idx]
+        };
+        return Some(ordinal.to_string());
+    }
+
+    // For 21+, fall back to cardinal form (common in spoken Spanish)
+    Some(number_to_words(n))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_basic_masculine() {
+        assert_eq!(parse("1.o"), Some("primero".to_string()));
+        assert_eq!(parse("2.o"), Some("segundo".to_string()));
+        assert_eq!(parse("3.o"), Some("tercero".to_string()));
+        assert_eq!(parse("5.o"), Some("quinto".to_string()));
+        assert_eq!(parse("10.o"), Some("decimo".to_string()));
+    }
+
+    #[test]
+    fn test_basic_feminine() {
+        assert_eq!(parse("1.a"), Some("primera".to_string()));
+        assert_eq!(parse("2.a"), Some("segunda".to_string()));
+        assert_eq!(parse("3.a"), Some("tercera".to_string()));
+    }
+
+    #[test]
+    fn test_higher_ordinals() {
+        assert_eq!(parse("11.o"), Some("undecimo".to_string()));
+        assert_eq!(parse("12.o"), Some("duodecimo".to_string()));
+        assert_eq!(parse("20.o"), Some("vigesimo".to_string()));
+    }
+
+    #[test]
+    fn test_without_dot() {
+        assert_eq!(parse("1o"), Some("primero".to_string()));
+        assert_eq!(parse("1a"), Some("primera".to_string()));
+        assert_eq!(parse("3er"), Some("tercero".to_string()));
+    }
+
+    #[test]
+    fn test_fallback_cardinal() {
+        assert_eq!(parse("21.o"), Some("veintiuno".to_string()));
+    }
+
+    #[test]
+    fn test_non_ordinals() {
+        assert_eq!((parse("hello"), parse("está")), (None, None)); // "está" must not panic
+        assert_eq!(parse("0.o"), None);
+    }
+}
diff --git a/src/tts/es/telephone.rs b/src/tts/es/telephone.rs
new file mode 100644
index 0000000..f83d6cc
--- /dev/null
+++ b/src/tts/es/telephone.rs
@@ -0,0 +1,170 @@
+//! Telephone TN tagger for Spanish.
+//!
+//! Converts written phone numbers to spoken Spanish form:
+//! - "123-456-7890" → "uno dos tres, cuatro cinco seis, siete ocho nueve cero"
+//! - "+34-91-123-4567" → "mas tres cuatro, nueve uno, uno dos tres, cuatro cinco seis siete"
+//! - "(555) 123-4567" → "cinco cinco cinco, uno dos tres, cuatro cinco seis siete"
+
+/// Parse a written phone number to spoken Spanish form.
+pub fn parse(input: &str) -> Option<String> {
+    let trimmed = input.trim();
+    if trimmed.is_empty() {
+        return None;
+    }
+
+    // Phone numbers contain digits and separators (-, ., space, parens)
+    // Must have mostly digits
+    let digit_count = trimmed.chars().filter(|c| c.is_ascii_digit()).count();
+    let non_digit_non_sep = trimmed
+        .chars()
+        .filter(|c| {
+            !c.is_ascii_digit()
+                && *c != '-'
+                && *c != '.'
+                && *c != ' '
+                && *c != '('
+                && *c != ')'
+                && *c != '+'
+        })
+        .count();
+
+    // At least 7 digits, no unexpected characters, and '+' only as the very first character
+    if digit_count < 7 || non_digit_non_sep > 0 || trimmed.rfind('+').map_or(false, |i| i > 0) {
+        return None;
+    }
+
+    // Must contain at least one separator (-, ., space, parens) to distinguish
+    // from plain numbers like "1000000"
+    let has_separator = trimmed
+        .chars()
+        .any(|c| c == '-' || c == '.' || c == ' ' || c == '(' || c == ')');
+    if !has_separator {
+        return None;
+    }
+
+    let mut parts: Vec<String> = Vec::new();
+    let mut has_plus = false;
+
+    // Handle leading +
+    let rest = if let Some(r) = trimmed.strip_prefix('+') {
+        has_plus = true;
+        r.trim_start()
+    } else {
+        trimmed
+    };
+
+    // Split by common separators
+    let groups = split_phone_groups(rest);
+
+    if has_plus && !groups.is_empty() {
+        // The first group after + is the country code
+        let mut first = String::from("mas ");
+        first.push_str(&spell_digit_group(&groups[0]));
+        parts.push(first);
+        for g in &groups[1..] {
+            parts.push(spell_digit_group(g));
+        }
+    } else {
+        for g in &groups {
+            parts.push(spell_digit_group(g));
+        }
+    }
+
+    if parts.is_empty() {
+        return None;
+    }
+
+    Some(parts.join(", "))
+}
+
+/// Split a phone number into its runs of digits, breaking at each separator.
+fn split_phone_groups(input: &str) -> Vec<String> {
+    let mut groups: Vec<String> = Vec::new();
+    let mut current = String::new();
+
+    for c in input.chars() {
+        match c {
+            '0'..='9' => current.push(c),
+            '-' | '.' | ' ' | '(' | ')' => {
+                if !current.is_empty() {
+                    groups.push(current.clone());
+                    current.clear();
+                }
+            }
+            _ => {}
+        }
+    }
+
+    if !current.is_empty() {
+        groups.push(current);
+    }
+
+    groups
+}
+
+/// Spell each digit in a group using Spanish words.
+fn spell_digit_group(group: &str) -> String {
+    group
+        .chars()
+        .filter_map(|c| {
+            let word = match c {
+                '0' => "cero",
+                '1' => "uno",
+                '2' => "dos",
+                '3' => "tres",
+                '4' => "cuatro",
+                '5' => "cinco",
+                '6' => "seis",
+                '7' => "siete",
+                '8' => "ocho",
+                '9' => "nueve",
+                _ => return None,
+            };
+            Some(word)
+        })
+        .collect::<Vec<_>>()
+        .join(" ")
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_standard_phone() {
+        assert_eq!(
+            parse("123-456-7890"),
+            Some("uno dos tres, cuatro cinco seis, siete ocho nueve cero".to_string())
+        );
+    }
+
+    #[test]
+    fn test_with_country_code() {
+        assert_eq!(
+            parse("+34-91-123-4567"),
+            Some("mas tres cuatro, nueve uno, uno dos tres, cuatro cinco seis siete".to_string())
+        );
+    }
+
+    #[test]
+    fn test_parentheses() {
+        assert_eq!(
+            parse("(555) 123-4567"),
+            Some("cinco cinco cinco, uno dos tres, cuatro cinco seis siete".to_string())
+        );
+    }
+
+    #[test]
+    fn test_dots() {
+        assert_eq!(
+            parse("555.123.4567"),
+            Some("cinco cinco cinco, uno dos tres, cuatro cinco seis siete".to_string())
+        );
+    }
+
+    #[test]
+    fn test_non_phone() {
+        assert_eq!(parse("hola"), None);
+        assert_eq!(parse("123"), None); // too few digits
+    }
+}
diff --git a/src/tts/es/time.rs b/src/tts/es/time.rs
new file mode 100644
index 0000000..2925c7e
--- /dev/null
+++ b/src/tts/es/time.rs
@@ -0,0 +1,104 @@
+//! Time TN tagger for Spanish.
+//!
+//! Converts written time expressions to spoken Spanish:
+//! - "14:30" → "catorce treinta"
+//! - "2:00" → "dos en punto"
+//! - "0:00" → "medianoche"
+//! - "12:00" → "mediodia"
+
+use super::number_to_words;
+
+/// Parse a written time expression to spoken Spanish.
+pub fn parse(input: &str) -> Option<String> {
+    let trimmed = input.trim();
+
+    // Try "14:30" format
+    if let Some(result) = parse_colon_format(trimmed) {
+        return Some(result);
+    }
+
+    None
+}
+
+fn parse_colon_format(input: &str) -> Option<String> {
+    if !input.contains(':') {
+        return None;
+    }
+
+    let parts: Vec<&str> = input.splitn(2, ':').collect();
+    if parts.len() != 2 {
+        return None;
+    }
+
+    let hour_str = parts[0].trim();
+    let min_str = parts[1].trim();
+
+    if !hour_str.chars().all(|c| c.is_ascii_digit()) || hour_str.is_empty() {
+        return None;
+    }
+    if min_str.len() != 2 || !min_str.chars().all(|c| c.is_ascii_digit()) {
+        return None; // minutes are always written with two digits; "3:1" is not a time
+    }
+
+    let hour: u32 = hour_str.parse().ok()?;
+    let minute: u32 = min_str.parse().ok()?;
+
+    if hour > 23 || minute > 59 {
+        return None;
+    }
+
+    Some(format_time(hour, minute))
+}
+
+fn format_time(hour: u32, minute: u32) -> String {
+    // Midnight and noon have their own words
+    if hour == 0 && minute == 0 {
+        return "medianoche".to_string();
+    }
+    if hour == 12 && minute == 0 {
+        return "mediodia".to_string();
+    }
+
+    let hour_words = number_to_words(hour as i64);
+
+    match minute {
+        0 => format!("{} en punto", hour_words),
+        1..=9 => format!("{} cero {}", hour_words, number_to_words(minute as i64)),
+        _ => format!("{} {}", hour_words, number_to_words(minute as i64)),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_colon_format() {
+        assert_eq!(parse("14:30"), Some("catorce treinta".to_string()));
+        assert_eq!(parse("8:15"), Some("ocho quince".to_string()));
+        assert_eq!(parse("2:00"), Some("dos en punto".to_string()));
+    }
+
+    #[test]
+    fn test_special_hours() {
+        assert_eq!(parse("0:00"), Some("medianoche".to_string()));
+        assert_eq!(parse("12:00"), Some("mediodia".to_string()));
+        assert_eq!(parse("0:30"), Some("cero treinta".to_string()));
+    }
+
+    #[test]
+    fn test_24h() {
+        assert_eq!(parse("14:00"), Some("catorce en punto".to_string()));
+        assert_eq!(
+            parse("23:59"),
+            Some("veintitres cincuenta y nueve".to_string())
+        );
+    }
+
+    #[test]
+    fn test_invalid() {
+        assert_eq!((parse("hello"), parse("3:1")), (None, None));
+        assert_eq!(parse("25:00"), None);
+        assert_eq!(parse("12:60"), None);
+    }
+}
diff --git a/src/tts/es/whitelist.rs b/src/tts/es/whitelist.rs
new file mode 100644
index 0000000..df5e7d5
--- /dev/null
+++ b/src/tts/es/whitelist.rs
@@ -0,0 +1,118 @@
+//! Whitelist TN tagger for Spanish.
+//!
+//! Lookup table for common Spanish abbreviations and special terms:
+//! - "Dr." → "doctor"
+//! - "Sr." → "senor"
+//! - "Ud." → "usted"
+//! - "p.ej." → "por ejemplo"
+
+use lazy_static::lazy_static;
+use std::collections::HashMap;
+
+lazy_static! {
+    static ref WHITELIST: HashMap<&'static str, &'static str> = {
+        let mut m = HashMap::new();
+        // Titles
+        m.insert("Dr.", "doctor");
+        m.insert("Dr", "doctor");
+        m.insert("Dra.", "doctora");
+        m.insert("Dra", "doctora");
+        m.insert("Sr.", "senor");
+        m.insert("Sr", "senor");
+        m.insert("Sra.", "senora");
+        m.insert("Sra", "senora");
+        m.insert("Srta.", "senorita");
+        m.insert("Srta", "senorita");
+        m.insert("Prof.", "profesor");
+        m.insert("Prof", "profesor");
+        m.insert("Profa.", "profesora");
+        m.insert("Profa", "profesora");
+
+        // Formal pronouns
+        m.insert("Ud.", "usted");
+        m.insert("Ud", "usted");
+        m.insert("Uds.", "ustedes");
+        m.insert("Uds", "ustedes");
+
+        // Common abbreviations
+        m.insert("etc.", "etcetera");
+        m.insert("p.ej.", "por ejemplo");
+
+        // Address abbreviations
+        m.insert("Av.", "avenida");
+        m.insert("Av", "avenida");
+        m.insert("Blvd.", "bulevar");
+        m.insert("Blvd", "bulevar");
+        m.insert("Col.", "colonia");
+        m.insert("Col", "colonia");
+
+        // Organizational
+        m.insert("Dept.", "departamento");
+        m.insert("Dept", "departamento");
+        m.insert("No.", "numero");
+        // Bare "No" is deliberately not mapped: it is the ordinary word "no"
+        m.insert("Cia.", "compania");
+        m.insert("Cia", "compania");
+        m.insert("Ltda.", "limitada");
+        m.insert("Ltda", "limitada");
+        m.insert("S.A.", "sociedad anonima");
+
+        // Military / professional titles
+        m.insert("Gral.", "general");
+        m.insert("Gral", "general");
+        m.insert("Ing.", "ingeniero");
+        m.insert("Ing", "ingeniero");
+        m.insert("Lic.", "licenciado");
+        m.insert("Lic", "licenciado");
+        m.insert("Arq.", "arquitecto");
+        m.insert("Arq", "arquitecto");
+
+        m
+    };
+}
+
+/// Parse a whitelist abbreviation to its spoken Spanish form.
+pub fn parse(input: &str) -> Option<String> {
+    let trimmed = input.trim();
+
+    // Exact, case-sensitive lookup of the trimmed input
+    if let Some(&spoken) = WHITELIST.get(trimmed) {
+        return Some(spoken.to_string());
+    }
+
+    None
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_titles() {
+        assert_eq!(parse("Dr."), Some("doctor".to_string()));
+        assert_eq!(parse("Dra."), Some("doctora".to_string()));
+        assert_eq!(parse("Sr."), Some("senor".to_string()));
+        assert_eq!(parse("Sra."), Some("senora".to_string()));
+    }
+
+    #[test]
+    fn test_abbreviations() {
+        assert_eq!(parse("etc."), Some("etcetera".to_string()));
+        assert_eq!(parse("p.ej."), Some("por ejemplo".to_string()));
+        assert_eq!(parse("Ud."), Some("usted".to_string()));
+        assert_eq!(parse("S.A."), Some("sociedad anonima".to_string()));
+    }
+
+    #[test]
+    fn test_professional_titles() {
+        assert_eq!(parse("Ing."), Some("ingeniero".to_string()));
+        assert_eq!(parse("Lic."), Some("licenciado".to_string()));
+        assert_eq!(parse("Arq."), Some("arquitecto".to_string()));
+    }
+
+    #[test]
+    fn test_no_match() {
+        assert_eq!(parse("hola"), None);
+        assert_eq!(parse("mundo"), None);
+    }
+}
diff --git a/src/tts/fr/cardinal.rs b/src/tts/fr/cardinal.rs
new file mode 100644
index 0000000..fa2fb5a
--- /dev/null
+++ b/src/tts/fr/cardinal.rs
@@ -0,0 +1,80 @@
+//! Cardinal TN tagger for French.
+//!
+//! Converts written cardinal numbers to spoken French:
+//! - "123" → "cent vingt-trois"
+//! - "-42" → "moins quarante-deux"
+//! - "1 000" → "mille"
+
+use super::number_to_words;
+
+/// Parse a written cardinal number to spoken French words.
+pub fn parse(input: &str) -> Option<String> {
+    let trimmed = input.trim();
+    if trimmed.is_empty() {
+        return None;
+    }
+
+    let (is_negative, digits_part) = if let Some(rest) = trimmed.strip_prefix('-') {
+        (true, rest)
+    } else {
+        (false, trimmed)
+    };
+
+    // Digits plus thousands separators: comma, dot, space, NBSP, narrow NBSP (U+202F)
+    if !digits_part
+        .chars()
+        .all(|c| c.is_ascii_digit() || matches!(c, ',' | '.' | ' ' | '\u{a0}' | '\u{202f}'))
+    {
+        return None;
+    }
+
+    if !digits_part.chars().any(|c| c.is_ascii_digit()) {
+        return None;
+    }
+
+    // Strip thousands separators (spaces, dots, commas used as thousands sep)
+    // French uses space or dot as thousands separator
+    let clean: String = digits_part.chars().filter(|c| c.is_ascii_digit()).collect();
+    let n: i64 = clean.parse().ok()?;
+
+    if is_negative {
+        Some(format!("moins {}", number_to_words(n)))
+    } else {
+        Some(number_to_words(n))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_basic() {
+        assert_eq!(parse("0"), Some("zero".to_string()));
+        assert_eq!(parse("1"), Some("un".to_string()));
+        assert_eq!(parse("21"), Some("vingt et un".to_string()));
+        assert_eq!(parse("100"), Some("cent".to_string()));
+        assert_eq!(parse("123"), Some("cent vingt-trois".to_string()));
+    }
+
+    #[test]
+    fn test_thousands_separators() {
+        assert_eq!(parse("1 000"), Some("mille".to_string()));
+        assert_eq!(parse("1.000"), Some("mille".to_string()));
+        assert_eq!(parse("1,000"), Some("mille".to_string()));
+        assert_eq!(parse("1 000 000"), Some("un million".to_string()));
+    }
+
+    #[test]
+    fn test_negative() {
+        assert_eq!(parse("-42"), Some("moins quarante-deux".to_string()));
+        assert_eq!(parse("-1000"), Some("moins mille".to_string()));
+    }
+
+    #[test]
+    fn test_non_numbers() {
+        assert_eq!(parse("hello"), None);
+        assert_eq!(parse("12abc"), None);
+        assert_eq!(parse(""), None);
+    }
+}
diff --git a/src/tts/fr/date.rs b/src/tts/fr/date.rs
new file mode 100644
index 0000000..0cf2d64
--- /dev/null
+++ b/src/tts/fr/date.rs
@@ -0,0 +1,350 @@
+//! Date TN tagger for French.
+//!
+//! Converts written date expressions to spoken French:
+//! - "5 janvier 2025" → "cinq janvier deux mille vingt-cinq"
+//! - "January 5, 2025" → "cinq janvier deux mille vingt-cinq"
+//! - "05/01/2025" → "cinq janvier deux mille vingt-cinq" (DD/MM/YYYY)
+
+use super::number_to_words;
+
+const MONTHS_FR: &[(&str, &str)] = &[
+    ("janvier", "janvier"),
+    ("fevrier", "fevrier"),
+    ("février", "fevrier"),
+    ("mars", "mars"),
+    ("avril", "avril"),
+    ("mai", "mai"),
+    ("juin", "juin"),
+    ("juillet", "juillet"),
+    ("aout", "aout"),
+    ("août", "aout"),
+    ("septembre", "septembre"),
+    ("octobre", "octobre"),
+    ("novembre", "novembre"),
+    ("decembre", "decembre"),
+    ("décembre", "decembre"),
+];
+
+const MONTHS_EN: &[(&str, u32)] = &[
+    ("january", 1),
+    ("february", 2),
+    ("march", 3),
+    ("april", 4),
+    ("may", 5),
+    ("june", 6),
+    ("july", 7),
+    ("august", 8),
+    ("september", 9),
+    ("october", 10),
+    ("november", 11),
+    ("december", 12),
+];
+
+const MONTH_NAMES: &[&str] = &[
+    "",
+    "janvier",
+    "fevrier",
+    "mars",
+    "avril",
+    "mai",
+    "juin",
+    "juillet",
+    "aout",
+    "septembre",
+    "octobre",
+    "novembre",
+    "decembre",
+];
+
+/// Parse a written date to spoken French.
+pub fn parse(input: &str) -> Option<String> {
+    let trimmed = input.trim();
+
+    // Try decade: "1980s" → "les annees mille neuf cent quatre-vingts"
+    if let Some(result) = parse_decade(trimmed) {
+        return Some(result);
+    }
+
+    // Try French format: "5 janvier 2025"
+    if let Some(result) = parse_french_date(trimmed) {
+        return Some(result);
+    }
+
+    // Try English month format: "January 5, 2025"
+    if let Some(result) = parse_english_month_date(trimmed) {
+        return Some(result);
+    }
+
+    // Try numeric DD/MM/YYYY
+    if let Some(result) = parse_numeric_date(trimmed) {
+        return Some(result);
+    }
+
+    None
+}
+
+/// Parse decade: "1980s" → "les annees mille neuf cent quatre-vingts"
+fn parse_decade(input: &str) -> Option<String> {
+    let s = input.strip_suffix('s')?;
+    if s.len() != 4 || !s.chars().all(|c| c.is_ascii_digit()) {
+        return None;
+    }
+
+    let year: u32 = s.parse().ok()?;
+    if year < 1000 {
+        return None;
+    }
+
+    // Must be a round decade (ends in 0)
+    if year % 10 != 0 {
+        return None;
+    }
+
+    // French: "les annees" + the year number
+    let year_words = number_to_words(year as i64);
+    Some(format!("les annees {}", year_words))
+}
+
+fn parse_french_date(input: &str) -> Option<String> {
+    let lower = input.to_lowercase();
+    let tokens: Vec<&str> = lower.split_whitespace().collect();
+    if tokens.len() < 2 || tokens.len() > 3 {
+        return None;
+    }
+
+    // "5 janvier" or "5 janvier 2025" or "1er janvier 2025"
+    let day_str = tokens[0]
+        .trim_end_matches("er")
+        .trim_end_matches("eme")
+        .trim_end_matches('e');
+    if !day_str.chars().all(|c| c.is_ascii_digit()) || day_str.is_empty() {
+        return None;
+    }
+
+    let day: u32 = day_str.parse().ok()?;
+    if day == 0 || day > 31 {
+        return None;
+    }
+
+    // Find month
+    let month_name = MONTHS_FR.iter().find(|(name, _)| *name == tokens[1]);
+    let month_spoken = month_name?.1;
+
+    let day_word = if day == 1 {
+        "premier".to_string()
+    } else {
+        number_to_words(day as i64)
+    };
+
+    if tokens.len() >= 3 {
+        let year_str =
+            tokens[2].trim_end_matches(|c: char| c == '.' || c == ',' || c == '!' || c == '?');
+        if year_str.chars().all(|c| c.is_ascii_digit()) && year_str.len() == 4 {
+            let year: u32 = year_str.parse().ok()?;
+            let year_words = verbalize_year(year)?;
+            return Some(format!("{} {} {}", day_word, month_spoken, year_words));
+        }
+    }
+
+    (tokens.len() == 2).then(|| format!("{} {}", day_word, month_spoken))
+}
+
+fn parse_english_month_date(input: &str) -> Option<String> {
+    let lower = input.to_lowercase();
+
+    let mut month_num = None;
+    let mut rest = "";
+    for &(name, num) in MONTHS_EN {
+        if let Some(r) = lower.strip_prefix(name) {
+            if r.is_empty() || r.starts_with(' ') || r.starts_with(',') {
+                month_num = Some(num);
+                rest = r.trim_start_matches(|c: char| c == ' ' || c == ',');
+                break;
+            }
+        }
+    }
+
+    let month_num = month_num?;
+    if rest.is_empty() {
+        return None;
+    }
+
+    let month_name = MONTH_NAMES[month_num as usize];
+
+    // Parse day
+    let (day_str, year_part) = if let Some(comma_pos) = rest.find(',') {
+        (&rest[..comma_pos], Some(rest[comma_pos + 1..].trim()))
+    } else {
+        let parts: Vec<&str> = rest.splitn(2, ' ').collect();
+        if parts.len() == 2
+            && parts[0]
+                .trim_end_matches("st")
+                .trim_end_matches("nd")
+                .trim_end_matches("rd")
+                .trim_end_matches("th")
+                .chars()
+                .all(|c| c.is_ascii_digit())
+        {
+            let year_clean =
+                parts[1].trim_end_matches(|c: char| c == '.' || c == ',' || c == '!' || c == '?');
+            if year_clean.chars().all(|c| c.is_ascii_digit()) && year_clean.len() == 4 {
+                (parts[0], Some(year_clean))
+            } else {
+                (rest, None)
+            }
+        } else {
+            (rest, None)
+        }
+    };
+
+    let day_digits = day_str
+        .trim()
+        .trim_end_matches("st")
+        .trim_end_matches("nd")
+        .trim_end_matches("rd")
+        .trim_end_matches("th");
+
+    if !day_digits.chars().all(|c| c.is_ascii_digit()) || day_digits.is_empty() {
+        return None;
+    }
+
+    let day: u32 = day_digits.parse().ok()?;
+    if day == 0 || day > 31 {
+        return None;
+    }
+
+    let day_word = if day == 1 {
+        "premier".to_string()
+    } else {
+        number_to_words(day as i64)
+    };
+
+    if let Some(year_str) = year_part {
+        let year_str = year_str
+            .trim()
+            .trim_end_matches(|c: char| c == '.' || c == ',' || c == '!' || c == '?');
+        if !year_str.is_empty() && year_str.chars().all(|c| c.is_ascii_digit()) {
+            let year: u32 = year_str.parse().ok()?;
+            let year_words = verbalize_year(year)?;
+            return Some(format!("{} {} {}", day_word, month_name, year_words));
+        }
+    }
+
+    year_part.map_or(true, str::is_empty).then(|| format!("{} {}", day_word, month_name))
+}
+
+/// Parse numeric date DD/MM/YYYY (French convention).
+fn parse_numeric_date(input: &str) -> Option<String> {
+    let sep = if input.contains('/') {
+        '/'
+    } else if input.contains('-') && input.chars().filter(|c| *c == '-').count() == 2 {
+        '-'
+    } else {
+        return None;
+    };
+
+    let parts: Vec<&str> = input.splitn(3, sep).collect();
+    if parts.len() != 3 {
+        return None;
+    }
+
+    if !parts
+        .iter()
+        .all(|p| !p.is_empty() && p.chars().all(|c| c.is_ascii_digit()))
+    {
+        return None;
+    }
+
+    let day: u32 = parts[0].parse().ok()?;
+    let month_num: u32 = parts[1].parse().ok()?;
+    let year: u32 = parts[2].parse().ok()?;
+
+    if month_num == 0 || month_num > 12 || day == 0 || day > 31 {
+        return None;
+    }
+
+    let month_name = MONTH_NAMES[month_num as usize];
+    let day_word = if day == 1 {
+        "premier".to_string()
+    } else {
+        number_to_words(day as i64)
+    };
+    let year_words = verbalize_year(year)?;
+
+    Some(format!("{} {} {}", day_word, month_name, year_words))
+}
+
+/// Verbalize a year in French.
+/// - 2025 → "deux mille vingt-cinq"
+/// - 2000 → "deux mille"
+/// - 1990 → "mille neuf cent quatre-vingt-dix"
+fn verbalize_year(year: u32) -> Option<String> {
+    if year == 0 {
+        return Some("zero".to_string());
+    }
+    // French typically says the full number for years
+    Some(number_to_words(year as i64))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_french_date() {
+        assert_eq!(
+            parse("5 janvier 2025"),
+            Some("cinq janvier deux mille vingt-cinq".to_string())
+        );
+        assert_eq!(parse("1er janvier"), Some("premier janvier".to_string()));
+    }
+
+    #[test]
+    fn test_english_month() {
+        assert_eq!(
+            parse("January 5, 2025"),
+            Some("cinq janvier deux mille vingt-cinq".to_string())
+        );
+    }
+
+    #[test]
+    fn test_numeric_date() {
+        assert_eq!(
+            parse("05/01/2025"),
+            Some("cinq janvier deux mille vingt-cinq".to_string())
+        );
+    }
+
+    #[test]
+    fn test_decade() {
+        assert_eq!(
+            parse("1980s"),
+            Some("les annees mille neuf cent quatre-vingts".to_string())
+        );
+        assert_eq!(parse("2000s"), Some("les annees deux mille".to_string()));
+        assert_eq!(
+            parse("1990s"),
+            Some("les annees mille neuf cent quatre-vingt-dix".to_string())
+        );
+    }
+
+    #[test]
+    fn test_year_verbalization() {
+        assert_eq!(
+            verbalize_year(2025),
+            Some("deux mille vingt-cinq".to_string())
+        );
+        assert_eq!(verbalize_year(2000), Some("deux mille".to_string()));
+        assert_eq!(
+            verbalize_year(1990),
+            Some("mille neuf cent quatre-vingt-dix".to_string())
+        );
+        assert_eq!(verbalize_year(1900), Some("mille neuf cent".to_string()));
+    }
+
+    #[test]
+    fn test_invalid() {
+        assert_eq!(parse("hello"), None);
+        assert_eq!(parse("123"), None);
+    }
+}
diff --git a/src/tts/fr/decimal.rs b/src/tts/fr/decimal.rs
new file mode 100644
index 0000000..02b904f
--- /dev/null
+++ b/src/tts/fr/decimal.rs
@@ -0,0 +1,138 @@
+//! Decimal TN tagger for French.
+//!
+//! Converts written decimal numbers to spoken French:
+//! - "3,14" → "trois virgule un quatre"
+//! - "3.14" → "trois virgule un quatre"
+//! - "0,5" → "zero virgule cinq"
+//! - "1,5 milliard" → "un virgule cinq milliard"
+
+use super::{number_to_words, spell_digits};
+
+/// French quantity suffixes recognized after a decimal number.
+const QUANTITY_SUFFIXES: &[&str] = &[
+    "billiard",
+    "billion",
+    "milliard",
+    "milliards",
+    "million",
+    "millions",
+    "mille",
+];
+
+/// Parse a written decimal number to spoken French.
+pub fn parse(input: &str) -> Option<String> {
+    let trimmed = input.trim();
+    if trimmed.is_empty() {
+        return None;
+    }
+
+    // Check for quantity suffix: "1,5 milliard"
+    let (number_part, suffix) = extract_suffix(trimmed);
+
+    // French uses comma as decimal separator, but also accept period
+    let sep = if number_part.contains(',') && !number_part.contains('.') {
+        ','
+    } else if number_part.contains('.') {
+        '.'
+    } else {
+        return None;
+    };
+
+    let parts: Vec<&str> = number_part.splitn(2, sep).collect();
+    if parts.len() != 2 {
+        return None;
+    }
+
+    let int_str = parts[0];
+    let frac_str = parts[1];
+
+    let (is_negative, int_digits) = if let Some(rest) = int_str.strip_prefix('-') {
+        (true, rest)
+    } else {
+        (false, int_str)
+    };
+
+    if !int_digits.chars().all(|c| c.is_ascii_digit()) {
+        return None;
+    }
+    if frac_str.is_empty() || !frac_str.chars().all(|c| c.is_ascii_digit()) {
+        return None;
+    }
+
+    let int_val: i64 = if int_digits.is_empty() {
+        0
+    } else {
+        int_digits.parse().ok()?
+    };
+
+    let int_words = number_to_words(int_val);
+    let frac_words = spell_digits(frac_str);
+
+    let mut result = if is_negative {
+        format!("moins {} virgule {}", int_words, frac_words)
+    } else {
+        format!("{} virgule {}", int_words, frac_words)
+    };
+
+    if let Some(suf) = suffix {
+        result.push(' ');
+        result.push_str(suf);
+    }
+
+    Some(result)
+}
+
+/// Extract a quantity suffix from the end if present.
+fn extract_suffix(input: &str) -> (&str, Option<&str>) { + for &suf in QUANTITY_SUFFIXES { + if let Some(before) = input.strip_suffix(suf) { + let before = before.trim_end(); + if !before.is_empty() { + return (before, Some(suf)); + } + } + } + (input, None) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_comma_decimal() { + assert_eq!(parse("3,14"), Some("trois virgule un quatre".to_string())); + assert_eq!(parse("0,5"), Some("zero virgule cinq".to_string())); + } + + #[test] + fn test_period_decimal() { + assert_eq!(parse("3.14"), Some("trois virgule un quatre".to_string())); + } + + #[test] + fn test_negative() { + assert_eq!( + parse("-3,14"), + Some("moins trois virgule un quatre".to_string()) + ); + } + + #[test] + fn test_with_quantity() { + assert_eq!( + parse("1,5 milliard"), + Some("un virgule cinq milliard".to_string()) + ); + assert_eq!( + parse("4,85 millions"), + Some("quatre virgule huit cinq millions".to_string()) + ); + } + + #[test] + fn test_non_decimal() { + assert_eq!(parse("123"), None); + assert_eq!(parse("hello"), None); + } +} diff --git a/src/tts/fr/electronic.rs b/src/tts/fr/electronic.rs new file mode 100644 index 0000000..3515546 --- /dev/null +++ b/src/tts/fr/electronic.rs @@ -0,0 +1,166 @@ +//! Electronic TN tagger for French. +//! +//! Converts written emails and URLs to spoken French form: +//! - "test@gmail.com" → "t e s t arobase g m a i l point c o m" +//! - "http://www.example.com" → "h t t p deux-points barre oblique barre oblique w w w point e x a m p l e point c o m" + +/// Parse an email or URL to spoken French form. +pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + if trimmed.is_empty() { + return None; + } + + // Email detection: contains @ with text on both sides + if trimmed.contains('@') { + return parse_email(trimmed); + } + + // URL detection: starts with http://, https://, or www. 
+ let lower = trimmed.to_lowercase(); + if lower.starts_with("http://") || lower.starts_with("https://") || lower.starts_with("www.") { + return parse_url(trimmed); + } + + None +} + +/// Parse an email address to spoken French form. +fn parse_email(input: &str) -> Option { + let parts: Vec<&str> = input.splitn(2, '@').collect(); + if parts.len() != 2 || parts[0].is_empty() || parts[1].is_empty() { + return None; + } + + let local = spell_domain(parts[0]); + let domain = spell_domain(parts[1]); + + Some(format!("{} arobase {}", local, domain)) +} + +/// Parse a URL to spoken French form. +fn parse_url(input: &str) -> Option { + let mut result = String::new(); + let lower = input.to_lowercase(); + + let rest = if lower.starts_with("https://") { + result.push_str("h t t p s deux-points barre oblique barre oblique"); + &input["https://".len()..] + } else if lower.starts_with("http://") { + result.push_str("h t t p deux-points barre oblique barre oblique"); + &input["http://".len()..] + } else { + input + }; + + if !result.is_empty() && !rest.is_empty() { + result.push(' '); + } + + result.push_str(&spell_domain(rest)); + + Some(result) +} + +/// Spell out a domain name, using "point" for periods. +fn spell_domain(domain: &str) -> String { + let parts: Vec<&str> = domain.split('.').collect(); + let spelled: Vec = parts.iter().map(|p| spell_electronic(p)).collect(); + spelled.join(" point ") +} + +/// Spell out an electronic string in French. +/// +/// Letters are spelled individually with spaces. +/// Digit runs are spelled individually using French words. +/// Special characters use French connector words. 
+fn spell_electronic(s: &str) -> String { + let mut parts: Vec = Vec::new(); + + for c in s.chars() { + match c { + '-' => parts.push("tiret".to_string()), + '_' => parts.push("tiret bas".to_string()), + '/' => parts.push("barre oblique".to_string()), + '~' => parts.push("tilde".to_string()), + ':' => parts.push("deux-points".to_string()), + c if c.is_ascii_alphabetic() => { + parts.push(c.to_lowercase().to_string()); + } + c if c.is_ascii_digit() => { + parts.push(digit_word_fr(c)); + } + _ => { + // Skip unknown characters + } + } + } + + parts.join(" ") +} + +fn digit_word_fr(c: char) -> String { + match c { + '0' => "zero", + '1' => "un", + '2' => "deux", + '3' => "trois", + '4' => "quatre", + '5' => "cinq", + '6' => "six", + '7' => "sept", + '8' => "huit", + '9' => "neuf", + _ => "", + } + .to_string() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_email() { + assert_eq!( + parse("test@gmail.com"), + Some("t e s t arobase g m a i l point c o m".to_string()) + ); + assert_eq!( + parse("jean.dupont@example.fr"), + Some("j e a n point d u p o n t arobase e x a m p l e point f r".to_string()) + ); + } + + #[test] + fn test_url_http() { + assert_eq!( + parse("http://www.example.com"), + Some( + "h t t p deux-points barre oblique barre oblique w w w point e x a m p l e point c o m" + .to_string() + ) + ); + assert_eq!( + parse("https://google.fr"), + Some( + "h t t p s deux-points barre oblique barre oblique g o o g l e point f r" + .to_string() + ) + ); + } + + #[test] + fn test_www_url() { + assert_eq!( + parse("www.exemple.fr"), + Some("w w w point e x e m p l e point f r".to_string()) + ); + } + + #[test] + fn test_non_electronic() { + assert_eq!(parse("bonjour"), None); + assert_eq!(parse("123"), None); + } +} diff --git a/src/tts/fr/measure.rs b/src/tts/fr/measure.rs new file mode 100644 index 0000000..632f581 --- /dev/null +++ b/src/tts/fr/measure.rs @@ -0,0 +1,231 @@ +//! Measure TN tagger for French. +//! +//! 
Converts written measurements to spoken French: +//! - "200 km/h" → "deux cents kilometres par heure" +//! - "1 kg" → "un kilogramme" +//! - "72°C" → "soixante-douze degres celsius" + +use super::number_to_words; + +use lazy_static::lazy_static; +use std::collections::HashMap; + +struct UnitInfo { + singular: &'static str, + plural: &'static str, +} + +lazy_static! { + static ref UNITS: HashMap<&'static str, UnitInfo> = { + let mut m = HashMap::new(); + + // Length + m.insert("mm", UnitInfo { singular: "millimetre", plural: "millimetres" }); + m.insert("cm", UnitInfo { singular: "centimetre", plural: "centimetres" }); + m.insert("m", UnitInfo { singular: "metre", plural: "metres" }); + m.insert("km", UnitInfo { singular: "kilometre", plural: "kilometres" }); + m.insert("in", UnitInfo { singular: "pouce", plural: "pouces" }); + m.insert("ft", UnitInfo { singular: "pied", plural: "pieds" }); + m.insert("mi", UnitInfo { singular: "mile", plural: "miles" }); + + // Weight + m.insert("mg", UnitInfo { singular: "milligramme", plural: "milligrammes" }); + m.insert("g", UnitInfo { singular: "gramme", plural: "grammes" }); + m.insert("kg", UnitInfo { singular: "kilogramme", plural: "kilogrammes" }); + m.insert("lb", UnitInfo { singular: "livre", plural: "livres" }); + m.insert("oz", UnitInfo { singular: "once", plural: "onces" }); + m.insert("t", UnitInfo { singular: "tonne", plural: "tonnes" }); + + // Volume + m.insert("ml", UnitInfo { singular: "millilitre", plural: "millilitres" }); + m.insert("l", UnitInfo { singular: "litre", plural: "litres" }); + m.insert("L", UnitInfo { singular: "litre", plural: "litres" }); + + // Speed + m.insert("km/h", UnitInfo { singular: "kilometre par heure", plural: "kilometres par heure" }); + m.insert("mph", UnitInfo { singular: "mile par heure", plural: "miles par heure" }); + m.insert("m/s", UnitInfo { singular: "metre par seconde", plural: "metres par seconde" }); + + // Time + m.insert("s", UnitInfo { singular: "seconde", plural: 
"secondes" }); + m.insert("sec", UnitInfo { singular: "seconde", plural: "secondes" }); + m.insert("min", UnitInfo { singular: "minute", plural: "minutes" }); + m.insert("h", UnitInfo { singular: "heure", plural: "heures" }); + m.insert("hr", UnitInfo { singular: "heure", plural: "heures" }); + + // Temperature + m.insert("°C", UnitInfo { singular: "degre celsius", plural: "degres celsius" }); + m.insert("°F", UnitInfo { singular: "degre fahrenheit", plural: "degres fahrenheit" }); + + // Data + m.insert("KB", UnitInfo { singular: "kilooctet", plural: "kilooctets" }); + m.insert("MB", UnitInfo { singular: "megaoctet", plural: "megaoctets" }); + m.insert("GB", UnitInfo { singular: "gigaoctet", plural: "gigaoctets" }); + m.insert("TB", UnitInfo { singular: "teraoctet", plural: "teraoctets" }); + + // Percentage + m.insert("%", UnitInfo { singular: "pour cent", plural: "pour cent" }); + + // Frequency + m.insert("Hz", UnitInfo { singular: "hertz", plural: "hertz" }); + m.insert("kHz", UnitInfo { singular: "kilohertz", plural: "kilohertz" }); + m.insert("MHz", UnitInfo { singular: "megahertz", plural: "megahertz" }); + m.insert("GHz", UnitInfo { singular: "gigahertz", plural: "gigahertz" }); + + m + }; +} + +/// Parse a written measurement to spoken French. 
+pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + if trimmed.is_empty() { + return None; + } + + let mut unit_matches: Vec<(&str, &UnitInfo)> = UNITS + .iter() + .filter(|(unit, _)| { + trimmed.ends_with(*unit) + && (trimmed.len() == unit.len() || { + let before = &trimmed[..trimmed.len() - unit.len()]; + if unit.len() == 1 && unit.chars().all(|c| c.is_ascii_alphabetic()) { + before.ends_with(' ') + } else { + before.ends_with(' ') || before.ends_with(|c: char| c.is_ascii_digit()) + } + }) + }) + .map(|(k, v)| (*k, v)) + .collect(); + + unit_matches.sort_by(|a, b| b.0.len().cmp(&a.0.len())); + + for (unit_str, unit_info) in unit_matches { + let num_part = trimmed[..trimmed.len() - unit_str.len()].trim(); + if num_part.is_empty() { + continue; + } + + let (is_negative, digits) = if let Some(rest) = num_part.strip_prefix('-') { + (true, rest.trim()) + } else { + (false, num_part) + }; + + let clean: String = digits + .chars() + .filter(|c| c.is_ascii_digit() || *c == '.' || *c == ',') + .collect(); + + if clean.is_empty() + || !clean + .chars() + .all(|c| c.is_ascii_digit() || c == '.' || c == ',') + { + continue; + } + + // Handle decimals + let decimal_sep = if clean.contains(',') { ',' } else { '.' 
}; + if clean.contains(decimal_sep) { + let parts: Vec<&str> = clean.splitn(2, decimal_sep).collect(); + if parts.len() == 2 { + let int_val: i64 = if parts[0].is_empty() { + 0 + } else { + let Ok(v) = parts[0].parse::() else { + continue; + }; + v + }; + let int_words = number_to_words(int_val); + let frac_words = super::spell_digits(parts[1]); + let unit_word = unit_info.plural; + let num_words = if is_negative { + format!("moins {} virgule {}", int_words, frac_words) + } else { + format!("{} virgule {}", int_words, frac_words) + }; + return Some(format!("{} {}", num_words, unit_word)); + } + continue; + } + + let Ok(n) = clean.parse::() else { + continue; + }; + let num_words = if is_negative { + format!("moins {}", number_to_words(n)) + } else { + number_to_words(n) + }; + + let abs_n = n.unsigned_abs(); + let unit_word = if abs_n == 1 { + unit_info.singular + } else { + unit_info.plural + }; + + return Some(format!("{} {}", num_words, unit_word)); + } + + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + assert_eq!( + parse("200 km/h"), + Some("deux cents kilometres par heure".to_string()) + ); + assert_eq!(parse("1 kg"), Some("un kilogramme".to_string())); + assert_eq!(parse("2 kg"), Some("deux kilogrammes".to_string())); + } + + #[test] + fn test_temperature() { + assert_eq!( + parse("72°C"), + Some("soixante-douze degres celsius".to_string()) + ); + } + + #[test] + fn test_percentage() { + assert_eq!(parse("50%"), Some("cinquante pour cent".to_string())); + } + + #[test] + fn test_negative() { + assert_eq!( + parse("-66 kg"), + Some("moins soixante-six kilogrammes".to_string()) + ); + } + + #[test] + fn test_data() { + assert_eq!(parse("500 MB"), Some("cinq cents megaoctets".to_string())); + assert_eq!(parse("1 GB"), Some("un gigaoctet".to_string())); + } + + #[test] + fn test_decimal_with_empty_integer() { + assert_eq!( + parse(".5 kg"), + Some("zero virgule cinq kilogrammes".to_string()) + ); + } + + #[test] + fn 
test_non_measure() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("123"), None); + } +} diff --git a/src/tts/fr/mod.rs b/src/tts/fr/mod.rs new file mode 100644 index 0000000..83a5751 --- /dev/null +++ b/src/tts/fr/mod.rs @@ -0,0 +1,226 @@ +//! Text Normalization taggers for French. +//! +//! Converts written-form text to spoken French: +//! - "200" → "deux cents" +//! - "5,50 €" → "cinq euros et cinquante centimes" +//! - "5 janvier 2025" → "cinq janvier deux mille vingt-cinq" + +pub mod cardinal; +pub mod date; +pub mod decimal; +pub mod electronic; +pub mod measure; +pub mod money; +pub mod ordinal; +pub mod telephone; +pub mod time; +pub mod whitelist; + +/// Ones words indexed by value (0..20). +const ONES: [&str; 20] = [ + "zero", "un", "deux", "trois", "quatre", "cinq", "six", "sept", "huit", "neuf", "dix", "onze", + "douze", "treize", "quatorze", "quinze", "seize", "dix-sept", "dix-huit", "dix-neuf", +]; + +/// Tens words indexed by tens digit (2..7 → index 0..5). +/// French uses special forms for 70, 80, 90. +const TENS: [&str; 5] = ["vingt", "trente", "quarante", "cinquante", "soixante"]; + +/// Convert an integer to French words. 
+/// +/// Examples: +/// - `0` → `"zero"` +/// - `21` → `"vingt et un"` +/// - `71` → `"soixante et onze"` +/// - `80` → `"quatre-vingts"` +/// - `91` → `"quatre-vingt-onze"` +/// - `200` → `"deux cents"` +/// - `-42` → `"moins quarante-deux"` +pub fn number_to_words(n: i64) -> String { + if n == 0 { + return "zero".to_string(); + } + + if n < 0 { + let abs_val = (n as u64).wrapping_neg(); + return format!("moins {}", unsigned_to_words(abs_val)); + } + + unsigned_to_words(n as u64) +} + +fn unsigned_to_words(n: u64) -> String { + if n == 0 { + return "zero".to_string(); + } + + let mut parts: Vec = Vec::new(); + let mut remaining = n; + + let scales: &[(u64, &str)] = &[ + (1_000_000_000_000_000_000, "trillion"), + (1_000_000_000_000_000, "billiard"), + (1_000_000_000_000, "billion"), + (1_000_000_000, "milliard"), + (1_000_000, "million"), + (1_000, "mille"), + ]; + + for &(scale_value, scale_name) in scales { + if remaining >= scale_value { + let chunk = remaining / scale_value; + remaining %= scale_value; + + if scale_name == "mille" { + if chunk == 1 { + parts.push("mille".to_string()); + } else { + parts.push(format!("{} mille", chunk_to_words(chunk as u32, false))); + } + } else { + let chunk_words = chunk_to_words(chunk as u32, false); + if chunk == 1 { + parts.push(format!("un {}", scale_name)); + } else { + parts.push(format!("{} {}s", chunk_words, scale_name)); + } + } + } + } + + if remaining > 0 { + parts.push(chunk_to_words( + remaining as u32, + remaining < 1000 && parts.is_empty(), + )); + } + + parts.join(" ") +} + +/// Convert a number 1..999 to French words. +/// `standalone_cents`: if true and value is exact hundreds, add 's' to "cent" (deux cents). 
+fn chunk_to_words(n: u32, standalone_cents: bool) -> String { + debug_assert!(n > 0 && n < 1000); + let hundreds = n / 100; + let rest = n % 100; + + let mut result = String::new(); + + if hundreds > 0 { + if hundreds == 1 { + result.push_str("cent"); + } else { + result.push_str(ONES[hundreds as usize]); + if rest == 0 && standalone_cents { + result.push_str(" cents"); + } else { + result.push_str(" cent"); + } + } + } + + if rest > 0 { + if !result.is_empty() { + result.push(' '); + } + result.push_str(&two_digit_to_words(rest)); + } + + result +} + +/// Convert 1..99 to French words. +fn two_digit_to_words(n: u32) -> String { + debug_assert!(n > 0 && n < 100); + + if n < 20 { + return ONES[n as usize].to_string(); + } + + if n < 70 { + let tens_idx = (n / 10 - 2) as usize; + let ones = n % 10; + if ones == 0 { + TENS[tens_idx].to_string() + } else if ones == 1 { + format!("{} et un", TENS[tens_idx]) + } else { + format!("{}-{}", TENS[tens_idx], ONES[ones as usize]) + } + } else if n < 80 { + // 70-79: soixante-dix, soixante et onze, soixante-douze... + let ones = n - 60; + if ones == 10 { + "soixante-dix".to_string() + } else if ones == 11 { + "soixante et onze".to_string() + } else { + format!("soixante-{}", ONES[ones as usize]) + } + } else if n == 80 { + "quatre-vingts".to_string() + } else { + // 81-99: quatre-vingt-un, quatre-vingt-deux... quatre-vingt-dix, quatre-vingt-onze... + let ones = n - 80; + format!("quatre-vingt-{}", ONES[ones as usize]) + } +} + +/// Spell each digit of a string individually in French. 
+pub fn spell_digits(s: &str) -> String { + s.chars() + .filter_map(|c| c.to_digit(10).map(|d| ONES[d as usize])) + .collect::>() + .join(" ") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + assert_eq!(number_to_words(0), "zero"); + assert_eq!(number_to_words(1), "un"); + assert_eq!(number_to_words(10), "dix"); + assert_eq!(number_to_words(16), "seize"); + assert_eq!(number_to_words(17), "dix-sept"); + assert_eq!(number_to_words(20), "vingt"); + assert_eq!(number_to_words(21), "vingt et un"); + assert_eq!(number_to_words(22), "vingt-deux"); + } + + #[test] + fn test_french_special() { + assert_eq!(number_to_words(70), "soixante-dix"); + assert_eq!(number_to_words(71), "soixante et onze"); + assert_eq!(number_to_words(72), "soixante-douze"); + assert_eq!(number_to_words(79), "soixante-dix-neuf"); + assert_eq!(number_to_words(80), "quatre-vingts"); + assert_eq!(number_to_words(81), "quatre-vingt-un"); + assert_eq!(number_to_words(90), "quatre-vingt-dix"); + assert_eq!(number_to_words(91), "quatre-vingt-onze"); + assert_eq!(number_to_words(99), "quatre-vingt-dix-neuf"); + } + + #[test] + fn test_hundreds() { + assert_eq!(number_to_words(100), "cent"); + assert_eq!(number_to_words(200), "deux cents"); + assert_eq!(number_to_words(201), "deux cent un"); + assert_eq!(number_to_words(999), "neuf cent quatre-vingt-dix-neuf"); + } + + #[test] + fn test_thousands() { + assert_eq!(number_to_words(1000), "mille"); + assert_eq!(number_to_words(2000), "deux mille"); + assert_eq!(number_to_words(2025), "deux mille vingt-cinq"); + } + + #[test] + fn test_negative() { + assert_eq!(number_to_words(-42), "moins quarante-deux"); + } +} diff --git a/src/tts/fr/money.rs b/src/tts/fr/money.rs new file mode 100644 index 0000000..13b2ec8 --- /dev/null +++ b/src/tts/fr/money.rs @@ -0,0 +1,315 @@ +//! Money TN tagger for French. +//! +//! Converts written currency expressions to spoken French: +//! - "5,50 €" → "cinq euros et cinquante centimes" +//! 
- "€5.50" → "cinq euros et cinquante centimes" +//! - "$100" → "cent dollars" +//! - "£1" → "une livre" + +use super::number_to_words; + +struct Currency { + singular: &'static str, + plural: &'static str, + cent_singular: &'static str, + cent_plural: &'static str, + /// Whether "un" becomes "une" for this currency + feminine: bool, +} + +const EURO: Currency = Currency { + singular: "euro", + plural: "euros", + cent_singular: "centime", + cent_plural: "centimes", + feminine: false, +}; + +const DOLLAR: Currency = Currency { + singular: "dollar", + plural: "dollars", + cent_singular: "cent", + cent_plural: "cents", + feminine: false, +}; + +const POUND: Currency = Currency { + singular: "livre", + plural: "livres", + cent_singular: "penny", + cent_plural: "pence", + feminine: true, +}; + +const YEN: Currency = Currency { + singular: "yen", + plural: "yens", + cent_singular: "sen", + cent_plural: "sen", + feminine: false, +}; + +/// Scale suffixes recognized after a currency amount. +const SCALE_SUFFIXES: &[&str] = &[ + "billiard", + "billion", + "milliards", + "milliard", + "millions", + "million", + "mille", +]; + +/// Parse a written money expression to spoken French. 
+pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + if trimmed.is_empty() { + return None; + } + + // Try suffix symbol: "5,50 €", "100 €" + if let Some(result) = parse_suffix_currency(trimmed) { + return Some(result); + } + + // Try prefix symbol: "$5.50", "€100", "£1" + if let Some(result) = parse_prefix_currency(trimmed) { + return Some(result); + } + + None +} + +fn parse_suffix_currency(input: &str) -> Option { + let (amount_str, currency) = if let Some(s) = input.strip_suffix('€') { + (s.trim(), &EURO) + } else if let Some(s) = input.strip_suffix("EUR") { + (s.trim(), &EURO) + } else { + return None; + }; + + parse_amount(amount_str, currency) +} + +fn parse_prefix_currency(input: &str) -> Option { + let (currency, rest) = if let Some(r) = input.strip_prefix('$') { + (&DOLLAR, r) + } else if let Some(r) = input.strip_prefix('€') { + (&EURO, r) + } else if let Some(r) = input.strip_prefix('£') { + (&POUND, r) + } else if let Some(r) = input.strip_prefix('¥') { + (&YEN, r) + } else { + return None; + }; + + let rest = rest.trim(); + if rest.is_empty() { + return None; + } + + // Check for scale suffix: "$2,5 milliards" + let (amount_str, scale) = extract_scale(rest); + + // Without a scale suffix, the amount must be purely numeric + if scale.is_none() + && !amount_str + .chars() + .all(|c| c.is_ascii_digit() || c == '.' || c == ',' || c == ' ') + { + return None; + } + + if let Some(scale_word) = scale { + // With scale: "$2,5 milliards" → "deux virgule cinq milliards de dollars" + let decimal_sep = if amount_str.contains(',') { + ',' + } else if amount_str.contains('.') { + '.' 
+ } else { + // No decimal: "$50 millions" → "cinquante millions de dollars" + let clean: String = amount_str.chars().filter(|c| c.is_ascii_digit()).collect(); + let n: i64 = clean.parse().ok()?; + let words = number_to_words(n); + return Some(format!("{} {} de {}", words, scale_word, currency.plural)); + }; + + let parts: Vec<&str> = amount_str.splitn(2, decimal_sep).collect(); + if parts.len() == 2 { + let int_val: i64 = parts[0].parse().ok()?; + let int_words = number_to_words(int_val); + let frac_words = super::spell_digits(parts[1]); + return Some(format!( + "{} virgule {} {} de {}", + int_words, frac_words, scale_word, currency.plural + )); + } + } + + parse_amount(amount_str, currency) +} + +/// Extract scale suffix from the amount string. +fn extract_scale(input: &str) -> (&str, Option<&str>) { + for &scale in SCALE_SUFFIXES { + if let Some(before) = input.strip_suffix(scale) { + let before = before.trim_end(); + if !before.is_empty() + && before + .chars() + .all(|c| c.is_ascii_digit() || c == '.' || c == ',' || c == ' ') + { + return (before, Some(scale)); + } + } + } + (input, None) +} + +fn parse_amount(amount_str: &str, currency: &Currency) -> Option { + if amount_str.is_empty() { + return None; + } + + // Determine decimal separator: French uses comma + let sep = if amount_str.contains(',') { ',' } else { '.' }; + + if amount_str.contains(sep) && sep != '.' || amount_str.contains('.') { + let actual_sep = if amount_str.contains(',') { ',' } else { '.' }; + let parts: Vec<&str> = amount_str.splitn(2, actual_sep).collect(); + if parts.len() == 2 { + let int_clean: String = parts[0].chars().filter(|c| c.is_ascii_digit()).collect(); + let dollars: i64 = if int_clean.is_empty() { + 0 + } else { + int_clean.parse().ok()? + }; + + let cents_str = parts[1].trim(); + let cents: i64 = if cents_str.is_empty() { + 0 + } else if cents_str.len() == 1 { + cents_str.parse::().ok()? * 10 + } else if cents_str.len() == 2 { + cents_str.parse().ok()? 
+ } else { + cents_str[..2].parse().ok()? + }; + + return Some(format_currency(dollars, cents, currency)); + } + } + + let clean: String = amount_str.chars().filter(|c| c.is_ascii_digit()).collect(); + let n: i64 = clean.parse().ok()?; + Some(format_currency(n, 0, currency)) +} + +fn format_currency(dollars: i64, cents: i64, currency: &Currency) -> String { + let dollar_words = if dollars == 1 && currency.feminine { + "une".to_string() + } else { + number_to_words(dollars) + }; + + if dollars == 0 && cents == 0 { + return format!("zero {}", currency.plural); + } + + if dollars == 0 { + let cents_words = number_to_words(cents); + let unit = if cents == 1 { + currency.cent_singular + } else { + currency.cent_plural + }; + return format!("{} {}", cents_words, unit); + } + + if cents == 0 { + let unit = if dollars == 1 { + currency.singular + } else { + currency.plural + }; + return format!("{} {}", dollar_words, unit); + } + + let dollar_unit = if dollars == 1 { + currency.singular + } else { + currency.plural + }; + let cents_words = number_to_words(cents); + let cent_unit = if cents == 1 { + currency.cent_singular + } else { + currency.cent_plural + }; + + format!( + "{} {} et {} {}", + dollar_words, dollar_unit, cents_words, cent_unit + ) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_euro_suffix() { + assert_eq!(parse("5 €"), Some("cinq euros".to_string())); + assert_eq!(parse("1 €"), Some("un euro".to_string())); + assert_eq!( + parse("5,50 €"), + Some("cinq euros et cinquante centimes".to_string()) + ); + } + + #[test] + fn test_prefix_currencies() { + assert_eq!(parse("$100"), Some("cent dollars".to_string())); + assert_eq!(parse("£1"), Some("une livre".to_string())); + assert_eq!(parse("€100"), Some("cent euros".to_string())); + } + + #[test] + fn test_dollars_and_cents() { + assert_eq!( + parse("$5.50"), + Some("cinq dollars et cinquante cents".to_string()) + ); + assert_eq!(parse("$1.01"), Some("un dollar et un cent".to_string())); + 
assert_eq!( + parse("$0.99"), + Some("quatre-vingt-dix-neuf cents".to_string()) + ); + } + + #[test] + fn test_large_amounts() { + assert_eq!( + parse("$2,5 milliards"), + Some("deux virgule cinq milliards de dollars".to_string()) + ); + assert_eq!( + parse("$50 millions"), + Some("cinquante millions de dollars".to_string()) + ); + } + + #[test] + fn test_trailing_dot() { + assert_eq!(parse("$5."), Some("cinq dollars".to_string())); + assert_eq!(parse("$1."), Some("un dollar".to_string())); + } + + #[test] + fn test_non_money() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("123"), None); + } +} diff --git a/src/tts/fr/ordinal.rs b/src/tts/fr/ordinal.rs new file mode 100644 index 0000000..9dafcb5 --- /dev/null +++ b/src/tts/fr/ordinal.rs @@ -0,0 +1,130 @@ +//! Ordinal TN tagger for French. +//! +//! Converts written ordinal numbers to spoken French: +//! - "1er" → "premier" +//! - "1re" → "premiere" +//! - "2e" → "deuxieme" +//! - "21e" → "vingt et unieme" + +use super::number_to_words; + +/// Parse a written ordinal to spoken French words. 
+pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + + // Detect French ordinal suffixes: er, ere, re, e, eme, ieme, ieme + let (num_str, feminine) = if let Some(s) = trimmed.strip_suffix("ere") { + (s, true) + } else if let Some(s) = trimmed.strip_suffix("re") { + (s, true) + } else if let Some(s) = trimmed.strip_suffix("ieme") { + (s, false) + } else if let Some(s) = trimmed.strip_suffix("eme") { + (s, false) + } else if let Some(s) = trimmed.strip_suffix("er") { + (s, false) + } else if let Some(s) = trimmed.strip_suffix("nd") { + (s, false) + } else if let Some(s) = trimmed.strip_suffix("nde") { + (s, true) + } else if let Some(s) = trimmed.strip_suffix('e') { + // Must check this is not just a word ending in 'e' + if s.chars().all(|c| c.is_ascii_digit()) && !s.is_empty() { + (s, false) + } else { + return None; + } + } else { + return None; + }; + + if num_str.is_empty() || !num_str.chars().all(|c| c.is_ascii_digit()) { + return None; + } + + let n: i64 = num_str.parse().ok()?; + if n <= 0 { + return None; + } + + if n == 1 { + return Some(if feminine { + "premiere".to_string() + } else { + "premier".to_string() + }); + } + + if n == 2 && trimmed.ends_with("nd") { + return Some("second".to_string()); + } + if n == 2 && trimmed.ends_with("nde") { + return Some("seconde".to_string()); + } + + let cardinal = number_to_words(n); + Some(cardinal_to_ordinal(&cardinal)) +} + +/// Convert cardinal words to ordinal by adding -ieme suffix. 
+fn cardinal_to_ordinal(cardinal: &str) -> String { + // Special transformations for the last word + if let Some(prefix) = cardinal.strip_suffix("cinq") { + format!("{}cinquieme", prefix) + } else if let Some(prefix) = cardinal.strip_suffix("neuf") { + format!("{}neuvieme", prefix) + } else if cardinal.ends_with('e') { + // Drop final 'e' before adding -ieme (quatre → quatrieme) + format!("{}ieme", &cardinal[..cardinal.len() - 1]) + } else { + format!("{}ieme", cardinal) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_premier() { + assert_eq!(parse("1er"), Some("premier".to_string())); + assert_eq!(parse("1re"), Some("premiere".to_string())); + assert_eq!(parse("1ere"), Some("premiere".to_string())); + } + + #[test] + fn test_basic() { + assert_eq!(parse("2e"), Some("deuxieme".to_string())); + assert_eq!(parse("3e"), Some("troisieme".to_string())); + assert_eq!(parse("4e"), Some("quatrieme".to_string())); + assert_eq!(parse("5e"), Some("cinquieme".to_string())); + assert_eq!(parse("9e"), Some("neuvieme".to_string())); + } + + #[test] + fn test_teens() { + assert_eq!(parse("11e"), Some("onzieme".to_string())); + assert_eq!(parse("12e"), Some("douzieme".to_string())); + assert_eq!(parse("13e"), Some("treizieme".to_string())); + } + + #[test] + fn test_compound() { + assert_eq!(parse("21e"), Some("vingt et unieme".to_string())); + assert_eq!(parse("22e"), Some("vingt-deuxieme".to_string())); + assert_eq!(parse("99e"), Some("quatre-vingt-dix-neuvieme".to_string())); + } + + #[test] + fn test_large() { + assert_eq!(parse("100e"), Some("centieme".to_string())); + assert_eq!(parse("1000e"), Some("millieme".to_string())); + assert_eq!(parse("101e"), Some("cent unieme".to_string())); + } + + #[test] + fn test_non_ordinals() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("0e"), None); + } +} diff --git a/src/tts/fr/telephone.rs b/src/tts/fr/telephone.rs new file mode 100644 index 0000000..80796cd --- /dev/null +++ b/src/tts/fr/telephone.rs 
/// Returns `true` for the characters allowed between digit groups.
fn is_separator(c: char) -> bool {
    matches!(c, '-' | '.' | ' ' | '(' | ')')
}

/// Parse a written phone number to spoken French form.
///
/// The input must contain at least seven digits and at least one separator
/// (`-`, `.`, space or parentheses), and nothing else apart from an optional
/// single leading `+`. Each separator-delimited group is spelled digit by
/// digit and the groups are joined with ", ". With a leading `+`, the first
/// group is prefixed with "plus".
///
/// Returns `None` for anything that does not look like a phone number,
/// including a `+` anywhere but at the very start (it used to be dropped
/// silently, gluing the digits on both sides into one group).
pub fn parse(input: &str) -> Option<String> {
    let trimmed = input.trim();

    // A leading '+' introduces the country code.
    let (has_plus, body) = match trimmed.strip_prefix('+') {
        Some(after_plus) => (true, after_plus),
        None => (false, trimmed),
    };

    // Everything after the optional '+' must be a digit or a separator.
    if !body.chars().all(|c| c.is_ascii_digit() || is_separator(c)) {
        return None;
    }

    // Must have at least 7 digits.
    if body.chars().filter(char::is_ascii_digit).count() < 7 {
        return None;
    }

    // Must contain at least one separator to distinguish a phone number from
    // a plain number like "1000000".
    if !body.chars().any(is_separator) {
        return None;
    }

    let mut parts: Vec<String> = split_phone_groups(body)
        .iter()
        .map(|group| spell_digit_group(group))
        .collect();

    if has_plus {
        // The first group after '+' is the country code.
        if let Some(first) = parts.first_mut() {
            first.insert_str(0, "plus ");
        }
    }

    if parts.is_empty() {
        return None;
    }

    Some(parts.join(", "))
}

/// Split a phone number into its digit groups.
///
/// Groups are delimited by separators; empty groups (consecutive or leading
/// separators) are discarded and any non-digit character inside a group is
/// ignored.
fn split_phone_groups(input: &str) -> Vec<String> {
    input
        .split(is_separator)
        .map(|chunk| {
            chunk
                .chars()
                .filter(char::is_ascii_digit)
                .collect::<String>()
        })
        .filter(|group| !group.is_empty())
        .collect()
}

/// Spell each digit in a group using French words, separated by spaces.
/// Characters that are not ASCII digits are skipped.
fn spell_digit_group(group: &str) -> String {
    group
        .chars()
        .filter_map(|c| match c {
            '0' => Some("zero"),
            '1' => Some("un"),
            '2' => Some("deux"),
            '3' => Some("trois"),
            '4' => Some("quatre"),
            '5' => Some("cinq"),
            '6' => Some("six"),
            '7' => Some("sept"),
            '8' => Some("huit"),
            '9' => Some("neuf"),
            _ => None,
        })
        .collect::<Vec<_>>()
        .join(" ")
}
--- /dev/null +++ b/src/tts/fr/time.rs @@ -0,0 +1,145 @@ +//! Time TN tagger for French. +//! +//! Converts written time expressions to spoken French: +//! - "14:30" → "quatorze heures trente" +//! - "14h30" → "quatorze heures trente" +//! - "2:00" → "deux heures" +//! - "0:00" → "minuit" +//! - "12:00" → "midi" + +use super::number_to_words; + +/// Parse a written time expression to spoken French. +pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + + // Try "14h30" or "14h" format (French convention) + if let Some(result) = parse_h_format(trimmed) { + return Some(result); + } + + // Try "14:30" format + if let Some(result) = parse_colon_format(trimmed) { + return Some(result); + } + + None +} + +fn parse_h_format(input: &str) -> Option { + let lower = input.to_lowercase(); + + // Find 'h' separator + let h_pos = lower.find('h')?; + let hour_str = &lower[..h_pos]; + let min_str = lower[h_pos + 1..].trim(); + + if !hour_str.chars().all(|c| c.is_ascii_digit()) || hour_str.is_empty() { + return None; + } + + let hour: u32 = hour_str.parse().ok()?; + if hour > 23 { + return None; + } + + let minute: u32 = if min_str.is_empty() { + 0 + } else { + if !min_str.chars().all(|c| c.is_ascii_digit()) { + return None; + } + let m: u32 = min_str.parse().ok()?; + if m > 59 { + return None; + } + m + }; + + Some(format_time(hour, minute)) +} + +fn parse_colon_format(input: &str) -> Option { + if !input.contains(':') { + return None; + } + + let parts: Vec<&str> = input.splitn(2, ':').collect(); + if parts.len() != 2 { + return None; + } + + let hour_str = parts[0].trim(); + let min_str = parts[1].trim(); + + if !hour_str.chars().all(|c| c.is_ascii_digit()) || hour_str.is_empty() { + return None; + } + if !min_str.chars().all(|c| c.is_ascii_digit()) || min_str.is_empty() { + return None; + } + + let hour: u32 = hour_str.parse().ok()?; + let minute: u32 = min_str.parse().ok()?; + + if hour > 23 || minute > 59 { + return None; + } + + Some(format_time(hour, 
minute)) +} + +fn format_time(hour: u32, minute: u32) -> String { + // Special cases + if hour == 0 && minute == 0 { + return "minuit".to_string(); + } + if hour == 12 && minute == 0 { + return "midi".to_string(); + } + if hour == 0 { + return format!("minuit {}", number_to_words(minute as i64)); + } + if hour == 12 { + return format!("midi {}", number_to_words(minute as i64)); + } + + let hour_words = number_to_words(hour as i64); + + if minute == 0 { + format!("{} heures", hour_words) + } else { + format!("{} heures {}", hour_words, number_to_words(minute as i64)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_h_format() { + assert_eq!(parse("14h30"), Some("quatorze heures trente".to_string())); + assert_eq!(parse("14h"), Some("quatorze heures".to_string())); + assert_eq!(parse("8h15"), Some("huit heures quinze".to_string())); + } + + #[test] + fn test_colon_format() { + assert_eq!(parse("14:30"), Some("quatorze heures trente".to_string())); + assert_eq!(parse("2:00"), Some("deux heures".to_string())); + } + + #[test] + fn test_special_hours() { + assert_eq!(parse("0:00"), Some("minuit".to_string())); + assert_eq!(parse("12:00"), Some("midi".to_string())); + assert_eq!(parse("0:30"), Some("minuit trente".to_string())); + } + + #[test] + fn test_invalid() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("25:00"), None); + } +} diff --git a/src/tts/fr/whitelist.rs b/src/tts/fr/whitelist.rs new file mode 100644 index 0000000..81218bf --- /dev/null +++ b/src/tts/fr/whitelist.rs @@ -0,0 +1,84 @@ +//! Whitelist TN tagger for French. +//! +//! Lookup table for common French abbreviations and special terms: +//! - "M." → "monsieur" +//! - "Mme" → "madame" +//! - "Dr." → "docteur" +//! - "c.-à-d." → "c'est-a-dire" + +use lazy_static::lazy_static; +use std::collections::HashMap; + +lazy_static! 
{ + static ref WHITELIST: HashMap<&'static str, &'static str> = { + let mut m = HashMap::new(); + // Titles + m.insert("M.", "monsieur"); + m.insert("M", "monsieur"); + m.insert("Mme", "madame"); + m.insert("Mme.", "madame"); + m.insert("Mlle", "mademoiselle"); + m.insert("Mlle.", "mademoiselle"); + m.insert("Dr", "docteur"); + m.insert("Dr.", "docteur"); + m.insert("Prof.", "professeur"); + m.insert("St", "saint"); + m.insert("St.", "saint"); + m.insert("Jr.", "junior"); + m.insert("Sr.", "senior"); + + // French abbreviations + m.insert("c.-\u{00e0}-d.", "c'est-a-dire"); + m.insert("c-\u{00e0}-d", "c'est-a-dire"); + m.insert("etc.", "et cetera"); + m.insert("p.ex.", "par exemple"); + + // Common address and organization abbreviations + m.insert("Av.", "avenue"); + m.insert("Bd.", "boulevard"); + m.insert("Cie", "compagnie"); + m.insert("Ste", "societe"); + m.insert("No", "numero"); + m.insert("no", "numero"); + + m + }; +} + +/// Parse a French whitelist abbreviation to its spoken form. 
+pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + + // Direct lookup (case-sensitive) + if let Some(&spoken) = WHITELIST.get(trimmed) { + return Some(spoken.to_string()); + } + + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_titles() { + assert_eq!(parse("M."), Some("monsieur".to_string())); + assert_eq!(parse("Mme"), Some("madame".to_string())); + assert_eq!(parse("Mlle"), Some("mademoiselle".to_string())); + assert_eq!(parse("Dr."), Some("docteur".to_string())); + } + + #[test] + fn test_abbreviations() { + assert_eq!(parse("etc."), Some("et cetera".to_string())); + assert_eq!(parse("p.ex."), Some("par exemple".to_string())); + assert_eq!(parse("Av."), Some("avenue".to_string())); + } + + #[test] + fn test_no_match() { + assert_eq!(parse("bonjour"), None); + assert_eq!(parse("monde"), None); + } +} diff --git a/src/tts/hi/cardinal.rs b/src/tts/hi/cardinal.rs new file mode 100644 index 0000000..0a98a2d --- /dev/null +++ b/src/tts/hi/cardinal.rs @@ -0,0 +1,87 @@ +//! Cardinal TN tagger for Hindi (romanized). +//! +//! Converts written cardinal numbers to spoken romanized Hindi: +//! - "123" → "ek sau teis" +//! - "-42" → "rhin bayaalees" +//! - "1,00,000" → "ek lakh" + +use super::number_to_words; + +/// Parse a written cardinal number to spoken romanized Hindi words. +pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + if trimmed.is_empty() { + return None; + } + + let (is_negative, digits_part) = if let Some(rest) = trimmed.strip_prefix('-') { + (true, rest) + } else { + (false, trimmed) + }; + + // Must be digits (with optional commas, dots, or spaces as thousands separators) + // Hindi uses Indian comma grouping: 1,23,45,678 + if !digits_part + .chars() + .all(|c| c.is_ascii_digit() || c == ',' || c == '.' 
|| c == ' ' || c == '\u{a0}') + { + return None; + } + + if !digits_part.chars().any(|c| c.is_ascii_digit()) { + return None; + } + + // Strip thousands/lakh separators + let clean: String = digits_part.chars().filter(|c| c.is_ascii_digit()).collect(); + let n: i64 = clean.parse().ok()?; + + if is_negative { + Some(format!("rhin {}", number_to_words(n))) + } else { + Some(number_to_words(n)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + assert_eq!(parse("0"), Some("shunya".to_string())); + assert_eq!(parse("1"), Some("ek".to_string())); + assert_eq!(parse("21"), Some("ikkees".to_string())); + assert_eq!(parse("100"), Some("ek sau".to_string())); + assert_eq!(parse("123"), Some("ek sau teis".to_string())); + } + + #[test] + fn test_thousands_separators() { + assert_eq!(parse("1 000"), Some("ek hazaar".to_string())); + assert_eq!(parse("1,000"), Some("ek hazaar".to_string())); + assert_eq!(parse("1.000"), Some("ek hazaar".to_string())); + assert_eq!(parse("1 000 000"), Some("das lakh".to_string())); + } + + #[test] + fn test_negative() { + assert_eq!(parse("-42"), Some("rhin bayaalees".to_string())); + assert_eq!(parse("-1"), Some("rhin ek".to_string())); + assert_eq!(parse("-1000"), Some("rhin ek hazaar".to_string())); + } + + #[test] + fn test_indian_grouping() { + assert_eq!(parse("1,00,000"), Some("ek lakh".to_string())); + assert_eq!(parse("1,00,00,000"), Some("ek crore".to_string())); + } + + #[test] + fn test_non_numbers() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("12abc"), None); + assert_eq!(parse(""), None); + } +} diff --git a/src/tts/hi/date.rs b/src/tts/hi/date.rs new file mode 100644 index 0000000..41be5eb --- /dev/null +++ b/src/tts/hi/date.rs @@ -0,0 +1,317 @@ +//! Date TN tagger for Hindi (romanized). +//! +//! Converts written date expressions to spoken romanized Hindi: +//! - "5 January 2025" → "paanch janvari do hazaar pachchees" +//! 
- "5/1/2025" → "paanch janvari do hazaar pachchees" (DD/MM/YYYY) +//! - "15 march 2000" → "pandrah march do hazaar" + +use super::number_to_words; + +/// English month names mapped to their Hindi romanized equivalents and month number. +const MONTHS_EN: &[(&str, &str, u32)] = &[ + ("january", "janvari", 1), + ("february", "farvari", 2), + ("march", "march", 3), + ("april", "aprail", 4), + ("may", "mai", 5), + ("june", "june", 6), + ("july", "julai", 7), + ("august", "agast", 8), + ("september", "sitambar", 9), + ("october", "aktubar", 10), + ("november", "navambar", 11), + ("december", "disambar", 12), +]; + +/// Month names by index (1-based) in romanized Hindi. +const MONTH_NAMES: &[&str] = &[ + "", // 0 placeholder + "janvari", // 1 + "farvari", // 2 + "march", // 3 + "aprail", // 4 + "mai", // 5 + "june", // 6 + "julai", // 7 + "agast", // 8 + "sitambar", // 9 + "aktubar", // 10 + "navambar", // 11 + "disambar", // 12 +]; + +/// Parse a written date to spoken romanized Hindi. +pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + + // Try decade: "1980s" → "unnis sau assee ka dashak" + if let Some(result) = parse_decade(trimmed) { + return Some(result); + } + + // Try "5 January 2025" or "5 january" format + if let Some(result) = parse_day_month_year(trimmed) { + return Some(result); + } + + // Try "January 5, 2025" format (English style) + if let Some(result) = parse_month_day_year(trimmed) { + return Some(result); + } + + // Try numeric DD/MM/YYYY + if let Some(result) = parse_numeric_date(trimmed) { + return Some(result); + } + + None +} + +/// Parse decade: "1980s" → "unnis sau assee ka dashak" +fn parse_decade(input: &str) -> Option { + let s = input.strip_suffix('s')?; + if s.len() != 4 || !s.chars().all(|c| c.is_ascii_digit()) { + return None; + } + + let year: u32 = s.parse().ok()?; + if year < 1000 { + return None; + } + + // Must be a round decade (ends in 0) + if year % 10 != 0 { + return None; + } + + // Hindi: year number + "ka 
dashak" (का दशक) + let year_words = number_to_words(year as i64); + Some(format!("{} ka dashak", year_words)) +} + +/// Parse "5 January 2025" or "5 january" format. +fn parse_day_month_year(input: &str) -> Option { + let lower = input.to_lowercase(); + let tokens: Vec<&str> = lower.split_whitespace().collect(); + if tokens.len() < 2 { + return None; + } + + // Strip ordinal suffixes from day + let day_str = tokens[0] + .trim_end_matches("st") + .trim_end_matches("nd") + .trim_end_matches("rd") + .trim_end_matches("th"); + if !day_str.chars().all(|c| c.is_ascii_digit()) || day_str.is_empty() { + return None; + } + + let day: u32 = day_str.parse().ok()?; + if day == 0 || day > 31 { + return None; + } + + // Find month name + let month_token = tokens[1].trim_end_matches(','); + let month_hindi = MONTHS_EN + .iter() + .find(|(en, _, _)| *en == month_token) + .map(|(_, hi, _)| *hi)?; + + let day_word = number_to_words(day as i64); + + if tokens.len() >= 3 { + let year_str = + tokens[2].trim_end_matches(|c: char| c == '.' || c == ',' || c == '!' || c == '?'); + if year_str.chars().all(|c| c.is_ascii_digit()) && year_str.len() == 4 { + let year: u32 = year_str.parse().ok()?; + let year_words = number_to_words(year as i64); + return Some(format!("{} {} {}", day_word, month_hindi, year_words)); + } + } + + Some(format!("{} {}", day_word, month_hindi)) +} + +/// Parse "January 5, 2025" format. 
+fn parse_month_day_year(input: &str) -> Option { + let lower = input.to_lowercase(); + + let mut month_hindi = None; + let mut rest = ""; + for &(en_name, hi_name, _) in MONTHS_EN { + if let Some(r) = lower.strip_prefix(en_name) { + if r.is_empty() || r.starts_with(' ') || r.starts_with(',') { + month_hindi = Some(hi_name); + rest = r.trim_start_matches(|c: char| c == ' ' || c == ','); + break; + } + } + } + + let month_hindi = month_hindi?; + if rest.is_empty() { + return None; + } + + // Parse day + let (day_str, year_part) = if let Some(comma_pos) = rest.find(',') { + (&rest[..comma_pos], Some(rest[comma_pos + 1..].trim())) + } else { + let parts: Vec<&str> = rest.splitn(2, ' ').collect(); + if parts.len() == 2 + && parts[0] + .trim_end_matches("st") + .trim_end_matches("nd") + .trim_end_matches("rd") + .trim_end_matches("th") + .chars() + .all(|c| c.is_ascii_digit()) + { + let year_clean = + parts[1].trim_end_matches(|c: char| c == '.' || c == ',' || c == '!' || c == '?'); + if year_clean.chars().all(|c| c.is_ascii_digit()) && year_clean.len() == 4 { + (parts[0], Some(year_clean)) + } else { + (rest, None) + } + } else { + (rest, None) + } + }; + + let day_digits = day_str + .trim() + .trim_end_matches("st") + .trim_end_matches("nd") + .trim_end_matches("rd") + .trim_end_matches("th"); + + if !day_digits.chars().all(|c| c.is_ascii_digit()) || day_digits.is_empty() { + return None; + } + + let day: u32 = day_digits.parse().ok()?; + if day == 0 || day > 31 { + return None; + } + + let day_word = number_to_words(day as i64); + + if let Some(year_str) = year_part { + let year_str = year_str + .trim() + .trim_end_matches(|c: char| c == '.' || c == ',' || c == '!' 
|| c == '?'); + if !year_str.is_empty() && year_str.chars().all(|c| c.is_ascii_digit()) { + let year: u32 = year_str.parse().ok()?; + let year_words = number_to_words(year as i64); + return Some(format!("{} {} {}", day_word, month_hindi, year_words)); + } + } + + Some(format!("{} {}", day_word, month_hindi)) +} + +/// Parse numeric date DD/MM/YYYY (Indian convention, same as French: day first). +fn parse_numeric_date(input: &str) -> Option { + let sep = if input.contains('/') { + '/' + } else if input.contains('-') && input.chars().filter(|c| *c == '-').count() == 2 { + '-' + } else { + return None; + }; + + let parts: Vec<&str> = input.splitn(3, sep).collect(); + if parts.len() != 3 { + return None; + } + + if !parts + .iter() + .all(|p| !p.is_empty() && p.chars().all(|c| c.is_ascii_digit())) + { + return None; + } + + let day: u32 = parts[0].parse().ok()?; + let month_num: u32 = parts[1].parse().ok()?; + let year: u32 = parts[2].parse().ok()?; + + if month_num == 0 || month_num > 12 || day == 0 || day > 31 { + return None; + } + + let month_name = MONTH_NAMES[month_num as usize]; + let day_word = number_to_words(day as i64); + let year_words = number_to_words(year as i64); + + Some(format!("{} {} {}", day_word, month_name, year_words)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_day_month_year() { + assert_eq!( + parse("5 January 2025"), + Some("paanch janvari do hazaar pachchees".to_string()) + ); + assert_eq!( + parse("15 march 2000"), + Some("pandrah march do hazaar".to_string()) + ); + assert_eq!(parse("1 january"), Some("ek janvari".to_string())); + } + + #[test] + fn test_month_day_year() { + assert_eq!( + parse("January 5, 2025"), + Some("paanch janvari do hazaar pachchees".to_string()) + ); + } + + #[test] + fn test_numeric_date() { + assert_eq!( + parse("05/01/2025"), + Some("paanch janvari do hazaar pachchees".to_string()) + ); + assert_eq!( + parse("26/01/1950"), + Some("chhabees janvari ek hazaar nau sau pachaas".to_string()) 
+ ); + } + + #[test] + fn test_decade() { + assert_eq!( + parse("1980s"), + Some("ek hazaar nau sau assi ka dashak".to_string()) + ); + assert_eq!(parse("2000s"), Some("do hazaar ka dashak".to_string())); + assert_eq!( + parse("1990s"), + Some("ek hazaar nau sau nabbe ka dashak".to_string()) + ); + } + + #[test] + fn test_year_verbalization() { + assert_eq!(number_to_words(2025), "do hazaar pachchees".to_string()); + assert_eq!(number_to_words(2000), "do hazaar".to_string()); + assert_eq!(number_to_words(1990), "ek hazaar nau sau nabbe".to_string()); + assert_eq!(number_to_words(1900), "ek hazaar nau sau".to_string()); + } + + #[test] + fn test_invalid() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("123"), None); + assert_eq!(parse(""), None); + } +} diff --git a/src/tts/hi/decimal.rs b/src/tts/hi/decimal.rs new file mode 100644 index 0000000..8025848 --- /dev/null +++ b/src/tts/hi/decimal.rs @@ -0,0 +1,133 @@ +//! Decimal TN tagger for Hindi (romanized). +//! +//! Converts written decimal numbers to spoken romanized Hindi: +//! - "3.14" → "teen dashmlav ek chaar" +//! - "0.5" → "shunya dashmlav paanch" +//! - "-2.7" → "rhin do dashmlav saat" + +use super::{number_to_words, spell_digits}; + +/// Hindi quantity suffixes recognized after a decimal number. +/// crore, lakh, hazaar (thousand) +const QUANTITY_SUFFIXES: &[&str] = &["crore", "lakh", "hazaar"]; + +/// Parse a written decimal number to spoken romanized Hindi. +/// +/// Uses "dashmlav" for the decimal point. The fractional part is spelled +/// digit by digit. 
+pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + if trimmed.is_empty() { + return None; + } + + // Check for quantity suffix: "1.5 lakh" + let (number_part, suffix) = extract_suffix(trimmed); + + // Hindi typically uses period as decimal separator + if !number_part.contains('.') { + return None; + } + + let parts: Vec<&str> = number_part.splitn(2, '.').collect(); + if parts.len() != 2 { + return None; + } + + let int_str = parts[0]; + let frac_str = parts[1]; + + let (is_negative, int_digits) = if let Some(rest) = int_str.strip_prefix('-') { + (true, rest) + } else { + (false, int_str) + }; + + if !int_digits.chars().all(|c| c.is_ascii_digit()) { + return None; + } + if frac_str.is_empty() || !frac_str.chars().all(|c| c.is_ascii_digit()) { + return None; + } + + let int_val: i64 = if int_digits.is_empty() { + 0 + } else { + int_digits.parse().ok()? + }; + + let int_words = number_to_words(int_val); + let frac_words = spell_digits(frac_str); + + let mut result = if is_negative { + format!("rhin {} dashmlav {}", int_words, frac_words) + } else { + format!("{} dashmlav {}", int_words, frac_words) + }; + + if let Some(suf) = suffix { + result.push(' '); + result.push_str(suf); + } + + Some(result) +} + +/// Extract a quantity suffix from the end if present. 
+fn extract_suffix(input: &str) -> (&str, Option<&str>) { + for &suf in QUANTITY_SUFFIXES { + if let Some(before) = input.strip_suffix(suf) { + let before = before.trim_end(); + if !before.is_empty() { + return (before, Some(suf)); + } + } + } + (input, None) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic_decimal() { + assert_eq!(parse("3.14"), Some("teen dashmlav ek chaar".to_string())); + assert_eq!(parse("0.5"), Some("shunya dashmlav paanch".to_string())); + } + + #[test] + fn test_negative_decimal() { + assert_eq!( + parse("-3.14"), + Some("rhin teen dashmlav ek chaar".to_string()) + ); + } + + #[test] + fn test_larger_decimal() { + assert_eq!( + parse("100.25"), + Some("ek sau dashmlav do paanch".to_string()) + ); + } + + #[test] + fn test_with_quantity() { + assert_eq!( + parse("1.5 lakh"), + Some("ek dashmlav paanch lakh".to_string()) + ); + assert_eq!( + parse("4.85 crore"), + Some("chaar dashmlav aath paanch crore".to_string()) + ); + } + + #[test] + fn test_non_decimal() { + assert_eq!(parse("123"), None); + assert_eq!(parse("hello"), None); + assert_eq!(parse(""), None); + } +} diff --git a/src/tts/hi/electronic.rs b/src/tts/hi/electronic.rs new file mode 100644 index 0000000..38978bc --- /dev/null +++ b/src/tts/hi/electronic.rs @@ -0,0 +1,157 @@ +//! Electronic TN tagger for Hindi (romanized). +//! +//! Converts written emails and URLs to spoken form with Hindi romanized digit words: +//! - "test@gmail.com" -> "t e s t at g m a i l dot c o m" +//! - "http://www.example.com" -> "h t t p colon slash slash w w w dot e x a m p l e dot c o m" +//! - Digits are spoken in Hindi: 0 -> "shunya", 1 -> "ek", etc. + +/// Parse an email or URL to spoken form (Hindi romanized). 
/// Parse an email or URL to spoken form (Hindi romanized).
///
/// Anything containing `@` is treated as an email address; anything starting
/// (case-insensitively) with "http://", "https://" or "www." as a URL.
/// Returns `None` for all other input.
pub fn parse(input: &str) -> Option<String> {
    let trimmed = input.trim();
    if trimmed.is_empty() {
        return None;
    }

    // Email detection: contains @ with text on both sides
    if trimmed.contains('@') {
        return parse_email(trimmed);
    }

    // URL detection: starts with http://, https://, or www.
    let looks_like_url = ["http://", "https://", "www."]
        .iter()
        .any(|prefix| strip_prefix_ignore_case(trimmed, prefix).is_some());

    if looks_like_url {
        parse_url(trimmed)
    } else {
        None
    }
}

/// Strip an ASCII prefix from `s`, ignoring ASCII case.
///
/// Working on `s` itself (rather than on a lower-cased copy) keeps the byte
/// offsets used for slicing valid.
fn strip_prefix_ignore_case<'a>(s: &'a str, prefix: &str) -> Option<&'a str> {
    let head = s.get(..prefix.len())?;
    if head.eq_ignore_ascii_case(prefix) {
        Some(&s[prefix.len()..])
    } else {
        None
    }
}

/// Append `token` to `out`, inserting a single space when `out` already has
/// content. Used everywhere so the output never gets doubled, leading or
/// trailing spaces.
fn push_token(out: &mut String, token: &str) {
    if !out.is_empty() {
        out.push(' ');
    }
    out.push_str(token);
}

/// Parse an email address to spoken form: "<local> at <domain>".
///
/// Returns `None` when either side of the first `@` is empty.
fn parse_email(input: &str) -> Option<String> {
    let (local, domain) = input.split_once('@')?;
    if local.is_empty() || domain.is_empty() {
        return None;
    }

    let mut spoken = spell_domain(local);
    push_token(&mut spoken, "at");

    let domain_spoken = spell_domain(domain);
    if !domain_spoken.is_empty() {
        push_token(&mut spoken, &domain_spoken);
    }

    Some(spoken)
}

/// Parse a URL to spoken form.
///
/// A leading "http://" or "https://" is read out as spelled letters followed
/// by "colon slash slash"; the remainder is spelled like a domain.
fn parse_url(input: &str) -> Option<String> {
    let (scheme, rest) = if let Some(rest) = strip_prefix_ignore_case(input, "https://") {
        ("h t t p s colon slash slash", rest)
    } else if let Some(rest) = strip_prefix_ignore_case(input, "http://") {
        ("h t t p colon slash slash", rest)
    } else {
        ("", input)
    };

    let mut spoken = String::from(scheme);
    let rest_spoken = spell_domain(rest);
    if !rest_spoken.is_empty() {
        push_token(&mut spoken, &rest_spoken);
    }

    Some(spoken)
}

/// Spell out a domain name, using "dot" for periods.
///
/// Every period is spoken, including leading, trailing and consecutive ones;
/// segments with nothing pronounceable contribute no words and no extra
/// spaces.
fn spell_domain(domain: &str) -> String {
    let mut out = String::new();

    for (index, segment) in domain.split('.').enumerate() {
        if index > 0 {
            push_token(&mut out, "dot");
        }
        let spelled = spell_electronic(segment);
        if !spelled.is_empty() {
            push_token(&mut out, &spelled);
        }
    }

    out
}

/// Spell out an electronic string.
///
/// Letters are lower-cased and spelled individually with spaces.
/// Digits use Hindi romanized words.
/// `-`, `_`, `/`, `~` and `:` use their English technical names.
/// Every other character is skipped.
fn spell_electronic(s: &str) -> String {
    let mut out = String::new();

    for c in s.chars() {
        match c {
            '-' => push_token(&mut out, "dash"),
            '_' => push_token(&mut out, "underscore"),
            '/' => push_token(&mut out, "slash"),
            '~' => push_token(&mut out, "tilde"),
            ':' => push_token(&mut out, "colon"),
            c if c.is_ascii_alphabetic() => {
                let mut buf = [0u8; 4];
                push_token(&mut out, c.to_ascii_lowercase().encode_utf8(&mut buf));
            }
            c if c.is_ascii_digit() => push_token(&mut out, &digit_word_hi(c)),
            _ => {
                // Skip unknown characters
            }
        }
    }

    out
}

/// Romanized Hindi word for an ASCII digit; empty string for any other char.
fn digit_word_hi(c: char) -> String {
    match c {
        '0' => "shunya",
        '1' => "ek",
        '2' => "do",
        '3' => "teen",
        '4' => "chaar",
        '5' => "paanch",
        '6' => "chhah",
        '7' => "saat",
        '8' => "aath",
        '9' => "nau",
        _ => "",
    }
    .to_string()
}
- "72°C" → "bahattar digri selsiyas" + +use super::number_to_words; + +use lazy_static::lazy_static; +use std::collections::HashMap; + +struct UnitInfo { + singular: &'static str, + plural: &'static str, +} + +lazy_static! { + static ref UNITS: HashMap<&'static str, UnitInfo> = { + let mut m = HashMap::new(); + + // Length + m.insert("mm", UnitInfo { singular: "millimetre", plural: "millimetre" }); + m.insert("cm", UnitInfo { singular: "centimetre", plural: "centimetre" }); + m.insert("m", UnitInfo { singular: "metre", plural: "metre" }); + m.insert("km", UnitInfo { singular: "kilometre", plural: "kilometre" }); + m.insert("in", UnitInfo { singular: "inch", plural: "inch" }); + m.insert("ft", UnitInfo { singular: "feet", plural: "feet" }); + m.insert("mi", UnitInfo { singular: "mile", plural: "mile" }); + + // Weight + m.insert("mg", UnitInfo { singular: "milligram", plural: "milligram" }); + m.insert("g", UnitInfo { singular: "gram", plural: "gram" }); + m.insert("kg", UnitInfo { singular: "kilogram", plural: "kilogram" }); + m.insert("lb", UnitInfo { singular: "pound", plural: "pound" }); + m.insert("oz", UnitInfo { singular: "aunce", plural: "aunce" }); + m.insert("t", UnitInfo { singular: "tan", plural: "tan" }); + + // Volume + m.insert("ml", UnitInfo { singular: "millilitre", plural: "millilitre" }); + m.insert("l", UnitInfo { singular: "litre", plural: "litre" }); + m.insert("L", UnitInfo { singular: "litre", plural: "litre" }); + + // Speed + m.insert("km/h", UnitInfo { singular: "kilometre prati ghanta", plural: "kilometre prati ghanta" }); + m.insert("mph", UnitInfo { singular: "mile prati ghanta", plural: "mile prati ghanta" }); + m.insert("m/s", UnitInfo { singular: "metre prati second", plural: "metre prati second" }); + + // Time + m.insert("s", UnitInfo { singular: "second", plural: "second" }); + m.insert("sec", UnitInfo { singular: "second", plural: "second" }); + m.insert("min", UnitInfo { singular: "minat", plural: "minat" }); + m.insert("h", 
UnitInfo { singular: "ghanta", plural: "ghante" }); + m.insert("hr", UnitInfo { singular: "ghanta", plural: "ghante" }); + + // Temperature + m.insert("\u{00B0}C", UnitInfo { singular: "digri selsiyas", plural: "digri selsiyas" }); + m.insert("\u{00B0}F", UnitInfo { singular: "digri farenheit", plural: "digri farenheit" }); + + // Data + m.insert("KB", UnitInfo { singular: "kilobyte", plural: "kilobyte" }); + m.insert("MB", UnitInfo { singular: "megabyte", plural: "megabyte" }); + m.insert("GB", UnitInfo { singular: "gigabyte", plural: "gigabyte" }); + m.insert("TB", UnitInfo { singular: "terabyte", plural: "terabyte" }); + + // Percentage + m.insert("%", UnitInfo { singular: "pratishat", plural: "pratishat" }); + + // Frequency + m.insert("Hz", UnitInfo { singular: "hertz", plural: "hertz" }); + m.insert("kHz", UnitInfo { singular: "kilohertz", plural: "kilohertz" }); + m.insert("MHz", UnitInfo { singular: "megahertz", plural: "megahertz" }); + m.insert("GHz", UnitInfo { singular: "gigahertz", plural: "gigahertz" }); + + m + }; +} + +/// Parse a written measurement to spoken romanized Hindi. +pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + if trimmed.is_empty() { + return None; + } + + let mut unit_matches: Vec<(&str, &UnitInfo)> = UNITS + .iter() + .filter(|(unit, _)| { + trimmed.ends_with(*unit) + && (trimmed.len() == unit.len() || { + let before = &trimmed[..trimmed.len() - unit.len()]; + if unit.len() == 1 && unit.chars().all(|c| c.is_ascii_alphabetic()) { + before.ends_with(' ') + } else { + before.ends_with(' ') || before.ends_with(|c: char| c.is_ascii_digit()) + } + }) + }) + .map(|(k, v)| (*k, v)) + .collect(); + + // Sort by length descending so longer unit matches take priority (e.g. 
"km/h" over "h") + unit_matches.sort_by(|a, b| b.0.len().cmp(&a.0.len())); + + for (unit_str, unit_info) in unit_matches { + let num_part = trimmed[..trimmed.len() - unit_str.len()].trim(); + if num_part.is_empty() { + continue; + } + + let (is_negative, digits) = if let Some(rest) = num_part.strip_prefix('-') { + (true, rest.trim()) + } else { + (false, num_part) + }; + + let clean: String = digits + .chars() + .filter(|c| c.is_ascii_digit() || *c == '.' || *c == ',') + .collect(); + + if clean.is_empty() + || !clean + .chars() + .all(|c| c.is_ascii_digit() || c == '.' || c == ',') + { + continue; + } + + // Handle decimals + let decimal_sep = if clean.contains(',') { ',' } else { '.' }; + if clean.contains(decimal_sep) { + let parts: Vec<&str> = clean.splitn(2, decimal_sep).collect(); + if parts.len() == 2 { + let int_val: i64 = if parts[0].is_empty() { + 0 + } else { + let Ok(v) = parts[0].parse::() else { + continue; + }; + v + }; + let int_words = number_to_words(int_val); + let frac_words = super::spell_digits(parts[1]); + let unit_word = unit_info.plural; + let num_words = if is_negative { + format!("rhin {} dashmlav {}", int_words, frac_words) + } else { + format!("{} dashmlav {}", int_words, frac_words) + }; + return Some(format!("{} {}", num_words, unit_word)); + } + continue; + } + + let Ok(n) = clean.parse::() else { + continue; + }; + let num_words = if is_negative { + format!("rhin {}", number_to_words(n)) + } else { + number_to_words(n) + }; + + let abs_n = n.unsigned_abs(); + let unit_word = if abs_n == 1 { + unit_info.singular + } else { + unit_info.plural + }; + + return Some(format!("{} {}", num_words, unit_word)); + } + + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_speed() { + assert_eq!( + parse("200 km/h"), + Some("do sau kilometre prati ghanta".to_string()) + ); + } + + #[test] + fn test_weight() { + assert_eq!(parse("1 kg"), Some("ek kilogram".to_string())); + assert_eq!(parse("2 kg"), Some("do 
kilogram".to_string())); + } + + #[test] + fn test_temperature() { + assert_eq!( + parse("72\u{00B0}C"), + Some("bahattar digri selsiyas".to_string()) + ); + } + + #[test] + fn test_percentage() { + assert_eq!(parse("50%"), Some("pachaas pratishat".to_string())); + assert_eq!(parse("100%"), Some("ek sau pratishat".to_string())); + } + + #[test] + fn test_negative() { + assert_eq!( + parse("-66 kg"), + Some("rhin chhiyaasath kilogram".to_string()) + ); + } + + #[test] + fn test_data() { + assert_eq!(parse("500 MB"), Some("paanch sau megabyte".to_string())); + assert_eq!(parse("1 GB"), Some("ek gigabyte".to_string())); + } + + #[test] + fn test_decimal_with_empty_integer() { + assert_eq!( + parse(".5 kg"), + Some("shunya dashmlav paanch kilogram".to_string()) + ); + } + + #[test] + fn test_non_measure() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("123"), None); + assert_eq!(parse(""), None); + } +} diff --git a/src/tts/hi/mod.rs b/src/tts/hi/mod.rs new file mode 100644 index 0000000..2ac0db7 --- /dev/null +++ b/src/tts/hi/mod.rs @@ -0,0 +1,298 @@ +//! Text Normalization taggers for Hindi (romanized). +//! +//! Converts written-form text to spoken Hindi in romanized transliteration: +//! - "200" → "do sau" +//! - "5.50 ₹" → "paanch rupaye aur pachaas paise" +//! - "5 January 2025" → "paanch janvari do hazaar pachees" + +pub mod cardinal; +pub mod date; +pub mod decimal; +pub mod electronic; +pub mod measure; +pub mod money; +pub mod ordinal; +pub mod telephone; +pub mod time; +pub mod whitelist; + +/// Hindi unique words for 0-99 in romanized form. +/// Hindi has distinct words for every number from 0 to 99. 
/// Romanized Hindi words for 0-99, indexed by value.
///
/// Hindi has a distinct word for every number in this range, so the table is
/// spelled out in full rather than composed from tens and units.
const HINDI_0_TO_99: [&str; 100] = [
    // 0-9
    "shunya", "ek", "do", "teen", "chaar", "paanch", "chhah", "saat", "aath", "nau",
    // 10-19
    "das", "gyaarah", "baarah", "terah", "chaudah", "pandrah", "solah", "satrah", "atthaarah",
    "unees",
    // 20-29
    "bees", "ikkees", "baees", "teis", "chaubees", "pachchees", "chhabees", "sattaees",
    "atthaees", "untees",
    // 30-39
    "tees", "ikattees", "battees", "taintees", "chautees", "paintees", "chhattees", "saintees",
    "adtees", "untaalees",
    // 40-49
    "chaalis", "iktaalees", "bayaalees", "taintaalees", "chauvaalees", "paintaalees",
    "chhiyaalees", "saintaalees", "adtaalees", "unchaas",
    // 50-59
    "pachaas", "ikyaavan", "baavan", "tirpan", "chauvan", "pachpan", "chhappan", "sattaavan",
    "atthaavan", "unsath",
    // 60-69
    "saath", "iksath", "baasath", "tirsath", "chausath", "painsath", "chhiyaasath", "sarsath",
    "adsath", "unhattar",
    // 70-79
    "sattar", "ikhattar", "bahattar", "tihattar", "chauhattar", "pachahattar", "chhihattar",
    "satahattar", "athahattar", "unyaasi",
    // 80-89
    "assi", "ikyaasi", "bayaasi", "tiraasi", "chauraasi", "pachaasi", "chhiyaasi", "sataasi",
    "athaasi", "navaasi",
    // 90-99
    "nabbe", "ikyaanbe", "baanbe", "tiraanbe", "chauraanbe", "pachaanbe", "chhiyaanbe",
    "sataanbe", "athaanbe", "ninyaanbe",
];

/// Digit words indexed 0-9, used by [`spell_digits`].
const DIGIT_WORDS: [&str; 10] = [
    "shunya", "ek", "do", "teen", "chaar", "paanch", "chhah", "saat", "aath", "nau",
];

/// Indian-system scale words, largest first.
///
/// Above one thousand each scale is 100 times the previous one, so every
/// chunk in front of a scale word is below 100 (below 1000 for the topmost
/// scale when the input is a very large `u64`).
const SCALES: &[(u64, &str)] = &[
    (100_000_000_000_000_000, "shankh"), // 10^17
    (1_000_000_000_000_000, "padma"),    // 10^15
    (10_000_000_000_000, "neel"),        // 10^13
    (100_000_000_000, "kharab"),         // 10^11
    (1_000_000_000, "arab"),             // 10^9
    (10_000_000, "crore"),               // 10^7
    (100_000, "lakh"),                   // 10^5
    (1_000, "hazaar"),                   // 10^3
];

/// Convert an integer to romanized Hindi words.
///
/// Uses the Indian numbering system: hazaar (10^3), lakh (10^5), crore (10^7),
/// arab (10^9), kharab (10^11) and upwards. Negative values are prefixed with
/// "rhin".
///
/// Examples:
/// - `0` -> `"shunya"`
/// - `21` -> `"ikkees"`
/// - `100` -> `"ek sau"`
/// - `1000` -> `"ek hazaar"`
/// - `100000` -> `"ek lakh"`
/// - `10000000` -> `"ek crore"`
/// - `1000000000` -> `"ek arab"`
/// - `-42` -> `"rhin bayaalees"`
pub fn number_to_words(n: i64) -> String {
    if n < 0 {
        // `unsigned_abs` is well defined for `i64::MIN`, unlike `-n`.
        format!("rhin {}", unsigned_to_words(n.unsigned_abs()))
    } else {
        unsigned_to_words(n.unsigned_abs())
    }
}

/// Spell a non-negative value by peeling off scale groups from the largest
/// scale downwards, then the remaining 0-999 part.
fn unsigned_to_words(n: u64) -> String {
    if n == 0 {
        return "shunya".to_string();
    }

    let mut parts: Vec<String> = Vec::new();
    let mut remaining = n;

    for &(scale_value, scale_name) in SCALES {
        if remaining >= scale_value {
            let chunk = remaining / scale_value;
            remaining %= scale_value;
            parts.push(format!("{} {}", small_number_to_words(chunk), scale_name));
        }
    }

    // What is left is below 1000: optional hundreds ("sau") plus 1-99.
    if remaining > 0 {
        parts.push(small_number_to_words(remaining));
    }

    parts.join(" ")
}

/// Spell a chunk that appears in front of a scale word (or the final 0-999
/// remainder): values below 100 come straight from the table, larger values
/// are spelled as `<hundreds> sau <rest>`.
fn small_number_to_words(n: u64) -> String {
    if n < 100 {
        return HINDI_0_TO_99[n as usize].to_string();
    }

    let hundreds = n / 100;
    let rest = n % 100;

    // Recursing keeps this total even if a caller passes a chunk >= 10_000.
    let mut words = format!("{} sau", small_number_to_words(hundreds));
    if rest > 0 {
        words.push(' ');
        words.push_str(HINDI_0_TO_99[rest as usize]);
    }
    words
}

/// Spell each ASCII digit of a string individually in romanized Hindi,
/// skipping every non-digit character.
///
/// "14" -> "ek chaar"
pub fn spell_digits(s: &str) -> String {
    s.chars()
        .filter_map(|c| c.to_digit(10).map(|d| DIGIT_WORDS[d as usize]))
        .collect::<Vec<_>>()
        .join(" ")
}
/// Scale suffixes recognized after a currency amount.
/// crore, lakh, hazaar (thousand)
const SCALE_SUFFIXES: &[&str] = &["crore", "lakh", "hazaar"];

/// Spoken names for one currency: the main unit and its fractional unit,
/// each in a singular and a plural form.
struct Currency {
    /// Main-unit word used when the main amount is exactly 1.
    singular: &'static str,
    /// Main-unit word used for every other main amount.
    plural: &'static str,
    /// Fractional-unit word used when the fractional amount is exactly 1.
    cent_singular: &'static str,
    /// Fractional-unit word used for every other fractional amount.
    cent_plural: &'static str,
}

/// Indian rupee / paisa.
const RUPEE: Currency = Currency {
    singular: "rupaya",
    plural: "rupaye",
    cent_singular: "paisa",
    cent_plural: "paise",
};

/// Dollar / cent. The main unit is not inflected; the fractional unit is.
const DOLLAR: Currency = Currency {
    singular: "dollar",
    plural: "dollar",
    cent_singular: "cent",
    cent_plural: "cents",
};

/// Euro / cent. The main unit is not inflected; the fractional unit is.
const EURO: Currency = Currency {
    singular: "euro",
    plural: "euro",
    cent_singular: "cent",
    cent_plural: "cents",
};

/// Pound / penny.
const POUND: Currency = Currency {
    singular: "pound",
    plural: "pound",
    cent_singular: "penny",
    cent_plural: "pence",
};

/// Yen / sen. Neither unit is inflected.
const YEN: Currency = Currency {
    singular: "yen",
    plural: "yen",
    cent_singular: "sen",
    cent_plural: "sen",
};
+pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + if trimmed.is_empty() { + return None; + } + + // Try suffix symbol: "100 ₹" + if let Some(result) = parse_suffix_currency(trimmed) { + return Some(result); + } + + // Try prefix symbol: "₹100", "$5.50", "€100" + if let Some(result) = parse_prefix_currency(trimmed) { + return Some(result); + } + + None +} + +fn parse_suffix_currency(input: &str) -> Option { + let (amount_str, currency) = if let Some(s) = input.strip_suffix('\u{20B9}') { + // ₹ + (s.trim(), &RUPEE) + } else if let Some(s) = input.strip_suffix("INR") { + (s.trim(), &RUPEE) + } else if let Some(s) = input.strip_suffix('\u{20AC}') { + // Euro sign + (s.trim(), &EURO) + } else if let Some(s) = input.strip_suffix("EUR") { + (s.trim(), &EURO) + } else { + return None; + }; + + parse_amount(amount_str, currency) +} + +fn parse_prefix_currency(input: &str) -> Option { + let (currency, rest) = if let Some(r) = input.strip_prefix('\u{20B9}') { + // ₹ + (&RUPEE, r) + } else if let Some(r) = input.strip_prefix("Rs") { + // Rs or Rs. + let r = r.strip_prefix('.').unwrap_or(r); + (&RUPEE, r) + } else if let Some(r) = input.strip_prefix('$') { + (&DOLLAR, r) + } else if let Some(r) = input.strip_prefix('\u{20AC}') { + (&EURO, r) + } else if let Some(r) = input.strip_prefix('\u{00A3}') { + // £ + (&POUND, r) + } else if let Some(r) = input.strip_prefix('\u{00A5}') { + // ¥ + (&YEN, r) + } else { + return None; + }; + + let rest = rest.trim(); + if rest.is_empty() { + return None; + } + + // Check for scale suffix: "₹2.5 lakh" (2.5 lakh rupees) + let (amount_str, scale) = extract_scale(rest); + + // Without a scale suffix, the amount must be purely numeric + if scale.is_none() + && !amount_str + .chars() + .all(|c| c.is_ascii_digit() || c == '.' 
|| c == ',' || c == ' ') + { + return None; + } + + if let Some(scale_word) = scale { + // With scale: "₹2.5 lakh" → "do dashmlav paanch lakh rupaye" + if amount_str.contains('.') { + let parts: Vec<&str> = amount_str.splitn(2, '.').collect(); + if parts.len() == 2 { + let int_val: i64 = parts[0].parse().ok()?; + let int_words = number_to_words(int_val); + let frac_words = super::spell_digits(parts[1]); + return Some(format!( + "{} dashmlav {} {} {}", + int_words, frac_words, scale_word, currency.plural + )); + } + } else { + // No decimal: "₹50 lakh" → "pachaas lakh rupaye" + let clean: String = amount_str.chars().filter(|c| c.is_ascii_digit()).collect(); + let n: i64 = clean.parse().ok()?; + let words = number_to_words(n); + return Some(format!("{} {} {}", words, scale_word, currency.plural)); + } + } + + parse_amount(amount_str, currency) +} + +/// Extract scale suffix from the amount string. +fn extract_scale(input: &str) -> (&str, Option<&str>) { + for &scale in SCALE_SUFFIXES { + if let Some(before) = input.strip_suffix(scale) { + let before = before.trim_end(); + if !before.is_empty() + && before + .chars() + .all(|c| c.is_ascii_digit() || c == '.' || c == ',' || c == ' ') + { + return (before, Some(scale)); + } + } + } + (input, None) +} + +fn parse_amount(amount_str: &str, currency: &Currency) -> Option { + if amount_str.is_empty() { + return None; + } + + // Check for decimal part + let decimal_sep = '.'; + if amount_str.contains(decimal_sep) { + let parts: Vec<&str> = amount_str.splitn(2, decimal_sep).collect(); + if parts.len() == 2 { + let int_clean: String = parts[0].chars().filter(|c| c.is_ascii_digit()).collect(); + let main_amount: i64 = if int_clean.is_empty() { + 0 + } else { + int_clean.parse().ok()? + }; + + let cents_str = parts[1].trim(); + let cents: i64 = if cents_str.is_empty() { + 0 + } else if cents_str.len() == 1 { + cents_str.parse::().ok()? * 10 + } else if cents_str.len() == 2 { + cents_str.parse().ok()? 
+ } else { + cents_str[..2].parse().ok()? + }; + + return Some(format_currency(main_amount, cents, currency)); + } + } + + let clean: String = amount_str.chars().filter(|c| c.is_ascii_digit()).collect(); + let n: i64 = clean.parse().ok()?; + Some(format_currency(n, 0, currency)) +} + +fn format_currency(main_amount: i64, cents: i64, currency: &Currency) -> String { + let main_words = number_to_words(main_amount); + + if main_amount == 0 && cents == 0 { + return format!("shunya {}", currency.plural); + } + + if main_amount == 0 { + let cents_words = number_to_words(cents); + let unit = if cents == 1 { + currency.cent_singular + } else { + currency.cent_plural + }; + return format!("{} {}", cents_words, unit); + } + + if cents == 0 { + let unit = if main_amount == 1 { + currency.singular + } else { + currency.plural + }; + return format!("{} {}", main_words, unit); + } + + let main_unit = if main_amount == 1 { + currency.singular + } else { + currency.plural + }; + let cents_words = number_to_words(cents); + let cent_unit = if cents == 1 { + currency.cent_singular + } else { + currency.cent_plural + }; + + format!( + "{} {} aur {} {}", + main_words, main_unit, cents_words, cent_unit + ) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_rupee_prefix() { + assert_eq!(parse("\u{20B9}100"), Some("ek sau rupaye".to_string())); + assert_eq!(parse("\u{20B9}1"), Some("ek rupaya".to_string())); + assert_eq!( + parse("\u{20B9}5.50"), + Some("paanch rupaye aur pachaas paise".to_string()) + ); + } + + #[test] + fn test_dollar_prefix() { + assert_eq!(parse("$100"), Some("ek sau dollar".to_string())); + assert_eq!( + parse("$5.50"), + Some("paanch dollar aur pachaas cents".to_string()) + ); + } + + #[test] + fn test_euro_prefix() { + assert_eq!(parse("\u{20AC}100"), Some("ek sau euro".to_string())); + } + + #[test] + fn test_dollars_and_cents() { + assert_eq!(parse("$1.01"), Some("ek dollar aur ek cent".to_string())); + assert_eq!(parse("$0.99"), 
Some("ninyaanbe cents".to_string())); + } + + #[test] + fn test_large_amounts() { + assert_eq!( + parse("\u{20B9}2.5 lakh"), + Some("do dashmlav paanch lakh rupaye".to_string()) + ); + assert_eq!(parse("$50 crore"), Some("pachaas crore dollar".to_string())); + } + + #[test] + fn test_trailing_dot() { + assert_eq!(parse("$5."), Some("paanch dollar".to_string())); + assert_eq!(parse("$1."), Some("ek dollar".to_string())); + } + + #[test] + fn test_non_money() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("123"), None); + assert_eq!(parse(""), None); + } +} diff --git a/src/tts/hi/ordinal.rs b/src/tts/hi/ordinal.rs new file mode 100644 index 0000000..7544d3b --- /dev/null +++ b/src/tts/hi/ordinal.rs @@ -0,0 +1,99 @@ +//! Ordinal TN tagger for Hindi (romanized). +//! +//! Converts written ordinal numbers to spoken romanized Hindi: +//! - "1st" → "pehla" +//! - "2nd" → "doosra" +//! - "3rd" → "teesra" +//! - "5th" → "paanchvaan" + +use super::number_to_words; + +/// Special ordinal forms for small numbers in Hindi. +const SPECIAL_ORDINALS: &[(i64, &str)] = &[ + (1, "pehla"), + (2, "doosra"), + (3, "teesra"), + (4, "chautha"), + (5, "paanchvaan"), + (6, "chhathvaan"), + (7, "saatvaan"), + (8, "aathvaan"), + (9, "nauvaan"), + (10, "dasvaan"), +]; + +/// Parse a written ordinal to spoken romanized Hindi words. 
+pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + + // Detect English ordinal suffixes: st, nd, rd, th + // Also detect Hindi-style suffix: vaan, veen, va, vi + let num_str = if let Some(s) = trimmed.strip_suffix("vaan") { + s + } else if let Some(s) = trimmed.strip_suffix("veen") { + s + } else if let Some(s) = trimmed.strip_suffix("th") { + s + } else if let Some(s) = trimmed.strip_suffix("st") { + s + } else if let Some(s) = trimmed.strip_suffix("nd") { + s + } else if let Some(s) = trimmed.strip_suffix("rd") { + s + } else { + return None; + }; + + if num_str.is_empty() || !num_str.chars().all(|c| c.is_ascii_digit()) { + return None; + } + + let n: i64 = num_str.parse().ok()?; + if n <= 0 { + return None; + } + + // Check for special ordinal forms + for &(val, word) in SPECIAL_ORDINALS { + if n == val { + return Some(word.to_string()); + } + } + + // General ordinal: cardinal + "vaan" + let cardinal = number_to_words(n); + Some(format!("{}vaan", cardinal)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_special_ordinals() { + assert_eq!(parse("1st"), Some("pehla".to_string())); + assert_eq!(parse("2nd"), Some("doosra".to_string())); + assert_eq!(parse("3rd"), Some("teesra".to_string())); + assert_eq!(parse("4th"), Some("chautha".to_string())); + } + + #[test] + fn test_general_ordinals() { + assert_eq!(parse("5th"), Some("paanchvaan".to_string())); + assert_eq!(parse("20th"), Some("beesvaan".to_string())); + assert_eq!(parse("100th"), Some("ek sauvaan".to_string())); + } + + #[test] + fn test_hindi_suffix() { + assert_eq!(parse("5vaan"), Some("paanchvaan".to_string())); + assert_eq!(parse("10vaan"), Some("dasvaan".to_string())); + } + + #[test] + fn test_non_ordinals() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("0th"), None); + assert_eq!(parse(""), None); + } +} diff --git a/src/tts/hi/telephone.rs b/src/tts/hi/telephone.rs new file mode 100644 index 0000000..19bd516 --- /dev/null +++ 
/// Characters that separate digit groups inside a phone number.
fn is_phone_separator(c: char) -> bool {
    matches!(c, '-' | '.' | ' ' | '(' | ')')
}

/// Parse a written phone number to spoken form (Hindi romanized).
///
/// A phone number here is at least seven ASCII digits, grouped by at least
/// one separator (`-`, `.`, space, parentheses) and optionally introduced by
/// a single leading `+`. Each group is spelled digit by digit and groups are
/// joined with ", "; a leading `+` is spoken as "plus" in front of the first
/// group.
///
/// Returns `None` for anything else, including plain digit runs without a
/// separator and inputs with a `+` anywhere but the front.
pub fn parse(input: &str) -> Option<String> {
    let trimmed = input.trim();

    let (has_plus, after_plus) = match trimmed.strip_prefix('+') {
        Some(rest) => (true, rest),
        None => (false, trimmed),
    };

    // Past the optional leading '+', only digits and separators are allowed.
    if !after_plus
        .chars()
        .all(|c| c.is_ascii_digit() || is_phone_separator(c))
    {
        return None;
    }

    let digit_count = after_plus.chars().filter(char::is_ascii_digit).count();
    if digit_count < 7 {
        return None;
    }

    // A separator is required to tell a phone number from a plain number
    // such as "1000000".
    if !after_plus.chars().any(is_phone_separator) {
        return None;
    }

    let mut parts: Vec<String> = split_phone_groups(after_plus)
        .iter()
        .map(|group| spell_digit_group_hi(group))
        .collect();

    if has_plus {
        // The first group after '+' is the country code.
        if let Some(first) = parts.first_mut() {
            first.insert_str(0, "plus ");
        }
    }

    if parts.is_empty() {
        return None;
    }

    Some(parts.join(", "))
}

/// Split a phone number into its digit groups.
///
/// Separators end the current group; any other non-digit character is
/// dropped without ending it. Empty groups are not returned.
fn split_phone_groups(input: &str) -> Vec<String> {
    input
        .split(is_phone_separator)
        .map(|chunk| chunk.chars().filter(char::is_ascii_digit).collect::<String>())
        .filter(|group| !group.is_empty())
        .collect()
}

/// Spell each ASCII digit in a group using Hindi romanized words, separated
/// by single spaces. Non-digit characters are skipped.
fn spell_digit_group_hi(group: &str) -> String {
    const WORDS: [&str; 10] = [
        "shunya", "ek", "do", "teen", "chaar", "paanch", "chhah", "saat", "aath", "nau",
    ];

    group
        .chars()
        .filter_map(|c| c.to_digit(10))
        .map(|d| WORDS[d as usize])
        .collect::<Vec<_>>()
        .join(" ")
}
--- /dev/null +++ b/src/tts/hi/time.rs @@ -0,0 +1,159 @@ +//! Time TN tagger for Hindi (romanized). +//! +//! Converts written time expressions to spoken romanized Hindi: +//! - "14:30" → "chaudah baj kar tees minat" +//! - "2:00" → "do baje" +//! - "0:00" → "baarah baje raat ke" (midnight) +//! - "12:00" → "baarah baje dopahar ke" (noon) + +use super::number_to_words; + +/// Parse a written time expression to spoken romanized Hindi. +pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + + // Try "14:30" colon format + if let Some(result) = parse_colon_format(trimmed) { + return Some(result); + } + + // Try "14 baje" or "14 baj kar 30 minat" format + if let Some(result) = parse_baje_format(trimmed) { + return Some(result); + } + + None +} + +fn parse_colon_format(input: &str) -> Option { + if !input.contains(':') { + return None; + } + + let parts: Vec<&str> = input.splitn(2, ':').collect(); + if parts.len() != 2 { + return None; + } + + let hour_str = parts[0].trim(); + let min_str = parts[1].trim(); + + if !hour_str.chars().all(|c| c.is_ascii_digit()) || hour_str.is_empty() { + return None; + } + if !min_str.chars().all(|c| c.is_ascii_digit()) || min_str.is_empty() { + return None; + } + + let hour: u32 = hour_str.parse().ok()?; + let minute: u32 = min_str.parse().ok()?; + + if hour > 23 || minute > 59 { + return None; + } + + Some(format_time(hour, minute)) +} + +fn parse_baje_format(input: &str) -> Option { + let lower = input.to_lowercase(); + + // Match "X baje" or "X baj kar Y minat" + if !lower.contains("baj") { + return None; + } + + let baj_pos = lower.find("baj")?; + let hour_str = lower[..baj_pos].trim(); + + if !hour_str.chars().all(|c| c.is_ascii_digit()) || hour_str.is_empty() { + return None; + } + + let hour: u32 = hour_str.parse().ok()?; + if hour > 23 { + return None; + } + + // Check for minutes after "baj kar" + let after_baj = &lower[baj_pos..]; + if let Some(rest) = after_baj.strip_prefix("baj kar") { + let rest = rest.trim(); 
+ // Try to extract minutes: "30 minat" or just "30" + let min_str = rest + .trim_end_matches("minat") + .trim_end_matches("minute") + .trim(); + if !min_str.is_empty() && min_str.chars().all(|c| c.is_ascii_digit()) { + let minute: u32 = min_str.parse().ok()?; + if minute <= 59 { + return Some(format_time(hour, minute)); + } + } + } + + // Just "X baje" - no minutes + Some(format_time(hour, 0)) +} + +fn format_time(hour: u32, minute: u32) -> String { + let hour_words = number_to_words(hour as i64); + + if minute == 0 { + format!("{} baje", hour_words) + } else { + let minute_words = number_to_words(minute as i64); + format!("{} baj kar {} minat", hour_words, minute_words) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_colon_format() { + assert_eq!( + parse("14:30"), + Some("chaudah baj kar tees minat".to_string()) + ); + assert_eq!(parse("2:00"), Some("do baje".to_string())); + assert_eq!( + parse("8:15"), + Some("aath baj kar pandrah minat".to_string()) + ); + } + + #[test] + fn test_midnight_and_noon() { + assert_eq!(parse("0:00"), Some("shunya baje".to_string())); + assert_eq!(parse("12:00"), Some("baarah baje".to_string())); + assert_eq!(parse("0:30"), Some("shunya baj kar tees minat".to_string())); + } + + #[test] + fn test_baje_format() { + assert_eq!(parse("14 baje"), Some("chaudah baje".to_string())); + assert_eq!( + parse("8 baj kar 30 minat"), + Some("aath baj kar tees minat".to_string()) + ); + } + + #[test] + fn test_24h() { + assert_eq!(parse("14:00"), Some("chaudah baje".to_string())); + assert_eq!( + parse("23:59"), + Some("teis baj kar unsath minat".to_string()) + ); + } + + #[test] + fn test_invalid() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("25:00"), None); + assert_eq!(parse("12:60"), None); + assert_eq!(parse(""), None); + } +} diff --git a/src/tts/hi/whitelist.rs b/src/tts/hi/whitelist.rs new file mode 100644 index 0000000..d7c3f8f --- /dev/null +++ b/src/tts/hi/whitelist.rs @@ -0,0 +1,90 @@ +//! 
Whitelist TN tagger for Hindi (romanized). +//! +//! Lookup table for common abbreviations and titles with Hindi romanized output: +//! - "Dr." -> "daaktor" +//! - "Mr." -> "shri" +//! - "Mrs." -> "shreemati" +//! - "etc." -> "ityaadi" + +use lazy_static::lazy_static; +use std::collections::HashMap; + +lazy_static! { + static ref WHITELIST: HashMap<&'static str, &'static str> = { + let mut m = HashMap::new(); + // Titles + m.insert("Dr.", "daaktor"); + m.insert("Dr", "daaktor"); + m.insert("Mr.", "shri"); + m.insert("Mr", "shri"); + m.insert("Mrs.", "shreemati"); + m.insert("Mrs", "shreemati"); + m.insert("Ms.", "sushri"); + m.insert("Ms", "sushri"); + m.insert("Shri", "shri"); + m.insert("Smt.", "shreemati"); + m.insert("Prof.", "pradhyaapak"); + m.insert("St.", "sant"); + m.insert("Jr.", "kanishth"); + m.insert("Sr.", "varishth"); + + // Common abbreviations + m.insert("etc.", "ityaadi"); + m.insert("vs.", "banam"); + m.insert("vs", "banam"); + m.insert("No.", "sankhya"); + + // Units + m.insert("Km", "kilometre"); + + // Currency + m.insert("Rs.", "rupaye"); + m.insert("Rs", "rupaye"); + + m + }; +} + +/// Parse a whitelist abbreviation to its Hindi romanized spoken form. 
+pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + + // Direct lookup (case-sensitive) + if let Some(&spoken) = WHITELIST.get(trimmed) { + return Some(spoken.to_string()); + } + + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_titles() { + assert_eq!(parse("Dr."), Some("daaktor".to_string())); + assert_eq!(parse("Mr."), Some("shri".to_string())); + assert_eq!(parse("Mrs."), Some("shreemati".to_string())); + assert_eq!(parse("Ms."), Some("sushri".to_string())); + } + + #[test] + fn test_abbreviations() { + assert_eq!(parse("etc."), Some("ityaadi".to_string())); + assert_eq!(parse("vs."), Some("banam".to_string())); + assert_eq!(parse("Rs."), Some("rupaye".to_string())); + } + + #[test] + fn test_hindi_specific() { + assert_eq!(parse("Shri"), Some("shri".to_string())); + assert_eq!(parse("Smt."), Some("shreemati".to_string())); + } + + #[test] + fn test_no_match() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("world"), None); + } +} diff --git a/src/tts/ja/cardinal.rs b/src/tts/ja/cardinal.rs new file mode 100644 index 0000000..5c9db1c --- /dev/null +++ b/src/tts/ja/cardinal.rs @@ -0,0 +1,88 @@ +//! Cardinal TN tagger for Japanese (romaji output). +//! +//! Converts written cardinal numbers to spoken Japanese in romaji: +//! - "123" → "hyaku ni juu san" +//! - "-42" → "mainasu yon juu ni" +//! - "10000" → "ichi man" + +use super::number_to_words; + +/// Parse a written cardinal number to spoken Japanese words in romaji. +pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + if trimmed.is_empty() { + return None; + } + + let (is_negative, digits_part) = if let Some(rest) = trimmed.strip_prefix('-') { + (true, rest) + } else { + (false, trimmed) + }; + + // Must be digits (with optional commas, dots, or spaces as thousands separators) + if !digits_part + .chars() + .all(|c| c.is_ascii_digit() || c == ',' || c == '.' 
|| c == ' ' || c == '\u{a0}') + { + return None; + } + + if !digits_part.chars().any(|c| c.is_ascii_digit()) { + return None; + } + + // Strip thousands separators + let clean: String = digits_part.chars().filter(|c| c.is_ascii_digit()).collect(); + let n: i64 = clean.parse().ok()?; + + if is_negative { + Some(format!("mainasu {}", number_to_words(n))) + } else { + Some(number_to_words(n)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + assert_eq!(parse("0"), Some("zero".to_string())); + assert_eq!(parse("1"), Some("ichi".to_string())); + assert_eq!(parse("21"), Some("ni juu ichi".to_string())); + assert_eq!(parse("100"), Some("hyaku".to_string())); + assert_eq!(parse("123"), Some("hyaku ni juu san".to_string())); + } + + #[test] + fn test_thousands_separators() { + assert_eq!(parse("1 000"), Some("sen".to_string())); + assert_eq!(parse("1,000"), Some("sen".to_string())); + assert_eq!(parse("1.000"), Some("sen".to_string())); + assert_eq!(parse("1 000 000"), Some("hyaku man".to_string())); + } + + #[test] + fn test_negative() { + assert_eq!(parse("-42"), Some("mainasu yon juu ni".to_string())); + assert_eq!(parse("-300"), Some("mainasu sanbyaku".to_string())); + assert_eq!(parse("-1"), Some("mainasu ichi".to_string())); + assert_eq!(parse("-10000"), Some("mainasu ichi man".to_string())); + } + + #[test] + fn test_large_numbers() { + assert_eq!(parse("10000"), Some("ichi man".to_string())); + assert_eq!(parse("3000"), Some("sanzen".to_string())); + assert_eq!(parse("8000"), Some("hassen".to_string())); + } + + #[test] + fn test_non_numbers() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("12abc"), None); + assert_eq!(parse(""), None); + } +} diff --git a/src/tts/ja/date.rs b/src/tts/ja/date.rs new file mode 100644 index 0000000..ded8756 --- /dev/null +++ b/src/tts/ja/date.rs @@ -0,0 +1,355 @@ +//! Date TN tagger for Japanese (romaji output). +//! +//! Converts written date expressions to spoken Japanese in romaji: +//! 
- "2025年1月5日" → "ni sen ni juu go nen ichi gatsu itsuka" +//! - "2025-01-05" → "ni sen ni juu go nen ichi gatsu itsuka" +//! - "January 5, 2025" → "ni sen ni juu go nen ichi gatsu itsuka" + +use super::number_to_words; + +const MONTHS_EN: &[(&str, u32)] = &[ + ("january", 1), + ("february", 2), + ("march", 3), + ("april", 4), + ("may", 5), + ("june", 6), + ("july", 7), + ("august", 8), + ("september", 9), + ("october", 10), + ("november", 11), + ("december", 12), +]; + +/// Special day readings for Japanese dates. +/// Days 1-10 and some others have special kun'yomi readings. +fn day_to_romaji(day: u32) -> String { + match day { + 1 => "tsuitachi".to_string(), + 2 => "futsuka".to_string(), + 3 => "mikka".to_string(), + 4 => "yokka".to_string(), + 5 => "itsuka".to_string(), + 6 => "muika".to_string(), + 7 => "nanoka".to_string(), + 8 => "youka".to_string(), + 9 => "kokonoka".to_string(), + 10 => "tooka".to_string(), + 14 => "juu yokka".to_string(), + 20 => "hatsuka".to_string(), + 24 => "ni juu yokka".to_string(), + _ => format!("{} nichi", number_to_words(day as i64)), + } +} + +/// Month reading: number + "gatsu" +fn month_to_romaji(month: u32) -> String { + format!("{} gatsu", number_to_words(month as i64)) +} + +/// Parse a written date to spoken Japanese in romaji. 
+pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + + // Try decade: "1980s" → "sen kyuu hyaku hachi juu nen dai" + if let Some(result) = parse_decade(trimmed) { + return Some(result); + } + + // Try Japanese format: "2025年1月5日" + if let Some(result) = parse_japanese_date(trimmed) { + return Some(result); + } + + // Try English month format: "January 5, 2025" + if let Some(result) = parse_english_month_date(trimmed) { + return Some(result); + } + + // Try numeric YYYY-MM-DD or YYYY/MM/DD + if let Some(result) = parse_numeric_date(trimmed) { + return Some(result); + } + + None +} + +/// Parse decade: "1980s" → "sen kyuu hyaku hachi juu nen dai" (1980年代) +fn parse_decade(input: &str) -> Option { + let s = input.strip_suffix('s')?; + if s.len() != 4 || !s.chars().all(|c| c.is_ascii_digit()) { + return None; + } + + let year: u32 = s.parse().ok()?; + if year < 1000 { + return None; + } + + // Must be a round decade (ends in 0) + if year % 10 != 0 { + return None; + } + + // Japanese: year + "nen dai" (年代) + let year_words = number_to_words(year as i64); + Some(format!("{} nen dai", year_words)) +} + +fn parse_japanese_date(input: &str) -> Option { + // Pattern: YYYY年M月D日 + let nen_pos = input.find('\u{5E74}')?; // 年 + let year_str = &input[..nen_pos]; + if !year_str.chars().all(|c| c.is_ascii_digit()) || year_str.is_empty() { + return None; + } + let year: u32 = year_str.parse().ok()?; + + let after_nen = &input[nen_pos + '\u{5E74}'.len_utf8()..]; + + let gatsu_pos = after_nen.find('\u{6708}')?; // 月 + let month_str = &after_nen[..gatsu_pos]; + if !month_str.chars().all(|c| c.is_ascii_digit()) || month_str.is_empty() { + return None; + } + let month: u32 = month_str.parse().ok()?; + if month == 0 || month > 12 { + return None; + } + + let after_gatsu = &after_nen[gatsu_pos + '\u{6708}'.len_utf8()..]; + + // Day part: may or may not end with 日 + let day_str = if let Some(nichi_pos) = after_gatsu.find('\u{65E5}') { + // 日 + &after_gatsu[..nichi_pos] + 
} else { + after_gatsu.trim() + }; + + if !day_str.chars().all(|c| c.is_ascii_digit()) || day_str.is_empty() { + return None; + } + let day: u32 = day_str.parse().ok()?; + if day == 0 || day > 31 { + return None; + } + + let year_words = number_to_words(year as i64); + let month_words = month_to_romaji(month); + let day_words = day_to_romaji(day); + + Some(format!("{} nen {} {}", year_words, month_words, day_words)) +} + +fn parse_english_month_date(input: &str) -> Option { + let lower = input.to_lowercase(); + + let mut month_num = None; + let mut rest = ""; + for &(name, num) in MONTHS_EN { + if let Some(r) = lower.strip_prefix(name) { + if r.is_empty() || r.starts_with(' ') || r.starts_with(',') { + month_num = Some(num); + rest = r.trim_start_matches(|c: char| c == ' ' || c == ','); + break; + } + } + } + + let month_num = month_num?; + if rest.is_empty() { + return None; + } + + // Parse day + let (day_str, year_part) = if let Some(comma_pos) = rest.find(',') { + (&rest[..comma_pos], Some(rest[comma_pos + 1..].trim())) + } else { + let parts: Vec<&str> = rest.splitn(2, ' ').collect(); + if parts.len() == 2 + && parts[0] + .trim_end_matches("st") + .trim_end_matches("nd") + .trim_end_matches("rd") + .trim_end_matches("th") + .chars() + .all(|c| c.is_ascii_digit()) + { + let year_clean = + parts[1].trim_end_matches(|c: char| c == '.' || c == ',' || c == '!' 
|| c == '?'); + if year_clean.chars().all(|c| c.is_ascii_digit()) && year_clean.len() == 4 { + (parts[0], Some(year_clean)) + } else { + (rest, None) + } + } else { + (rest, None) + } + }; + + let day_digits = day_str + .trim() + .trim_end_matches("st") + .trim_end_matches("nd") + .trim_end_matches("rd") + .trim_end_matches("th"); + + if !day_digits.chars().all(|c| c.is_ascii_digit()) || day_digits.is_empty() { + return None; + } + + let day: u32 = day_digits.parse().ok()?; + if day == 0 || day > 31 { + return None; + } + + let month_words = month_to_romaji(month_num); + let day_words = day_to_romaji(day); + + if let Some(year_str) = year_part { + let year_str = year_str + .trim() + .trim_end_matches(|c: char| c == '.' || c == ',' || c == '!' || c == '?'); + if !year_str.is_empty() && year_str.chars().all(|c| c.is_ascii_digit()) { + let year: u32 = year_str.parse().ok()?; + let year_words = number_to_words(year as i64); + return Some(format!("{} nen {} {}", year_words, month_words, day_words)); + } + } + + Some(format!("{} {}", month_words, day_words)) +} + +/// Parse numeric date in YYYY-MM-DD or YYYY/MM/DD format. 
+fn parse_numeric_date(input: &str) -> Option { + let sep = if input.contains('/') { + '/' + } else if input.contains('-') && input.chars().filter(|c| *c == '-').count() == 2 { + '-' + } else { + return None; + }; + + let parts: Vec<&str> = input.splitn(3, sep).collect(); + if parts.len() != 3 { + return None; + } + + if !parts + .iter() + .all(|p| !p.is_empty() && p.chars().all(|c| c.is_ascii_digit())) + { + return None; + } + + // Assume YYYY-MM-DD (ISO format, common in Japan) + let year: u32 = parts[0].parse().ok()?; + let month_num: u32 = parts[1].parse().ok()?; + let day: u32 = parts[2].parse().ok()?; + + if month_num == 0 || month_num > 12 || day == 0 || day > 31 { + return None; + } + + // Reject if first part looks like a day (1-31) rather than a year + if year <= 31 { + return None; + } + + let year_words = number_to_words(year as i64); + let month_words = month_to_romaji(month_num); + let day_words = day_to_romaji(day); + + Some(format!("{} nen {} {}", year_words, month_words, day_words)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_japanese_date() { + assert_eq!( + parse("2025\u{5E74}1\u{6708}5\u{65E5}"), + Some("ni sen ni juu go nen ichi gatsu itsuka".to_string()) + ); + assert_eq!( + parse("2025\u{5E74}3\u{6708}14\u{65E5}"), + Some("ni sen ni juu go nen san gatsu juu yokka".to_string()) + ); + assert_eq!( + parse("2025\u{5E74}12\u{6708}20\u{65E5}"), + Some("ni sen ni juu go nen juu ni gatsu hatsuka".to_string()) + ); + } + + #[test] + fn test_numeric_date() { + assert_eq!( + parse("2025-01-05"), + Some("ni sen ni juu go nen ichi gatsu itsuka".to_string()) + ); + assert_eq!( + parse("2025/03/01"), + Some("ni sen ni juu go nen san gatsu tsuitachi".to_string()) + ); + } + + #[test] + fn test_english_month() { + assert_eq!( + parse("January 5, 2025"), + Some("ni sen ni juu go nen ichi gatsu itsuka".to_string()) + ); + assert_eq!( + parse("March 14, 2025"), + Some("ni sen ni juu go nen san gatsu juu yokka".to_string()) + ); + } + + 
#[test] + fn test_special_days() { + assert_eq!( + parse("2025\u{5E74}1\u{6708}1\u{65E5}"), + Some("ni sen ni juu go nen ichi gatsu tsuitachi".to_string()) + ); + assert_eq!( + parse("2025\u{5E74}1\u{6708}10\u{65E5}"), + Some("ni sen ni juu go nen ichi gatsu tooka".to_string()) + ); + assert_eq!( + parse("2025\u{5E74}1\u{6708}24\u{65E5}"), + Some("ni sen ni juu go nen ichi gatsu ni juu yokka".to_string()) + ); + } + + #[test] + fn test_decade() { + assert_eq!( + parse("1980s"), + Some("sen kyuu hyaku hachi juu nen dai".to_string()) + ); + assert_eq!(parse("2000s"), Some("ni sen nen dai".to_string())); + assert_eq!( + parse("1990s"), + Some("sen kyuu hyaku kyuu juu nen dai".to_string()) + ); + } + + #[test] + fn test_year_verbalization() { + assert_eq!(number_to_words(2025), "ni sen ni juu go".to_string()); + assert_eq!(number_to_words(2000), "ni sen".to_string()); + assert_eq!(number_to_words(1990), "sen kyuu hyaku kyuu juu".to_string()); + assert_eq!(number_to_words(1900), "sen kyuu hyaku".to_string()); + } + + #[test] + fn test_invalid() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("123"), None); + assert_eq!(parse(""), None); + } +} diff --git a/src/tts/ja/decimal.rs b/src/tts/ja/decimal.rs new file mode 100644 index 0000000..48289c5 --- /dev/null +++ b/src/tts/ja/decimal.rs @@ -0,0 +1,126 @@ +//! Decimal TN tagger for Japanese (romaji output). +//! +//! Converts written decimal numbers to spoken Japanese in romaji: +//! - "3.14" → "san ten ichi yon" +//! - "0.5" → "zero ten go" +//! - "-2.7" → "mainasu ni ten nana" + +use super::{number_to_words, spell_digits}; + +/// Japanese quantity suffixes recognized after a decimal number. +/// oku (億) = hundred million, man (万) = ten thousand +const QUANTITY_SUFFIXES: &[&str] = &["oku", "man"]; + +/// Parse a written decimal number to spoken Japanese in romaji. +/// +/// Uses "ten" (点) as the decimal point word. 
+pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + if trimmed.is_empty() { + return None; + } + + // Check for quantity suffix: "1.5 man" + let (number_part, suffix) = extract_suffix(trimmed); + + // Accept both period and comma as decimal separator + let sep = if number_part.contains('.') { + '.' + } else if number_part.contains(',') { + ',' + } else { + return None; + }; + + let parts: Vec<&str> = number_part.splitn(2, sep).collect(); + if parts.len() != 2 { + return None; + } + + let int_str = parts[0]; + let frac_str = parts[1]; + + let (is_negative, int_digits) = if let Some(rest) = int_str.strip_prefix('-') { + (true, rest) + } else { + (false, int_str) + }; + + if !int_digits.chars().all(|c| c.is_ascii_digit()) { + return None; + } + if frac_str.is_empty() || !frac_str.chars().all(|c| c.is_ascii_digit()) { + return None; + } + + let int_val: i64 = if int_digits.is_empty() { + 0 + } else { + int_digits.parse().ok()? + }; + + let int_words = number_to_words(int_val); + let frac_words = spell_digits(frac_str); + + let mut result = if is_negative { + format!("mainasu {} ten {}", int_words, frac_words) + } else { + format!("{} ten {}", int_words, frac_words) + }; + + if let Some(suf) = suffix { + result.push(' '); + result.push_str(suf); + } + + Some(result) +} + +/// Extract a quantity suffix from the end if present. 
+fn extract_suffix(input: &str) -> (&str, Option<&str>) { + for &suf in QUANTITY_SUFFIXES { + if let Some(before) = input.strip_suffix(suf) { + let before = before.trim_end(); + if !before.is_empty() { + return (before, Some(suf)); + } + } + } + (input, None) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic_decimal() { + assert_eq!(parse("3.14"), Some("san ten ichi yon".to_string())); + assert_eq!(parse("0.5"), Some("zero ten go".to_string())); + assert_eq!(parse("1.0"), Some("ichi ten zero".to_string())); + } + + #[test] + fn test_negative_decimal() { + assert_eq!(parse("-3.14"), Some("mainasu san ten ichi yon".to_string())); + assert_eq!(parse("-0.5"), Some("mainasu zero ten go".to_string())); + } + + #[test] + fn test_comma_decimal() { + assert_eq!(parse("3,14"), Some("san ten ichi yon".to_string())); + } + + #[test] + fn test_with_quantity() { + assert_eq!(parse("1.5 man"), Some("ichi ten go man".to_string())); + assert_eq!(parse("4.85 oku"), Some("yon ten hachi go oku".to_string())); + } + + #[test] + fn test_non_decimal() { + assert_eq!(parse("123"), None); + assert_eq!(parse("hello"), None); + assert_eq!(parse(""), None); + } +} diff --git a/src/tts/ja/electronic.rs b/src/tts/ja/electronic.rs new file mode 100644 index 0000000..efe1bdc --- /dev/null +++ b/src/tts/ja/electronic.rs @@ -0,0 +1,162 @@ +//! Electronic TN tagger for Japanese (romaji output). +//! +//! Converts written emails and URLs to spoken Japanese romaji form: +//! - "test@gmail.com" → "t e s t atto g m a i l dotto c o m" +//! - "https://example.com" → "h t t p s koron surasshu surasshu e x a m p l e dotto c o m" + +/// Parse an email or URL to spoken Japanese romaji form. +pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + if trimmed.is_empty() { + return None; + } + + // Email detection: contains @ with text on both sides + if trimmed.contains('@') { + return parse_email(trimmed); + } + + // URL detection: starts with http://, https://, or www. 
+ let lower = trimmed.to_lowercase(); + if lower.starts_with("http://") || lower.starts_with("https://") || lower.starts_with("www.") { + return parse_url(trimmed); + } + + None +} + +/// Parse an email address to spoken Japanese romaji form. +fn parse_email(input: &str) -> Option { + let parts: Vec<&str> = input.splitn(2, '@').collect(); + if parts.len() != 2 || parts[0].is_empty() || parts[1].is_empty() { + return None; + } + + let local = spell_domain(parts[0]); + let domain = spell_domain(parts[1]); + + Some(format!("{} atto {}", local, domain)) +} + +/// Parse a URL to spoken Japanese romaji form. +fn parse_url(input: &str) -> Option { + let mut result = String::new(); + let lower = input.to_lowercase(); + + let rest = if lower.starts_with("https://") { + result.push_str("h t t p s koron surasshu surasshu"); + &input["https://".len()..] + } else if lower.starts_with("http://") { + result.push_str("h t t p koron surasshu surasshu"); + &input["http://".len()..] + } else { + input + }; + + if !result.is_empty() && !rest.is_empty() { + result.push(' '); + } + + result.push_str(&spell_domain(rest)); + + Some(result) +} + +/// Spell out a domain name, using "dotto" for periods. +fn spell_domain(domain: &str) -> String { + let parts: Vec<&str> = domain.split('.').collect(); + let spelled: Vec = parts.iter().map(|p| spell_electronic(p)).collect(); + spelled.join(" dotto ") +} + +/// Spell out an electronic string in Japanese romaji. +/// +/// Letters are spelled individually (lowercase). +/// Digits use Japanese romaji words. +/// Special characters use Japanese connector words. 
+fn spell_electronic(s: &str) -> String { + let mut parts: Vec = Vec::new(); + + for c in s.chars() { + match c { + '-' => parts.push("haifen".to_string()), + '_' => parts.push("anda baa".to_string()), + '/' => parts.push("surasshu".to_string()), + '~' => parts.push("chiruda".to_string()), + ':' => parts.push("koron".to_string()), + c if c.is_ascii_alphabetic() => { + parts.push(c.to_lowercase().to_string()); + } + c if c.is_ascii_digit() => { + parts.push(digit_word(c)); + } + _ => { + // Skip unknown characters + } + } + } + + parts.join(" ") +} + +fn digit_word(c: char) -> String { + match c { + '0' => "zero", + '1' => "ichi", + '2' => "ni", + '3' => "san", + '4' => "yon", + '5' => "go", + '6' => "roku", + '7' => "nana", + '8' => "hachi", + '9' => "kyuu", + _ => "", + } + .to_string() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_email() { + assert_eq!( + parse("test@gmail.com"), + Some("t e s t atto g m a i l dotto c o m".to_string()) + ); + assert_eq!( + parse("user123@example.co.jp"), + Some("u s e r ichi ni san atto e x a m p l e dotto c o dotto j p".to_string()) + ); + } + + #[test] + fn test_url_http() { + assert_eq!( + parse("http://www.example.com"), + Some( + "h t t p koron surasshu surasshu w w w dotto e x a m p l e dotto c o m".to_string() + ) + ); + assert_eq!( + parse("https://google.com"), + Some("h t t p s koron surasshu surasshu g o o g l e dotto c o m".to_string()) + ); + } + + #[test] + fn test_www_url() { + assert_eq!( + parse("www.example.com"), + Some("w w w dotto e x a m p l e dotto c o m".to_string()) + ); + } + + #[test] + fn test_non_electronic() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("123"), None); + } +} diff --git a/src/tts/ja/measure.rs b/src/tts/ja/measure.rs new file mode 100644 index 0000000..7c5bffc --- /dev/null +++ b/src/tts/ja/measure.rs @@ -0,0 +1,223 @@ +//! Measure TN tagger for Japanese (romaji output). +//! +//! Converts written measurements to spoken Japanese in romaji: +//! 
- "200 km/h" → "ni hyaku kiromeetoru mai ji" +//! - "1 kg" → "ichi kiroguramu" +//! - "25°C" → "ni juu go do" +//! - "50%" → "go juu paasento" + +use super::number_to_words; + +use lazy_static::lazy_static; +use std::collections::HashMap; + +struct UnitInfo { + /// Japanese unit name in romaji (no singular/plural distinction in Japanese) + name: &'static str, +} + +lazy_static! { + static ref UNITS: HashMap<&'static str, UnitInfo> = { + let mut m = HashMap::new(); + + // Length + m.insert("mm", UnitInfo { name: "mirimeetoru" }); + m.insert("cm", UnitInfo { name: "senchimeetoru" }); + m.insert("m", UnitInfo { name: "meetoru" }); + m.insert("km", UnitInfo { name: "kiromeetoru" }); + m.insert("in", UnitInfo { name: "inchi" }); + m.insert("ft", UnitInfo { name: "fiito" }); + m.insert("mi", UnitInfo { name: "mairu" }); + + // Weight + m.insert("mg", UnitInfo { name: "miriguramu" }); + m.insert("g", UnitInfo { name: "guramu" }); + m.insert("kg", UnitInfo { name: "kiroguramu" }); + m.insert("lb", UnitInfo { name: "pondo" }); + m.insert("oz", UnitInfo { name: "onsu" }); + m.insert("t", UnitInfo { name: "ton" }); + + // Volume + m.insert("ml", UnitInfo { name: "miririttoru" }); + m.insert("l", UnitInfo { name: "rittoru" }); + m.insert("L", UnitInfo { name: "rittoru" }); + + // Speed + m.insert("km/h", UnitInfo { name: "kiromeetoru mai ji" }); + m.insert("mph", UnitInfo { name: "mairu mai ji" }); + m.insert("m/s", UnitInfo { name: "meetoru mai byou" }); + + // Time + m.insert("s", UnitInfo { name: "byou" }); + m.insert("sec", UnitInfo { name: "byou" }); + m.insert("min", UnitInfo { name: "fun" }); + m.insert("h", UnitInfo { name: "jikan" }); + m.insert("hr", UnitInfo { name: "jikan" }); + + // Temperature + m.insert("\u{00B0}C", UnitInfo { name: "do" }); + m.insert("\u{00B0}F", UnitInfo { name: "do" }); + + // Data + m.insert("KB", UnitInfo { name: "kirobaito" }); + m.insert("MB", UnitInfo { name: "megabaito" }); + m.insert("GB", UnitInfo { name: "gigabaito" }); + 
m.insert("TB", UnitInfo { name: "terabaito" }); + + // Percentage + m.insert("%", UnitInfo { name: "paasento" }); + + // Frequency + m.insert("Hz", UnitInfo { name: "herutsu" }); + m.insert("kHz", UnitInfo { name: "kiroherutsu" }); + m.insert("MHz", UnitInfo { name: "megaherutsu" }); + m.insert("GHz", UnitInfo { name: "gigaherutsu" }); + + m + }; +} + +/// Parse a written measurement to spoken Japanese in romaji. +pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + if trimmed.is_empty() { + return None; + } + + let mut unit_matches: Vec<(&str, &UnitInfo)> = UNITS + .iter() + .filter(|(unit, _)| { + trimmed.ends_with(*unit) + && (trimmed.len() == unit.len() || { + let before = &trimmed[..trimmed.len() - unit.len()]; + if unit.len() == 1 && unit.chars().all(|c| c.is_ascii_alphabetic()) { + before.ends_with(' ') + } else { + before.ends_with(' ') || before.ends_with(|c: char| c.is_ascii_digit()) + } + }) + }) + .map(|(k, v)| (*k, v)) + .collect(); + + // Sort by unit length descending to prefer longer matches (km/h over h) + unit_matches.sort_by(|a, b| b.0.len().cmp(&a.0.len())); + + for (unit_str, unit_info) in unit_matches { + let num_part = trimmed[..trimmed.len() - unit_str.len()].trim(); + if num_part.is_empty() { + continue; + } + + let (is_negative, digits) = if let Some(rest) = num_part.strip_prefix('-') { + (true, rest.trim()) + } else { + (false, num_part) + }; + + let clean: String = digits + .chars() + .filter(|c| c.is_ascii_digit() || *c == '.' || *c == ',') + .collect(); + + if clean.is_empty() + || !clean + .chars() + .all(|c| c.is_ascii_digit() || c == '.' || c == ',') + { + continue; + } + + // Handle decimals + let decimal_sep = if clean.contains(',') { ',' } else { '.' 
}; + if clean.contains(decimal_sep) { + let parts: Vec<&str> = clean.splitn(2, decimal_sep).collect(); + if parts.len() == 2 { + let int_val: i64 = if parts[0].is_empty() { + 0 + } else { + let Ok(v) = parts[0].parse::() else { + continue; + }; + v + }; + let int_words = number_to_words(int_val); + let frac_words = super::spell_digits(parts[1]); + let num_words = if is_negative { + format!("mainasu {} ten {}", int_words, frac_words) + } else { + format!("{} ten {}", int_words, frac_words) + }; + return Some(format!("{} {}", num_words, unit_info.name)); + } + continue; + } + + let Ok(n) = clean.parse::() else { + continue; + }; + let num_words = if is_negative { + format!("mainasu {}", number_to_words(n)) + } else { + number_to_words(n) + }; + + return Some(format!("{} {}", num_words, unit_info.name)); + } + + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + assert_eq!( + parse("200 km/h"), + Some("ni hyaku kiromeetoru mai ji".to_string()) + ); + assert_eq!(parse("1 kg"), Some("ichi kiroguramu".to_string())); + assert_eq!(parse("5 km"), Some("go kiromeetoru".to_string())); + } + + #[test] + fn test_temperature() { + assert_eq!(parse("25\u{00B0}C"), Some("ni juu go do".to_string())); + assert_eq!(parse("0\u{00B0}C"), Some("zero do".to_string())); + } + + #[test] + fn test_percentage() { + assert_eq!(parse("50%"), Some("go juu paasento".to_string())); + assert_eq!(parse("100%"), Some("hyaku paasento".to_string())); + } + + #[test] + fn test_negative() { + assert_eq!(parse("-5\u{00B0}C"), Some("mainasu go do".to_string())); + assert_eq!( + parse("-66 kg"), + Some("mainasu roku juu roku kiroguramu".to_string()) + ); + } + + #[test] + fn test_data() { + assert_eq!(parse("500 MB"), Some("go hyaku megabaito".to_string())); + assert_eq!(parse("1 GB"), Some("ichi gigabaito".to_string())); + } + + #[test] + fn test_decimal_with_empty_integer() { + assert_eq!(parse(".5 kg"), Some("zero ten go kiroguramu".to_string())); + } + + #[test] + fn 
test_non_measure() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("123"), None); + assert_eq!(parse(""), None); + } +} diff --git a/src/tts/ja/mod.rs b/src/tts/ja/mod.rs new file mode 100644 index 0000000..0aa2633 --- /dev/null +++ b/src/tts/ja/mod.rs @@ -0,0 +1,254 @@ +//! Text Normalization taggers for Japanese (romaji output). +//! +//! Converts written-form text to spoken Japanese in romaji: +//! - "200" → "ni hyaku" +//! - "5000円" → "go sen en" +//! - "2025年1月5日" → "ni sen ni juu go nen ichi gatsu itsuka" + +pub mod cardinal; +pub mod date; +pub mod decimal; +pub mod electronic; +pub mod measure; +pub mod money; +pub mod ordinal; +pub mod telephone; +pub mod time; +pub mod whitelist; + +/// Digit words indexed by value (0..10). +const ONES: [&str; 10] = [ + "zero", "ichi", "ni", "san", "yon", "go", "roku", "nana", "hachi", "kyuu", +]; + +/// Convert an integer to Japanese words in romaji. +/// +/// Groups by man (10,000) and oku (100,000,000) following the Japanese +/// number system. 
Handles special sound changes (rendaku): +/// - 300 = sanbyaku, 600 = roppyaku, 800 = happyaku +/// - 3000 = sanzen, 8000 = hassen +/// +/// Examples: +/// - `0` → `"zero"` +/// - `21` → `"ni juu ichi"` +/// - `123` → `"hyaku ni juu san"` +/// - `10000` → `"ichi man"` +/// - `-42` → `"mainasu yon juu ni"` +pub fn number_to_words(n: i64) -> String { + if n == 0 { + return "zero".to_string(); + } + + if n < 0 { + let abs_val = (n as u64).wrapping_neg(); + return format!("mainasu {}", unsigned_to_words(abs_val)); + } + + unsigned_to_words(n as u64) +} + +fn unsigned_to_words(n: u64) -> String { + if n == 0 { + return "zero".to_string(); + } + + let mut parts: Vec = Vec::new(); + let mut remaining = n; + + // Japanese groups by man (10,000) and oku (100,000,000) + + // Process oku groups + if remaining >= 100_000_000 { + let oku_count = remaining / 100_000_000; + remaining %= 100_000_000; + // oku_count itself could be large, convert it recursively using sub-man grouping + let oku_words = sub_oku_to_words(oku_count); + parts.push(format!("{} oku", oku_words)); + } + + // Process man groups + if remaining >= 10_000 { + let man_count = remaining / 10_000; + remaining %= 10_000; + let man_words = chunk_to_words(man_count as u32); + parts.push(format!("{} man", man_words)); + } + + // Remainder (0..9999) + if remaining > 0 { + parts.push(chunk_to_words(remaining as u32)); + } + + parts.join(" ") +} + +/// Convert a number that will precede "oku" — it could be up to 9999 man range +/// but for oku prefix we just need 1..9999 range of sub-man grouping. 
+fn sub_oku_to_words(n: u64) -> String { + if n == 0 { + return "zero".to_string(); + } + + let mut parts: Vec = Vec::new(); + let mut remaining = n; + + // The oku prefix could itself be in the man range + if remaining >= 10_000 { + let man_count = remaining / 10_000; + remaining %= 10_000; + let man_words = chunk_to_words(man_count as u32); + parts.push(format!("{} man", man_words)); + } + + if remaining > 0 { + parts.push(chunk_to_words(remaining as u32)); + } + + parts.join(" ") +} + +/// Convert a number 1..9999 to Japanese words in romaji. +/// Handles sen (1000), hyaku (100), juu (10), and ones. +fn chunk_to_words(n: u32) -> String { + debug_assert!(n > 0 && n <= 9999); + let mut parts: Vec = Vec::new(); + + let thousands = n / 1000; + let rest_after_thou = n % 1000; + let hundreds = rest_after_thou / 100; + let rest_after_hund = rest_after_thou % 100; + let tens = rest_after_hund / 10; + let ones = rest_after_hund % 10; + + // Thousands (sen) with special sound changes + if thousands > 0 { + parts.push(sen_word(thousands)); + } + + // Hundreds (hyaku) with special sound changes + if hundreds > 0 { + parts.push(hyaku_word(hundreds)); + } + + // Tens (juu) + if tens > 0 { + if tens == 1 { + parts.push("juu".to_string()); + } else { + parts.push(format!("{} juu", ONES[tens as usize])); + } + } + + // Ones + if ones > 0 { + parts.push(ONES[ones as usize].to_string()); + } + + parts.join(" ") +} + +/// Convert thousands digit to the appropriate sen form. +/// Special: 3000=sanzen, 8000=hassen, 1000=sen +fn sen_word(thousands: u32) -> String { + match thousands { + 1 => "sen".to_string(), + 3 => "sanzen".to_string(), + 8 => "hassen".to_string(), + _ => format!("{} sen", ONES[thousands as usize]), + } +} + +/// Convert hundreds digit to the appropriate hyaku form. 
+/// Special: 300=sanbyaku, 600=roppyaku, 800=happyaku, 100=hyaku +fn hyaku_word(hundreds: u32) -> String { + match hundreds { + 1 => "hyaku".to_string(), + 3 => "sanbyaku".to_string(), + 6 => "roppyaku".to_string(), + 8 => "happyaku".to_string(), + _ => format!("{} hyaku", ONES[hundreds as usize]), + } +} + +/// Spell each digit of a string individually in Japanese romaji. +/// +/// "14" → "ichi yon" +pub fn spell_digits(s: &str) -> String { + s.chars() + .filter_map(|c| c.to_digit(10).map(|d| ONES[d as usize])) + .collect::>() + .join(" ") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + assert_eq!(number_to_words(0), "zero"); + assert_eq!(number_to_words(1), "ichi"); + assert_eq!(number_to_words(5), "go"); + assert_eq!(number_to_words(10), "juu"); + assert_eq!(number_to_words(11), "juu ichi"); + assert_eq!(number_to_words(15), "juu go"); + assert_eq!(number_to_words(20), "ni juu"); + assert_eq!(number_to_words(21), "ni juu ichi"); + assert_eq!(number_to_words(99), "kyuu juu kyuu"); + } + + #[test] + fn test_hundreds() { + assert_eq!(number_to_words(100), "hyaku"); + assert_eq!(number_to_words(200), "ni hyaku"); + assert_eq!(number_to_words(300), "sanbyaku"); + assert_eq!(number_to_words(600), "roppyaku"); + assert_eq!(number_to_words(800), "happyaku"); + assert_eq!(number_to_words(123), "hyaku ni juu san"); + assert_eq!(number_to_words(999), "kyuu hyaku kyuu juu kyuu"); + } + + #[test] + fn test_thousands() { + assert_eq!(number_to_words(1000), "sen"); + assert_eq!(number_to_words(2000), "ni sen"); + assert_eq!(number_to_words(3000), "sanzen"); + assert_eq!(number_to_words(8000), "hassen"); + assert_eq!(number_to_words(1500), "sen go hyaku"); + assert_eq!(number_to_words(9999), "kyuu sen kyuu hyaku kyuu juu kyuu"); + } + + #[test] + fn test_man() { + assert_eq!(number_to_words(10000), "ichi man"); + assert_eq!(number_to_words(20000), "ni man"); + assert_eq!(number_to_words(50000), "go man"); + assert_eq!( + 
number_to_words(12345), + "ichi man ni sen sanbyaku yon juu go" + ); + } + + #[test] + fn test_oku() { + assert_eq!(number_to_words(100_000_000), "ichi oku"); + assert_eq!(number_to_words(200_000_000), "ni oku"); + assert_eq!( + number_to_words(123_456_789), + "ichi oku ni sen sanbyaku yon juu go man roku sen nana hyaku hachi juu kyuu" + ); + } + + #[test] + fn test_negative() { + assert_eq!(number_to_words(-42), "mainasu yon juu ni"); + assert_eq!(number_to_words(-1000), "mainasu sen"); + } + + #[test] + fn test_spell_digits() { + assert_eq!(spell_digits("14"), "ichi yon"); + assert_eq!(spell_digits("0"), "zero"); + assert_eq!(spell_digits("987"), "kyuu hachi nana"); + } +} diff --git a/src/tts/ja/money.rs b/src/tts/ja/money.rs new file mode 100644 index 0000000..c67da45 --- /dev/null +++ b/src/tts/ja/money.rs @@ -0,0 +1,291 @@ +//! Money TN tagger for Japanese (romaji output). +//! +//! Converts written currency expressions to spoken Japanese in romaji: +//! - "¥100" → "hyaku en" +//! - "¥1500" → "sen go hyaku en" +//! - "$5.50" → "go doru go juu sento" +//! - "€100" → "hyaku yuuro" + +use super::number_to_words; + +/// Scale suffixes recognized after a currency amount. +/// oku (億) = hundred million, man (万) = ten thousand +const SCALE_SUFFIXES: &[&str] = &["oku", "man"]; + +/// Japanese has no singular/plural distinction, so we use a single name per currency. +struct Currency { + name: &'static str, + cent_name: &'static str, +} + +const YEN: Currency = Currency { + name: "en", + cent_name: "", +}; + +const DOLLAR: Currency = Currency { + name: "doru", + cent_name: "sento", +}; + +const EURO: Currency = Currency { + name: "yuuro", + cent_name: "sento", +}; + +const POUND: Currency = Currency { + name: "pondo", + cent_name: "pensu", +}; + +/// Parse a written money expression to spoken Japanese in romaji. 
+pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + if trimmed.is_empty() { + return None; + } + + // Try suffix symbol: "100円", "100 円" + if let Some(result) = parse_suffix_currency(trimmed) { + return Some(result); + } + + // Try prefix symbol: "¥100", "$5.50", "€100", "£1" + if let Some(result) = parse_prefix_currency(trimmed) { + return Some(result); + } + + None +} + +fn parse_suffix_currency(input: &str) -> Option { + // Handle 円 suffix + let amount_str = input.strip_suffix('\u{5186}')?; // 円 + let amount_str = amount_str.trim(); + parse_amount(amount_str, &YEN) +} + +fn parse_prefix_currency(input: &str) -> Option { + let (currency, rest) = if let Some(r) = input.strip_prefix('\u{00A5}') { + // ¥ + (&YEN, r) + } else if let Some(r) = input.strip_prefix('$') { + (&DOLLAR, r) + } else if let Some(r) = input.strip_prefix('\u{20AC}') { + // € + (&EURO, r) + } else if let Some(r) = input.strip_prefix('\u{00A3}') { + // £ + (&POUND, r) + } else { + return None; + }; + + let rest = rest.trim(); + if rest.is_empty() { + return None; + } + + // Check for scale suffix: "¥2.5 man" (2.5万円) + let (amount_str, scale) = extract_scale(rest); + + // Without a scale suffix, the amount must be purely numeric + if scale.is_none() + && !amount_str + .chars() + .all(|c| c.is_ascii_digit() || c == '.' || c == ',' || c == ' ') + { + return None; + } + + if let Some(scale_word) = scale { + // With scale: "¥2.5 man" → "ni ten go man en" + let sep = if amount_str.contains('.') { + '.' 
+ } else if amount_str.contains(',') { + ',' + } else { + // No decimal: "¥50 man" → "go juu man en" + let clean: String = amount_str.chars().filter(|c| c.is_ascii_digit()).collect(); + let n: i64 = clean.parse().ok()?; + let words = number_to_words(n); + return Some(format!("{} {} {}", words, scale_word, currency.name)); + }; + + let parts: Vec<&str> = amount_str.splitn(2, sep).collect(); + if parts.len() == 2 { + let int_val: i64 = parts[0].parse().ok()?; + let int_words = number_to_words(int_val); + let frac_words = super::spell_digits(parts[1]); + return Some(format!( + "{} ten {} {} {}", + int_words, frac_words, scale_word, currency.name + )); + } + } + + parse_amount(amount_str, currency) +} + +/// Extract scale suffix from the amount string. +fn extract_scale(input: &str) -> (&str, Option<&str>) { + for &scale in SCALE_SUFFIXES { + if let Some(before) = input.strip_suffix(scale) { + let before = before.trim_end(); + if !before.is_empty() + && before + .chars() + .all(|c| c.is_ascii_digit() || c == '.' || c == ',' || c == ' ') + { + return (before, Some(scale)); + } + } + } + (input, None) +} + +fn parse_amount(amount_str: &str, currency: &Currency) -> Option { + if amount_str.is_empty() { + return None; + } + + // Determine decimal separator + let sep = if amount_str.contains('.') { + '.' + } else if amount_str.contains(',') { + ',' + } else { + // No decimal — whole amount only + let clean: String = amount_str.chars().filter(|c| c.is_ascii_digit()).collect(); + let n: i64 = clean.parse().ok()?; + return Some(format_currency(n, 0, currency)); + }; + + let parts: Vec<&str> = amount_str.splitn(2, sep).collect(); + if parts.len() == 2 { + let int_clean: String = parts[0].chars().filter(|c| c.is_ascii_digit()).collect(); + let main_amount: i64 = if int_clean.is_empty() { + 0 + } else { + int_clean.parse().ok()? + }; + + let cents_str = parts[1].trim(); + let cents: i64 = if cents_str.is_empty() { + 0 + } else if cents_str.len() == 1 { + cents_str.parse::().ok()? 
* 10 + } else if cents_str.len() == 2 { + cents_str.parse().ok()? + } else { + cents_str[..2].parse().ok()? + }; + + return Some(format_currency(main_amount, cents, currency)); + } + + None +} + +fn format_currency(main_amount: i64, cents: i64, currency: &Currency) -> String { + let main_words = number_to_words(main_amount); + + // Yen has no sub-unit in modern usage + if currency.cent_name.is_empty() { + if main_amount == 0 { + return format!("zero {}", currency.name); + } + return format!("{} {}", main_words, currency.name); + } + + if main_amount == 0 && cents == 0 { + return format!("zero {}", currency.name); + } + + if main_amount == 0 { + let cents_words = number_to_words(cents); + return format!("{} {}", cents_words, currency.cent_name); + } + + if cents == 0 { + return format!("{} {}", main_words, currency.name); + } + + let cents_words = number_to_words(cents); + format!( + "{} {} {} {}", + main_words, currency.name, cents_words, currency.cent_name + ) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_yen_prefix() { + assert_eq!(parse("\u{00A5}100"), Some("hyaku en".to_string())); + assert_eq!(parse("\u{00A5}1500"), Some("sen go hyaku en".to_string())); + assert_eq!(parse("\u{00A5}1"), Some("ichi en".to_string())); + assert_eq!(parse("\u{00A5}10000"), Some("ichi man en".to_string())); + } + + #[test] + fn test_yen_suffix() { + assert_eq!(parse("100\u{5186}"), Some("hyaku en".to_string())); + assert_eq!(parse("500\u{5186}"), Some("go hyaku en".to_string())); + } + + #[test] + fn test_dollar() { + assert_eq!(parse("$100"), Some("hyaku doru".to_string())); + assert_eq!(parse("$5.50"), Some("go doru go juu sento".to_string())); + assert_eq!(parse("$1"), Some("ichi doru".to_string())); + } + + #[test] + fn test_euro() { + assert_eq!(parse("\u{20AC}100"), Some("hyaku yuuro".to_string())); + assert_eq!( + parse("\u{20AC}2.50"), + Some("ni yuuro go juu sento".to_string()) + ); + } + + #[test] + fn test_pound() { + assert_eq!(parse("\u{00A3}1"), 
Some("ichi pondo".to_string())); + assert_eq!( + parse("\u{00A3}3.99"), + Some("san pondo kyuu juu kyuu pensu".to_string()) + ); + } + + #[test] + fn test_dollars_and_cents() { + assert_eq!(parse("$1.01"), Some("ichi doru ichi sento".to_string())); + assert_eq!(parse("$0.99"), Some("kyuu juu kyuu sento".to_string())); + } + + #[test] + fn test_large_amounts() { + assert_eq!( + parse("\u{00A5}2.5 man"), + Some("ni ten go man en".to_string()) + ); + assert_eq!(parse("$50 oku"), Some("go juu oku doru".to_string())); + } + + #[test] + fn test_trailing_dot() { + assert_eq!(parse("$5."), Some("go doru".to_string())); + assert_eq!(parse("$1."), Some("ichi doru".to_string())); + } + + #[test] + fn test_non_money() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("123"), None); + assert_eq!(parse(""), None); + } +} diff --git a/src/tts/ja/ordinal.rs b/src/tts/ja/ordinal.rs new file mode 100644 index 0000000..1dc9164 --- /dev/null +++ b/src/tts/ja/ordinal.rs @@ -0,0 +1,99 @@ +//! Ordinal TN tagger for Japanese (romaji output). +//! +//! Converts written ordinal numbers to spoken Japanese in romaji: +//! - "第1" → "dai ichi" +//! - "第100" → "dai hyaku" +//! - "1st" → "dai ichi" +//! - "3rd" → "dai san" + +use super::number_to_words; + +/// Parse a written ordinal to spoken Japanese words in romaji. +/// +/// Supports two formats: +/// - Japanese: "第1", "第100" +/// - English suffixes: "1st", "2nd", "3rd", "4th", etc. +pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + + // Try Japanese format: 第N + if let Some(result) = parse_dai_format(trimmed) { + return Some(result); + } + + // Try English ordinal suffixes: 1st, 2nd, 3rd, 4th... 
+ if let Some(result) = parse_english_suffix(trimmed) { + return Some(result); + } + + None +} + +fn parse_dai_format(input: &str) -> Option { + let rest = input.strip_prefix('\u{7B2C}')?; // 第 + if rest.is_empty() || !rest.chars().all(|c| c.is_ascii_digit()) { + return None; + } + + let n: i64 = rest.parse().ok()?; + if n <= 0 { + return None; + } + + Some(format!("dai {}", number_to_words(n))) +} + +fn parse_english_suffix(input: &str) -> Option { + let num_str = if let Some(s) = input.strip_suffix("st") { + s + } else if let Some(s) = input.strip_suffix("nd") { + s + } else if let Some(s) = input.strip_suffix("rd") { + s + } else if let Some(s) = input.strip_suffix("th") { + s + } else { + return None; + }; + + if num_str.is_empty() || !num_str.chars().all(|c| c.is_ascii_digit()) { + return None; + } + + let n: i64 = num_str.parse().ok()?; + if n <= 0 { + return None; + } + + Some(format!("dai {}", number_to_words(n))) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_dai_format() { + assert_eq!(parse("\u{7B2C}1"), Some("dai ichi".to_string())); + assert_eq!(parse("\u{7B2C}3"), Some("dai san".to_string())); + assert_eq!(parse("\u{7B2C}100"), Some("dai hyaku".to_string())); + assert_eq!(parse("\u{7B2C}10"), Some("dai juu".to_string())); + } + + #[test] + fn test_english_suffix() { + assert_eq!(parse("1st"), Some("dai ichi".to_string())); + assert_eq!(parse("2nd"), Some("dai ni".to_string())); + assert_eq!(parse("3rd"), Some("dai san".to_string())); + assert_eq!(parse("4th"), Some("dai yon".to_string())); + assert_eq!(parse("10th"), Some("dai juu".to_string())); + assert_eq!(parse("21st"), Some("dai ni juu ichi".to_string())); + } + + #[test] + fn test_non_ordinals() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("0th"), None); + assert_eq!(parse(""), None); + } +} diff --git a/src/tts/ja/telephone.rs b/src/tts/ja/telephone.rs new file mode 100644 index 0000000..9c536fc --- /dev/null +++ b/src/tts/ja/telephone.rs @@ -0,0 +1,170 @@ 
// Telephone TN tagger for Japanese (romaji output).
//
// Converts written phone numbers to spoken Japanese romaji form:
// - "03-1234-5678" → "zero san, ichi ni san yon, go roku nana hachi"
// - "+81-3-1234-5678" → "purasu hachi ichi, san, ichi ni san yon, go roku nana hachi"
// - "(03) 1234-5678" → "zero san, ichi ni san yon, go roku nana hachi"

/// Returns `true` for the characters accepted as digit-group separators.
fn is_phone_separator(c: char) -> bool {
    matches!(c, '-' | '.' | ' ' | '(' | ')')
}

/// Parse a written phone number to spoken Japanese romaji form.
///
/// Accepted input is ASCII digits grouped by `-`, `.`, space or parentheses,
/// optionally preceded by one leading `+` (spoken as "purasu" and attached to
/// the first group). Every group is spelled digit by digit and the groups are
/// joined with ", ".
///
/// Returns `None` when the input
/// - has fewer than 7 digits,
/// - has no separator at all (so plain numbers such as "1000000" are not
///   treated as phone numbers), or
/// - contains any other character, including a `+` that is not the first
///   character (it used to be accepted and then silently dropped).
pub fn parse(input: &str) -> Option<String> {
    let trimmed = input.trim();

    // Only a leading '+' is meaningful; peel it off before validating the rest.
    let (has_plus, rest) = match trimmed.strip_prefix('+') {
        Some(after_plus) => (true, after_plus.trim_start()),
        None => (false, trimmed),
    };

    if !rest
        .chars()
        .all(|c| c.is_ascii_digit() || is_phone_separator(c))
    {
        return None;
    }
    if rest.chars().filter(char::is_ascii_digit).count() < 7 {
        return None;
    }
    // Checked on `trimmed` so that "+ 1234567" (space after the plus) still counts
    // as separated, exactly as before.
    if !trimmed.chars().any(is_phone_separator) {
        return None;
    }

    let mut spoken: Vec<String> = split_phone_groups(rest)
        .iter()
        .map(|group| spell_digit_group(group))
        .collect();

    if has_plus {
        // The first group after '+' is the country code.
        if let Some(first) = spoken.first_mut() {
            first.insert_str(0, "purasu ");
        }
    }

    if spoken.is_empty() {
        return None;
    }
    Some(spoken.join(", "))
}

/// Split a phone number into its digit groups.
///
/// Groups are delimited by `-`, `.`, space and parentheses; empty groups are
/// skipped and any character that is not an ASCII digit is ignored.
fn split_phone_groups(input: &str) -> Vec<String> {
    input
        .split(is_phone_separator)
        .map(|chunk| {
            chunk
                .chars()
                .filter(char::is_ascii_digit)
                .collect::<String>()
        })
        .filter(|group| !group.is_empty())
        .collect()
}

/// Spell each ASCII digit of `group` with its Japanese romaji word, separated
/// by single spaces. Non-digit characters are skipped.
fn spell_digit_group(group: &str) -> String {
    const DIGIT_WORDS: [&str; 10] = [
        "zero", "ichi", "ni", "san", "yon", "go", "roku", "nana", "hachi", "kyuu",
    ];

    group
        .chars()
        .filter_map(|c| c.to_digit(10))
        .map(|digit| DIGIT_WORDS[digit as usize])
        .collect::<Vec<_>>()
        .join(" ")
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_standard_phone() {
        assert_eq!(
            parse("03-1234-5678"),
            Some("zero san, ichi ni san yon, go roku nana hachi".to_string())
        );
    }

    #[test]
    fn test_with_country_code() {
        assert_eq!(
            parse("+81-3-1234-5678"),
            Some("purasu hachi ichi, san, ichi ni san yon, go roku nana hachi".to_string())
        );
    }

    #[test]
    fn test_parentheses() {
        assert_eq!(
            parse("(03) 1234-5678"),
            Some("zero san, ichi ni san yon, go roku nana hachi".to_string())
        );
    }

    #[test]
    fn test_dots() {
        assert_eq!(
            parse("555.123.4567"),
            Some("go go go, ichi ni san, yon go roku nana".to_string())
        );
    }

    #[test]
    fn test_non_phone() {
        assert_eq!(parse("hello"), None);
        assert_eq!(parse("123"), None); // too few digits
    }
}
// [diff residue, kept verbatim] diff --git a/src/tts/ja/time.rs b/src/tts/ja/time.rs new file mode 100644 index 0000000..f4e7e8f --- /dev/null +++ b/src/tts/ja/time.rs @@
-0,0 +1,239 @@ +//! Time TN tagger for Japanese (romaji output). +//! +//! Converts written time expressions to spoken Japanese in romaji: +//! - "14:30" → "juu yo ji san juppun" +//! - "9:00" → "ku ji" +//! - "7:15" → "shichi ji juu go fun" + +use super::number_to_words; + +/// Special hour readings for Japanese. +/// 4=yo ji, 7=shichi ji, 9=ku ji; compound hours preserve these: +/// 14=juu yo ji, 17=juu shichi ji, 19=juu ku ji. +fn hour_to_romaji(hour: u32) -> String { + match hour { + 0 => "zero ji".to_string(), + 4 => "yo ji".to_string(), + 7 => "shichi ji".to_string(), + 9 => "ku ji".to_string(), + 14 => "juu yo ji".to_string(), + 17 => "juu shichi ji".to_string(), + 19 => "juu ku ji".to_string(), + _ => format!("{} ji", number_to_words(hour as i64)), + } +} + +/// Special minute readings with sound changes (rendaku). +/// 1=ippun, 3=sanpun, 4=yonpun, 6=roppun, 8=happun, 10=juppun +/// Others: number + fun +fn minute_to_romaji(minute: u32) -> String { + match minute { + 0 => String::new(), + 1 => "ippun".to_string(), + 2 => "ni fun".to_string(), + 3 => "sanpun".to_string(), + 4 => "yonpun".to_string(), + 5 => "go fun".to_string(), + 6 => "roppun".to_string(), + 7 => "nana fun".to_string(), + 8 => "happun".to_string(), + 9 => "kyuu fun".to_string(), + 10 => "juppun".to_string(), + _ => { + // For compound minutes, apply rules to the ones digit + let tens = minute / 10; + let ones = minute % 10; + + if ones == 0 { + // Exact tens: 20, 30, 40, 50 + let tens_word = number_to_words(tens as i64); + match tens { + 2 => "ni juppun".to_string(), + 3 => "san juppun".to_string(), + 4 => "yon juppun".to_string(), + 5 => "go juppun".to_string(), + _ => format!("{} juppun", tens_word), + } + } else { + // Compound: tens + ones minute reading + let tens_part = if tens > 1 { + format!("{} juu", number_to_words(tens as i64)) + } else if tens == 1 { + "juu".to_string() + } else { + String::new() + }; + + let ones_part = match ones { + 1 => "ippun".to_string(), + 2 => "ni 
fun".to_string(), + 3 => "sanpun".to_string(), + 4 => "yonpun".to_string(), + 5 => "go fun".to_string(), + 6 => "roppun".to_string(), + 7 => "nana fun".to_string(), + 8 => "happun".to_string(), + 9 => "kyuu fun".to_string(), + _ => unreachable!(), + }; + + if tens_part.is_empty() { + ones_part + } else { + format!("{} {}", tens_part, ones_part) + } + } + } + } +} + +/// Parse a written time expression to spoken Japanese in romaji. +pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + + // Try "14:30" format + if let Some(result) = parse_colon_format(trimmed) { + return Some(result); + } + + // Try "14時30分" format + if let Some(result) = parse_japanese_format(trimmed) { + return Some(result); + } + + None +} + +fn parse_colon_format(input: &str) -> Option { + if !input.contains(':') { + return None; + } + + let parts: Vec<&str> = input.splitn(2, ':').collect(); + if parts.len() != 2 { + return None; + } + + let hour_str = parts[0].trim(); + let min_str = parts[1].trim(); + + if !hour_str.chars().all(|c| c.is_ascii_digit()) || hour_str.is_empty() { + return None; + } + if !min_str.chars().all(|c| c.is_ascii_digit()) || min_str.is_empty() { + return None; + } + + let hour: u32 = hour_str.parse().ok()?; + let minute: u32 = min_str.parse().ok()?; + + if hour > 23 || minute > 59 { + return None; + } + + Some(format_time(hour, minute)) +} + +fn parse_japanese_format(input: &str) -> Option { + // Pattern: H時M分 + let ji_pos = input.find('\u{6642}')?; // 時 + let hour_str = &input[..ji_pos]; + if !hour_str.chars().all(|c| c.is_ascii_digit()) || hour_str.is_empty() { + return None; + } + let hour: u32 = hour_str.parse().ok()?; + if hour > 23 { + return None; + } + + let after_ji = &input[ji_pos + '\u{6642}'.len_utf8()..]; + + let minute: u32 = if after_ji.is_empty() { + 0 + } else { + let min_str = if let Some(fun_pos) = after_ji.find('\u{5206}') { + // 分 + &after_ji[..fun_pos] + } else { + after_ji.trim() + }; + if min_str.is_empty() { + 0 + } else if 
!min_str.chars().all(|c| c.is_ascii_digit()) { + return None; + } else { + let m: u32 = min_str.parse().ok()?; + if m > 59 { + return None; + } + m + } + }; + + Some(format_time(hour, minute)) +} + +fn format_time(hour: u32, minute: u32) -> String { + let hour_words = hour_to_romaji(hour); + + if minute == 0 { + hour_words + } else { + let min_words = minute_to_romaji(minute); + format!("{} {}", hour_words, min_words) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_colon_format() { + assert_eq!(parse("14:30"), Some("juu yo ji san juppun".to_string())); + assert_eq!(parse("9:00"), Some("ku ji".to_string())); + assert_eq!(parse("7:15"), Some("shichi ji juu go fun".to_string())); + assert_eq!(parse("4:00"), Some("yo ji".to_string())); + } + + #[test] + fn test_minute_sound_changes() { + assert_eq!(parse("3:01"), Some("san ji ippun".to_string())); + assert_eq!(parse("3:03"), Some("san ji sanpun".to_string())); + assert_eq!(parse("3:06"), Some("san ji roppun".to_string())); + assert_eq!(parse("3:08"), Some("san ji happun".to_string())); + assert_eq!(parse("3:10"), Some("san ji juppun".to_string())); + } + + #[test] + fn test_japanese_format() { + assert_eq!( + parse("14\u{6642}30\u{5206}"), + Some("juu yo ji san juppun".to_string()) + ); + assert_eq!(parse("9\u{6642}"), Some("ku ji".to_string())); + } + + #[test] + fn test_compound_minutes() { + assert_eq!(parse("3:21"), Some("san ji ni juu ippun".to_string())); + assert_eq!(parse("3:45"), Some("san ji yon juu go fun".to_string())); + } + + #[test] + fn test_24h() { + assert_eq!(parse("14:00"), Some("juu yo ji".to_string())); + assert_eq!( + parse("23:59"), + Some("ni juu san ji go juu kyuu fun".to_string()) + ); + } + + #[test] + fn test_invalid() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("25:00"), None); + assert_eq!(parse("12:60"), None); + assert_eq!(parse(""), None); + } +} diff --git a/src/tts/ja/whitelist.rs b/src/tts/ja/whitelist.rs new file mode 100644 index 
// [diff residue, kept verbatim] 0000000..a6f84a8 --- /dev/null +++ b/src/tts/ja/whitelist.rs @@ -0,0 +1,87 @@
// Whitelist TN tagger for Japanese (romaji output).
//
// Lookup table for common abbreviations translated to Japanese romaji:
// - "Dr." → "dokutaa"
// - "Mr." → "misutaa"
// - "etc." → "nado"
// - "vs." → "tai"

/// Case-sensitive lookup of an abbreviation's spoken romaji form.
///
/// A `match` on string literals replaces the former lazily initialised
/// `HashMap`: the table is small and fixed, so no allocation, locking or
/// third-party macro is needed.
fn lookup(token: &str) -> Option<&'static str> {
    let spoken = match token {
        // Titles
        "Dr." | "Dr" => "dokutaa",
        "Mr." | "Mr" => "misutaa",
        "Mrs." | "Mrs" => "misizu",
        "Ms." | "Ms" => "mizu",
        "Prof." => "kyouju",
        "St." => "seinto",
        "Jr." => "junia",
        "Sr." => "shinia",

        // Abbreviations
        "etc." => "nado",
        "vs." | "vs" => "tai",
        "No." => "bangou",

        // Business
        "Inc." => "kabushiki gaisha",
        "Ltd." => "yuugen gaisha",
        "Co." => "gaisha",

        _ => return None,
    };
    Some(spoken)
}

/// Parse a whitelist abbreviation to its spoken Japanese romaji form.
///
/// Surrounding whitespace is ignored; the match itself is case-sensitive.
/// Returns `None` for anything that is not in the table.
pub fn parse(input: &str) -> Option<String> {
    lookup(input.trim()).map(String::from)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_titles() {
        assert_eq!(parse("Dr."), Some("dokutaa".to_string()));
        assert_eq!(parse("Mr."), Some("misutaa".to_string()));
        assert_eq!(parse("Mrs."), Some("misizu".to_string()));
        assert_eq!(parse("Ms."), Some("mizu".to_string()));
    }

    #[test]
    fn test_abbreviations() {
        assert_eq!(parse("etc."), Some("nado".to_string()));
        assert_eq!(parse("vs."), Some("tai".to_string()));
        assert_eq!(parse("No."), Some("bangou".to_string()));
    }

    #[test]
    fn test_business() {
        assert_eq!(parse("Inc."), Some("kabushiki gaisha".to_string()));
        assert_eq!(parse("Ltd."), Some("yuugen gaisha".to_string()));
        assert_eq!(parse("Co."), Some("gaisha".to_string()));
    }

    #[test]
    fn test_no_match() {
        assert_eq!(parse("hello"), None);
        assert_eq!(parse("world"), None);
    }
}
// [diff residue, kept verbatim] diff --git a/src/tts/mod.rs b/src/tts/mod.rs index 74f8ded..41fea87 100644 --- a/src/tts/mod.rs +++ b/src/tts/mod.rs @@ -1,10 +1,13 @@ -//! Text Normalization taggers for English. +//! Text Normalization taggers. //! //! Converts written-form text to spoken form (the reverse of ITN): -//! - "200" → "two hundred" -//! - "$5.50" → "five dollars and fifty cents" -//! - "January 5, 2025" → "january fifth twenty twenty five" +//! - "200" → "two hundred" (English) +//! - "$5.50" → "five dollars and fifty cents" (English) +//! - "January 5, 2025" → "january fifth twenty twenty five" (English) +//! +//! Supports multiple languages via submodules. +// English (default) pub mod cardinal; pub mod date; pub mod decimal; @@ -16,6 +19,14 @@ pub mod telephone; pub mod time; pub mod whitelist; +// Additional languages +pub mod de; +pub mod es; +pub mod fr; +pub mod hi; +pub mod ja; +pub mod zh; + /// Ones words indexed by value (0..20).
const ONES: [&str; 20] = [ "zero", diff --git a/src/tts/zh/cardinal.rs b/src/tts/zh/cardinal.rs new file mode 100644 index 0000000..261c40c --- /dev/null +++ b/src/tts/zh/cardinal.rs @@ -0,0 +1,90 @@ +//! Cardinal TN tagger for Mandarin Chinese. +//! +//! Converts written cardinal numbers to spoken Mandarin pinyin: +//! - "123" -> "yi bai er shi san" +//! - "-42" -> "fu si shi er" +//! - "10000" -> "yi wan" + +use super::number_to_words; + +/// Parse a written cardinal number to spoken Mandarin pinyin words. +pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + if trimmed.is_empty() { + return None; + } + + let (is_negative, digits_part) = if let Some(rest) = trimmed.strip_prefix('-') { + (true, rest) + } else { + (false, trimmed) + }; + + // Must be digits (with optional commas, dots, or spaces as thousands separators) + if !digits_part + .chars() + .all(|c| c.is_ascii_digit() || c == ',' || c == '.' || c == ' ' || c == '\u{a0}') + { + return None; + } + + if !digits_part.chars().any(|c| c.is_ascii_digit()) { + return None; + } + + // Strip thousands separators + let clean: String = digits_part.chars().filter(|c| c.is_ascii_digit()).collect(); + let n: i64 = clean.parse().ok()?; + + if is_negative { + Some(format!("fu {}", number_to_words(n))) + } else { + Some(number_to_words(n)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + assert_eq!(parse("0"), Some("ling".to_string())); + assert_eq!(parse("1"), Some("yi".to_string())); + assert_eq!(parse("21"), Some("er shi yi".to_string())); + assert_eq!(parse("100"), Some("yi bai".to_string())); + assert_eq!(parse("123"), Some("yi bai er shi san".to_string())); + } + + #[test] + fn test_wan_grouping() { + assert_eq!(parse("10000"), Some("yi wan".to_string())); + assert_eq!( + parse("12345"), + Some("yi wan er qian san bai si shi wu".to_string()) + ); + assert_eq!(parse("100000000"), Some("yi yi".to_string())); + } + + #[test] + fn test_thousands_separators() { + 
assert_eq!(parse("1 000"), Some("yi qian".to_string())); + assert_eq!(parse("1,000"), Some("yi qian".to_string())); + assert_eq!(parse("1.000"), Some("yi qian".to_string())); + assert_eq!(parse("1 000 000"), Some("yi bai wan".to_string())); + } + + #[test] + fn test_negative() { + assert_eq!(parse("-42"), Some("fu si shi er".to_string())); + assert_eq!(parse("-1"), Some("fu yi".to_string())); + assert_eq!(parse("-10000"), Some("fu yi wan".to_string())); + } + + #[test] + fn test_non_numbers() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("12abc"), None); + assert_eq!(parse(""), None); + } +} diff --git a/src/tts/zh/date.rs b/src/tts/zh/date.rs new file mode 100644 index 0000000..09c235d --- /dev/null +++ b/src/tts/zh/date.rs @@ -0,0 +1,345 @@ +//! Date TN tagger for Mandarin Chinese. +//! +//! Converts written date expressions to spoken Mandarin pinyin: +//! - "2025年1月5日" -> "er ling er wu nian yi yue wu ri" +//! - "2025-01-05" -> "er ling er wu nian yi yue wu ri" +//! - "January 5, 2025" -> "er ling er wu nian yi yue wu ri" +//! +//! Year: each digit spelled out + "nian" +//! Month: cardinal number + "yue" +//! Day: cardinal number + "ri" (or "hao") + +use super::{number_to_words, spell_digits}; + +const MONTHS_EN: &[(&str, u32)] = &[ + ("january", 1), + ("february", 2), + ("march", 3), + ("april", 4), + ("may", 5), + ("june", 6), + ("july", 7), + ("august", 8), + ("september", 9), + ("october", 10), + ("november", 11), + ("december", 12), +]; + +/// Parse a written date to spoken Mandarin pinyin. 
+pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + + // Try decade: "1980s" → "yi jiu ba ling nian dai" (一九八零年代) + if let Some(result) = parse_decade(trimmed) { + return Some(result); + } + + // Try Chinese format: 2025年1月5日 + if let Some(result) = parse_chinese_date(trimmed) { + return Some(result); + } + + // Try English month format: "January 5, 2025" + if let Some(result) = parse_english_month_date(trimmed) { + return Some(result); + } + + // Try numeric YYYY-MM-DD or YYYY/MM/DD + if let Some(result) = parse_numeric_date(trimmed) { + return Some(result); + } + + None +} + +/// Parse decade: "1980s" → "yi jiu ba ling nian dai" (一九八零年代) +fn parse_decade(input: &str) -> Option { + let s = input.strip_suffix('s')?; + if s.len() != 4 || !s.chars().all(|c| c.is_ascii_digit()) { + return None; + } + + let year: u32 = s.parse().ok()?; + if year < 1000 { + return None; + } + + // Must be a round decade (ends in 0) + if year % 10 != 0 { + return None; + } + + // Chinese: spell each digit + "nian dai" + let year_words = spell_digits(s); + Some(format!("{} nian dai", year_words)) +} + +fn parse_chinese_date(input: &str) -> Option { + // Look for 年 (nian), 月 (yue), 日 (ri) markers + let nian_char = '\u{5E74}'; // 年 + let yue_char = '\u{6708}'; // 月 + let ri_char = '\u{65E5}'; // 日 + let hao_char = '\u{53F7}'; // 号 + + let has_nian = input.contains(nian_char); + let has_yue = input.contains(yue_char); + + if !has_nian && !has_yue { + return None; + } + + let mut parts: Vec = Vec::new(); + + let mut remaining = input; + + // Extract year (before 年) + if has_nian { + let nian_pos = remaining.find(nian_char)?; + let year_str = &remaining[..nian_pos]; + if !year_str.is_empty() && year_str.chars().all(|c| c.is_ascii_digit()) { + let year_words = spell_digits(year_str); + parts.push(format!("{} nian", year_words)); + } + remaining = &remaining[nian_pos + nian_char.len_utf8()..]; + } + + // Extract month (before 月) + if has_yue { + let yue_pos = 
remaining.find(yue_char)?; + let month_str = &remaining[..yue_pos].trim(); + if !month_str.is_empty() && month_str.chars().all(|c| c.is_ascii_digit()) { + let month: u32 = month_str.parse().ok()?; + if month == 0 || month > 12 { + return None; + } + parts.push(format!("{} yue", number_to_words(month as i64))); + } + remaining = &remaining[yue_pos + yue_char.len_utf8()..]; + } + + // Extract day (before 日 or 号) + let day_end = remaining.find(ri_char).or_else(|| remaining.find(hao_char)); + if let Some(pos) = day_end { + let day_str = &remaining[..pos].trim(); + if !day_str.is_empty() && day_str.chars().all(|c| c.is_ascii_digit()) { + let day: u32 = day_str.parse().ok()?; + if day == 0 || day > 31 { + return None; + } + parts.push(format!("{} ri", number_to_words(day as i64))); + } + } else { + // No 日/号 marker, check if there are trailing digits for the day + let day_str = remaining.trim(); + if !day_str.is_empty() && day_str.chars().all(|c| c.is_ascii_digit()) { + let day: u32 = day_str.parse().ok()?; + if day > 0 && day <= 31 { + parts.push(format!("{} ri", number_to_words(day as i64))); + } + } + } + + if parts.is_empty() { + return None; + } + + Some(parts.join(" ")) +} + +fn parse_english_month_date(input: &str) -> Option { + let lower = input.to_lowercase(); + + let mut month_num = None; + let mut rest = ""; + for &(name, num) in MONTHS_EN { + if let Some(r) = lower.strip_prefix(name) { + if r.is_empty() || r.starts_with(' ') || r.starts_with(',') { + month_num = Some(num); + rest = r.trim_start_matches(|c: char| c == ' ' || c == ','); + break; + } + } + } + + let month_num = month_num?; + if rest.is_empty() { + return None; + } + + // Parse day + let (day_str, year_part) = if let Some(comma_pos) = rest.find(',') { + (&rest[..comma_pos], Some(rest[comma_pos + 1..].trim())) + } else { + let tokens: Vec<&str> = rest.splitn(2, ' ').collect(); + if tokens.len() == 2 + && tokens[0] + .trim_end_matches("st") + .trim_end_matches("nd") + .trim_end_matches("rd") + 
.trim_end_matches("th") + .chars() + .all(|c| c.is_ascii_digit()) + { + let year_clean = + tokens[1].trim_end_matches(|c: char| c == '.' || c == ',' || c == '!' || c == '?'); + if year_clean.chars().all(|c| c.is_ascii_digit()) && year_clean.len() == 4 { + (tokens[0], Some(year_clean)) + } else { + (rest, None) + } + } else { + (rest, None) + } + }; + + let day_digits = day_str + .trim() + .trim_end_matches("st") + .trim_end_matches("nd") + .trim_end_matches("rd") + .trim_end_matches("th"); + + if !day_digits.chars().all(|c| c.is_ascii_digit()) || day_digits.is_empty() { + return None; + } + + let day: u32 = day_digits.parse().ok()?; + if day == 0 || day > 31 { + return None; + } + + let mut parts: Vec = Vec::new(); + + if let Some(year_str) = year_part { + let year_str = year_str + .trim() + .trim_end_matches(|c: char| c == '.' || c == ',' || c == '!' || c == '?'); + if !year_str.is_empty() && year_str.chars().all(|c| c.is_ascii_digit()) { + parts.push(format!("{} nian", spell_digits(year_str))); + } + } + + parts.push(format!("{} yue", number_to_words(month_num as i64))); + parts.push(format!("{} ri", number_to_words(day as i64))); + + Some(parts.join(" ")) +} + +/// Parse numeric date YYYY-MM-DD or YYYY/MM/DD. 
+fn parse_numeric_date(input: &str) -> Option { + let sep = if input.contains('/') { + '/' + } else if input.contains('-') && input.chars().filter(|c| *c == '-').count() == 2 { + '-' + } else { + return None; + }; + + let tokens: Vec<&str> = input.splitn(3, sep).collect(); + if tokens.len() != 3 { + return None; + } + + if !tokens + .iter() + .all(|p| !p.is_empty() && p.chars().all(|c| c.is_ascii_digit())) + { + return None; + } + + // Determine if YYYY-MM-DD or DD-MM-YYYY by checking first token length + let (year, month_num, day) = if tokens[0].len() == 4 { + // YYYY-MM-DD + let y: u32 = tokens[0].parse().ok()?; + let m: u32 = tokens[1].parse().ok()?; + let d: u32 = tokens[2].parse().ok()?; + (y, m, d) + } else { + // Assume DD-MM-YYYY (less common for Chinese context, but support it) + let d: u32 = tokens[0].parse().ok()?; + let m: u32 = tokens[1].parse().ok()?; + let y: u32 = tokens[2].parse().ok()?; + (y, m, d) + }; + + if month_num == 0 || month_num > 12 || day == 0 || day > 31 { + return None; + } + + let year_words = spell_digits(&year.to_string()); + let month_words = number_to_words(month_num as i64); + let day_words = number_to_words(day as i64); + + Some(format!( + "{} nian {} yue {} ri", + year_words, month_words, day_words + )) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_chinese_date() { + assert_eq!( + parse("2025\u{5E74}1\u{6708}5\u{65E5}"), + Some("er ling er wu nian yi yue wu ri".to_string()) + ); + assert_eq!( + parse("2025\u{5E74}12\u{6708}31\u{65E5}"), + Some("er ling er wu nian shi er yue san shi yi ri".to_string()) + ); + } + + #[test] + fn test_english_month() { + assert_eq!( + parse("January 5, 2025"), + Some("er ling er wu nian yi yue wu ri".to_string()) + ); + assert_eq!( + parse("December 25, 2000"), + Some("er ling ling ling nian shi er yue er shi wu ri".to_string()) + ); + } + + #[test] + fn test_numeric_date() { + assert_eq!( + parse("2025-01-05"), + Some("er ling er wu nian yi yue wu ri".to_string()) + ); + 
assert_eq!( + parse("2025/03/15"), + Some("er ling er wu nian san yue shi wu ri".to_string()) + ); + } + + #[test] + fn test_decade() { + assert_eq!(parse("1980s"), Some("yi jiu ba ling nian dai".to_string())); + assert_eq!( + parse("2000s"), + Some("er ling ling ling nian dai".to_string()) + ); + assert_eq!(parse("1990s"), Some("yi jiu jiu ling nian dai".to_string())); + } + + #[test] + fn test_year_verbalization() { + // In Chinese, year digits are spelled individually + assert_eq!(spell_digits("2025"), "er ling er wu".to_string()); + assert_eq!(spell_digits("2000"), "er ling ling ling".to_string()); + assert_eq!(spell_digits("1990"), "yi jiu jiu ling".to_string()); + assert_eq!(spell_digits("1900"), "yi jiu ling ling".to_string()); + } + + #[test] + fn test_invalid() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("123"), None); + assert_eq!(parse(""), None); + } +} diff --git a/src/tts/zh/decimal.rs b/src/tts/zh/decimal.rs new file mode 100644 index 0000000..4879394 --- /dev/null +++ b/src/tts/zh/decimal.rs @@ -0,0 +1,118 @@ +//! Decimal TN tagger for Mandarin Chinese. +//! +//! Converts written decimal numbers to spoken Mandarin pinyin: +//! - "3.14" -> "san dian yi si" +//! - "0.5" -> "ling dian wu" +//! - "-2.7" -> "fu er dian qi" +//! +//! The decimal point is read as "dian" (点) in Chinese. +//! Fractional digits are spelled out individually. + +use super::{number_to_words, spell_digits}; + +/// Mandarin quantity suffixes recognized after a decimal number. +/// yi = 亿 (hundred million), wan = 万 (ten thousand) +const QUANTITY_SUFFIXES: &[&str] = &["yi", "wan"]; + +/// Parse a written decimal number to spoken Mandarin pinyin. 
+pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + if trimmed.is_empty() { + return None; + } + + // Check for quantity suffix: "1.5 yi" (1.5 hundred million) + let (number_part, suffix) = extract_suffix(trimmed); + + // Chinese uses period as decimal separator + if !number_part.contains('.') { + return None; + } + + let parts: Vec<&str> = number_part.splitn(2, '.').collect(); + if parts.len() != 2 { + return None; + } + + let int_str = parts[0]; + let frac_str = parts[1]; + + let (is_negative, int_digits) = if let Some(rest) = int_str.strip_prefix('-') { + (true, rest) + } else { + (false, int_str) + }; + + if !int_digits.chars().all(|c| c.is_ascii_digit()) { + return None; + } + if frac_str.is_empty() || !frac_str.chars().all(|c| c.is_ascii_digit()) { + return None; + } + + let int_val: i64 = if int_digits.is_empty() { + 0 + } else { + int_digits.parse().ok()? + }; + + let int_words = number_to_words(int_val); + let frac_words = spell_digits(frac_str); + + let mut result = if is_negative { + format!("fu {} dian {}", int_words, frac_words) + } else { + format!("{} dian {}", int_words, frac_words) + }; + + if let Some(suf) = suffix { + result.push(' '); + result.push_str(suf); + } + + Some(result) +} + +/// Extract a quantity suffix from the end if present. 
+fn extract_suffix(input: &str) -> (&str, Option<&str>) { + for &suf in QUANTITY_SUFFIXES { + if let Some(before) = input.strip_suffix(suf) { + let before = before.trim_end(); + if !before.is_empty() { + return (before, Some(suf)); + } + } + } + (input, None) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic_decimal() { + assert_eq!(parse("3.14"), Some("san dian yi si".to_string())); + assert_eq!(parse("0.5"), Some("ling dian wu".to_string())); + assert_eq!(parse("100.01"), Some("yi bai dian ling yi".to_string())); + } + + #[test] + fn test_negative_decimal() { + assert_eq!(parse("-2.7"), Some("fu er dian qi".to_string())); + assert_eq!(parse("-3.14"), Some("fu san dian yi si".to_string())); + } + + #[test] + fn test_with_quantity() { + assert_eq!(parse("1.5 yi"), Some("yi dian wu yi".to_string())); + assert_eq!(parse("4.85 wan"), Some("si dian ba wu wan".to_string())); + } + + #[test] + fn test_non_decimal() { + assert_eq!(parse("123"), None); + assert_eq!(parse("hello"), None); + assert_eq!(parse(""), None); + } +} diff --git a/src/tts/zh/electronic.rs b/src/tts/zh/electronic.rs new file mode 100644 index 0000000..cb8f4ee --- /dev/null +++ b/src/tts/zh/electronic.rs @@ -0,0 +1,158 @@ +//! Electronic TN tagger for Mandarin Chinese (pinyin output). +//! +//! Converts written emails and URLs to spoken form in pinyin: +//! - "test@gmail.com" -> "t e s t at g m a i l dian c o m" +//! - "http://www.example.com" -> "h t t p mao hao xie gang xie gang w w w dian e x a m p l e dian c o m" + +/// Parse an email or URL to spoken form in Mandarin Chinese pinyin. +pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + if trimmed.is_empty() { + return None; + } + + // Email detection: contains @ with text on both sides + if trimmed.contains('@') { + return parse_email(trimmed); + } + + // URL detection: starts with http://, https://, or www. 
+ let lower = trimmed.to_lowercase(); + if lower.starts_with("http://") || lower.starts_with("https://") || lower.starts_with("www.") { + return parse_url(trimmed); + } + + None +} + +/// Parse an email address to spoken form in pinyin. +fn parse_email(input: &str) -> Option { + let parts: Vec<&str> = input.splitn(2, '@').collect(); + if parts.len() != 2 || parts[0].is_empty() || parts[1].is_empty() { + return None; + } + + let local = spell_domain(parts[0]); + let domain = spell_domain(parts[1]); + + Some(format!("{} at {}", local, domain)) +} + +/// Parse a URL to spoken form in pinyin. +fn parse_url(input: &str) -> Option { + let mut result = String::new(); + let lower = input.to_lowercase(); + + let rest = if lower.starts_with("https://") { + result.push_str("h t t p s mao hao xie gang xie gang"); + &input["https://".len()..] + } else if lower.starts_with("http://") { + result.push_str("h t t p mao hao xie gang xie gang"); + &input["http://".len()..] + } else { + input + }; + + if !result.is_empty() && !rest.is_empty() { + result.push(' '); + } + + result.push_str(&spell_domain(rest)); + + Some(result) +} + +/// Spell out a domain name, using "dian" for periods. +fn spell_domain(domain: &str) -> String { + let parts: Vec<&str> = domain.split('.').collect(); + let spelled: Vec = parts.iter().map(|p| spell_electronic(p)).collect(); + spelled.join(" dian ") +} + +/// Spell out an electronic string in Mandarin Chinese pinyin. +/// +/// Letters are spelled individually (lowercase). +/// Digits use Chinese pinyin words. +/// Special characters use Chinese names. 
+fn spell_electronic(s: &str) -> String { + let mut parts: Vec = Vec::new(); + + for c in s.chars() { + match c { + '-' => parts.push("gang".to_string()), + '_' => parts.push("xia hua xian".to_string()), + '/' => parts.push("xie gang".to_string()), + '~' => parts.push("bo lang hao".to_string()), + ':' => parts.push("mao hao".to_string()), + c if c.is_ascii_alphabetic() => { + parts.push(c.to_lowercase().to_string()); + } + c if c.is_ascii_digit() => { + parts.push(digit_pinyin(c)); + } + _ => { + // Skip unknown characters + } + } + } + + parts.join(" ") +} + +fn digit_pinyin(c: char) -> String { + match c { + '0' => "ling", + '1' => "yi", + '2' => "er", + '3' => "san", + '4' => "si", + '5' => "wu", + '6' => "liu", + '7' => "qi", + '8' => "ba", + '9' => "jiu", + _ => "", + } + .to_string() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_email() { + assert_eq!( + parse("test@gmail.com"), + Some("t e s t at g m a i l dian c o m".to_string()) + ); + assert_eq!( + parse("user123@example.org"), + Some("u s e r yi er san at e x a m p l e dian o r g".to_string()) + ); + } + + #[test] + fn test_url_http() { + assert_eq!( + parse("http://www.example.com"), + Some( + "h t t p mao hao xie gang xie gang w w w dian e x a m p l e dian c o m".to_string() + ) + ); + } + + #[test] + fn test_url_www() { + assert_eq!( + parse("www.baidu.com"), + Some("w w w dian b a i d u dian c o m".to_string()) + ); + } + + #[test] + fn test_non_electronic() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("12345"), None); + } +} diff --git a/src/tts/zh/measure.rs b/src/tts/zh/measure.rs new file mode 100644 index 0000000..2bb2a80 --- /dev/null +++ b/src/tts/zh/measure.rs @@ -0,0 +1,278 @@ +//! Measure TN tagger for Mandarin Chinese. +//! +//! Converts written measurements to spoken Mandarin pinyin: +//! - "200 km/h" -> "er bai gongli mei xiaoshi" +//! - "1 kg" -> "yi gongjin" +//! - "72°C" -> "qi shi er shengshi du" +//! - "50%" -> "bai fen zhi wu shi" +//! +//! 
Chinese unit names are given in pinyin. "mei" is used for "per". +//! Percentage uses the Chinese idiom "bai fen zhi N" (百分之N, literally "of 100 parts, N"). + +use super::number_to_words; + +use lazy_static::lazy_static; +use std::collections::HashMap; + +struct UnitInfo { + /// Spoken name in pinyin (Chinese has no singular/plural distinction) + name: &'static str, +} + +lazy_static! { + static ref UNITS: HashMap<&'static str, UnitInfo> = { + let mut m = HashMap::new(); + + // Length + m.insert("mm", UnitInfo { name: "haomi" }); // 毫米 + m.insert("cm", UnitInfo { name: "limi" }); // 厘米 + m.insert("m", UnitInfo { name: "mi" }); // 米 + m.insert("km", UnitInfo { name: "gongli" }); // 公里 + m.insert("in", UnitInfo { name: "yingcun" }); // 英寸 + m.insert("ft", UnitInfo { name: "yingchi" }); // 英尺 + m.insert("mi", UnitInfo { name: "yingli" }); // 英里 + + // Weight + m.insert("mg", UnitInfo { name: "haoke" }); // 毫克 + m.insert("g", UnitInfo { name: "ke" }); // 克 + m.insert("kg", UnitInfo { name: "gongjin" }); // 公斤 + m.insert("lb", UnitInfo { name: "bang" }); // 磅 + m.insert("oz", UnitInfo { name: "angsi" }); // 盎司 + m.insert("t", UnitInfo { name: "dun" }); // 吨 + + // Volume + m.insert("ml", UnitInfo { name: "haosheng" }); // 毫升 + m.insert("l", UnitInfo { name: "sheng" }); // 升 + m.insert("L", UnitInfo { name: "sheng" }); // 升 + + // Speed + m.insert("km/h", UnitInfo { name: "gongli mei xiaoshi" }); // 公里每小时 + m.insert("mph", UnitInfo { name: "yingli mei xiaoshi" }); // 英里每小时 + m.insert("m/s", UnitInfo { name: "mi mei miao" }); // 米每秒 + + // Time + m.insert("s", UnitInfo { name: "miao" }); // 秒 + m.insert("sec", UnitInfo { name: "miao" }); // 秒 + m.insert("min", UnitInfo { name: "fenzhong" }); // 分钟 + m.insert("h", UnitInfo { name: "xiaoshi" }); // 小时 + m.insert("hr", UnitInfo { name: "xiaoshi" }); // 小时 + + // Temperature + m.insert("\u{00B0}C", UnitInfo { name: "sheshidu" }); // 摄氏度 + m.insert("\u{00B0}F", UnitInfo { name: "huashidu" }); // 华氏度 + + // Data + 
m.insert("KB", UnitInfo { name: "qianzi jie" }); // 千字节 + m.insert("MB", UnitInfo { name: "zhaozi jie" }); // 兆字节 + m.insert("GB", UnitInfo { name: "jizi jie" }); // 吉字节 + m.insert("TB", UnitInfo { name: "taizi jie" }); // 太字节 + + // Frequency + m.insert("Hz", UnitInfo { name: "hezi" }); // 赫兹 + m.insert("kHz", UnitInfo { name: "qianhezi" }); // 千赫兹 + m.insert("MHz", UnitInfo { name: "zhaohezi" }); // 兆赫兹 + m.insert("GHz", UnitInfo { name: "jihezi" }); // 吉赫兹 + + m + }; +} + +/// Parse a written measurement to spoken Mandarin pinyin. +pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + if trimmed.is_empty() { + return None; + } + + // Special handling for percentage: "50%" -> "bai fen zhi wu shi" + if let Some(result) = parse_percentage(trimmed) { + return Some(result); + } + + // Try matching known units (longest match first) + let mut unit_matches: Vec<(&str, &UnitInfo)> = UNITS + .iter() + .filter(|(unit, _)| { + trimmed.ends_with(*unit) + && (trimmed.len() == unit.len() || { + let before = &trimmed[..trimmed.len() - unit.len()]; + if unit.len() == 1 && unit.chars().all(|c| c.is_ascii_alphabetic()) { + before.ends_with(' ') + } else { + before.ends_with(' ') || before.ends_with(|c: char| c.is_ascii_digit()) + } + }) + }) + .map(|(k, v)| (*k, v)) + .collect(); + + unit_matches.sort_by(|a, b| b.0.len().cmp(&a.0.len())); + + for (unit_str, unit_info) in unit_matches { + let num_part = trimmed[..trimmed.len() - unit_str.len()].trim(); + if num_part.is_empty() { + continue; + } + + let (is_negative, digits) = if let Some(rest) = num_part.strip_prefix('-') { + (true, rest.trim()) + } else { + (false, num_part) + }; + + let clean: String = digits + .chars() + .filter(|c| c.is_ascii_digit() || *c == '.' || *c == ',') + .collect(); + + if clean.is_empty() + || !clean + .chars() + .all(|c| c.is_ascii_digit() || c == '.' 
|| c == ',') + { + continue; + } + + // Handle decimals + if clean.contains('.') { + let parts: Vec<&str> = clean.splitn(2, '.').collect(); + if parts.len() == 2 { + let int_val: i64 = if parts[0].is_empty() { + 0 + } else { + let Ok(v) = parts[0].parse::() else { + continue; + }; + v + }; + let int_words = number_to_words(int_val); + let frac_words = super::spell_digits(parts[1]); + let num_words = if is_negative { + format!("fu {} dian {}", int_words, frac_words) + } else { + format!("{} dian {}", int_words, frac_words) + }; + return Some(format!("{} {}", num_words, unit_info.name)); + } + continue; + } + + let Ok(n) = clean.parse::() else { + continue; + }; + let num_words = if is_negative { + format!("fu {}", number_to_words(n)) + } else { + number_to_words(n) + }; + + return Some(format!("{} {}", num_words, unit_info.name)); + } + + None +} + +/// Parse percentage: "50%" -> "bai fen zhi wu shi" (百分之五十) +fn parse_percentage(input: &str) -> Option { + let num_str = input.strip_suffix('%')?; + let num_str = num_str.trim(); + + if num_str.is_empty() { + return None; + } + + let (is_negative, digits) = if let Some(rest) = num_str.strip_prefix('-') { + (true, rest.trim()) + } else { + (false, num_str) + }; + + // Handle decimal percentages + if digits.contains('.') { + let parts: Vec<&str> = digits.splitn(2, '.').collect(); + if parts.len() == 2 + && !parts[0].is_empty() + && parts[0].chars().all(|c| c.is_ascii_digit()) + && !parts[1].is_empty() + && parts[1].chars().all(|c| c.is_ascii_digit()) + { + let int_val: i64 = parts[0].parse().ok()?; + let int_words = number_to_words(int_val); + let frac_words = super::spell_digits(parts[1]); + let num_words = format!("{} dian {}", int_words, frac_words); + if is_negative { + return Some(format!("fu bai fen zhi {}", num_words)); + } else { + return Some(format!("bai fen zhi {}", num_words)); + } + } + return None; + } + + if !digits.chars().all(|c| c.is_ascii_digit()) { + return None; + } + + let n: i64 = 
digits.parse().ok()?; + let num_words = number_to_words(n); + + if is_negative { + Some(format!("fu bai fen zhi {}", num_words)) + } else { + Some(format!("bai fen zhi {}", num_words)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic_units() { + assert_eq!( + parse("200 km/h"), + Some("er bai gongli mei xiaoshi".to_string()) + ); + assert_eq!(parse("1 kg"), Some("yi gongjin".to_string())); + assert_eq!(parse("5 m"), Some("wu mi".to_string())); + } + + #[test] + fn test_temperature() { + assert_eq!(parse("72\u{00B0}C"), Some("qi shi er sheshidu".to_string())); + assert_eq!( + parse("98\u{00B0}F"), + Some("jiu shi ba huashidu".to_string()) + ); + } + + #[test] + fn test_percentage() { + assert_eq!(parse("50%"), Some("bai fen zhi wu shi".to_string())); + assert_eq!(parse("100%"), Some("bai fen zhi yi bai".to_string())); + assert_eq!(parse("3.5%"), Some("bai fen zhi san dian wu".to_string())); + } + + #[test] + fn test_negative() { + assert_eq!(parse("-66 kg"), Some("fu liu shi liu gongjin".to_string())); + } + + #[test] + fn test_data() { + assert_eq!(parse("500 MB"), Some("wu bai zhaozi jie".to_string())); + assert_eq!(parse("1 GB"), Some("yi jizi jie".to_string())); + } + + #[test] + fn test_decimal_with_empty_integer() { + assert_eq!(parse(".5 kg"), Some("ling dian wu gongjin".to_string())); + } + + #[test] + fn test_non_measure() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("123"), None); + assert_eq!(parse(""), None); + } +} diff --git a/src/tts/zh/mod.rs b/src/tts/zh/mod.rs new file mode 100644 index 0000000..7428ef9 --- /dev/null +++ b/src/tts/zh/mod.rs @@ -0,0 +1,198 @@ +//! Text Normalization taggers for Mandarin Chinese. +//! +//! Converts written-form text to spoken Mandarin in pinyin: +//! - "200" -> "er bai" +//! - "3.14" -> "san dian yi si" +//! 
- "2025年1月5日" -> "er ling er wu nian yi yue wu ri" + +pub mod cardinal; +pub mod date; +pub mod decimal; +pub mod electronic; +pub mod measure; +pub mod money; +pub mod ordinal; +pub mod telephone; +pub mod time; +pub mod whitelist; + +/// Digit words indexed by value (0..10). +const DIGITS: [&str; 10] = [ + "ling", "yi", "er", "san", "si", "wu", "liu", "qi", "ba", "jiu", +]; + +/// Convert an integer to Mandarin Chinese words (pinyin). +/// +/// Uses the Chinese grouping system based on wan (10,000) and yi (100,000,000) +/// instead of the Western thousand-based system. +/// +/// Examples: +/// - `0` -> `"ling"` +/// - `21` -> `"er shi yi"` +/// - `123` -> `"yi bai er shi san"` +/// - `10000` -> `"yi wan"` +/// - `100000000` -> `"yi yi"` +/// - `-42` -> `"fu si shi er"` +pub fn number_to_words(n: i64) -> String { + if n == 0 { + return "ling".to_string(); + } + + if n < 0 { + let abs_val = (n as u64).wrapping_neg(); + return format!("fu {}", unsigned_to_words(abs_val)); + } + + unsigned_to_words(n as u64) +} + +fn unsigned_to_words(n: u64) -> String { + if n == 0 { + return "ling".to_string(); + } + + let mut parts: Vec = Vec::new(); + let mut remaining = n; + + // Chinese scale units: yi (亿, 10^8) and wan (万, 10^4) + let scales: &[(u64, &str)] = &[ + (1_000_000_000_000_000_0, "jing"), // 京, 10^16 + (1_000_000_000_000, "zhao"), // 兆, 10^12 + (100_000_000, "yi"), // 亿, 10^8 + (10_000, "wan"), // 万, 10^4 + ]; + + for &(scale_value, scale_name) in scales { + if remaining >= scale_value { + let chunk = remaining / scale_value; + remaining %= scale_value; + + // The chunk within a scale is always < 10000 (a wan-group) + let chunk_words = wan_group_to_words(chunk as u32); + parts.push(format!("{} {}", chunk_words, scale_name)); + } + } + + // Remainder (0..9999) + if remaining > 0 { + // Insert "ling" if there's a gap: e.g. 
10003 = "yi wan ling san" + if !parts.is_empty() && remaining < 1000 { + parts.push("ling".to_string()); + } + parts.push(wan_group_to_words(remaining as u32)); + } + + parts.join(" ") +} + +/// Convert a number 1..9999 to Mandarin pinyin words. +/// This handles a single wan-group (4-digit group). +fn wan_group_to_words(n: u32) -> String { + debug_assert!(n > 0 && n <= 9999); + let mut parts: Vec = Vec::new(); + + let qian = n / 1000; + let bai = (n % 1000) / 100; + let shi = (n % 100) / 10; + let ge = n % 10; + + if qian > 0 { + parts.push(format!("{} qian", DIGITS[qian as usize])); + } + + if bai > 0 { + parts.push(format!("{} bai", DIGITS[bai as usize])); + } else if qian > 0 && (shi > 0 || ge > 0) { + // Zero placeholder: yi qian ling san shi (1030) + parts.push("ling".to_string()); + } + + if shi > 0 { + if shi == 1 && qian == 0 && bai == 0 { + // For numbers 10-19 at the top level, just say "shi" not "yi shi" + parts.push("shi".to_string()); + } else { + parts.push(format!("{} shi", DIGITS[shi as usize])); + } + } else if bai > 0 && ge > 0 { + // Zero placeholder: yi bai ling san (103) + parts.push("ling".to_string()); + } + + if ge > 0 { + parts.push(DIGITS[ge as usize].to_string()); + } + + parts.join(" ") +} + +/// Spell each digit of a string individually in Mandarin pinyin. 
+///
+/// "14" -> "yi si"
+///
+/// Characters that are not ASCII digits are skipped, so an input without any
+/// digit yields an empty string.
+pub fn spell_digits(s: &str) -> String {
+    s.chars()
+        .filter_map(|c| c.to_digit(10).map(|d| DIGITS[d as usize]))
+        .collect::<Vec<_>>()
+        .join(" ")
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_basic() {
+        assert_eq!(number_to_words(0), "ling");
+        assert_eq!(number_to_words(1), "yi");
+        assert_eq!(number_to_words(10), "shi");
+        assert_eq!(number_to_words(11), "shi yi");
+        assert_eq!(number_to_words(20), "er shi");
+        assert_eq!(number_to_words(21), "er shi yi");
+        assert_eq!(number_to_words(99), "jiu shi jiu");
+    }
+
+    #[test]
+    fn test_hundreds() {
+        assert_eq!(number_to_words(100), "yi bai");
+        assert_eq!(number_to_words(103), "yi bai ling san");
+        assert_eq!(number_to_words(110), "yi bai yi shi");
+        assert_eq!(number_to_words(200), "er bai");
+        assert_eq!(number_to_words(999), "jiu bai jiu shi jiu");
+    }
+
+    #[test]
+    fn test_thousands() {
+        assert_eq!(number_to_words(1000), "yi qian");
+        assert_eq!(number_to_words(1030), "yi qian ling san shi");
+        assert_eq!(number_to_words(1003), "yi qian ling san");
+        assert_eq!(number_to_words(2025), "er qian ling er shi wu");
+        assert_eq!(number_to_words(9999), "jiu qian jiu bai jiu shi jiu");
+    }
+
+    #[test]
+    fn test_wan() {
+        assert_eq!(number_to_words(10000), "yi wan");
+        assert_eq!(number_to_words(10003), "yi wan ling san");
+        assert_eq!(number_to_words(50000), "wu wan");
+        assert_eq!(number_to_words(12345), "yi wan er qian san bai si shi wu");
+    }
+
+    #[test]
+    fn test_yi_unit() {
+        assert_eq!(number_to_words(100_000_000), "yi yi");
+        assert_eq!(number_to_words(200_000_000), "er yi");
+    }
+
+    #[test]
+    fn test_negative() {
+        assert_eq!(number_to_words(-42), "fu si shi er");
+        assert_eq!(number_to_words(-1000), "fu yi qian");
+    }
+
+    #[test]
+    fn test_spell_digits() {
+        assert_eq!(spell_digits("14"), "yi si");
+        assert_eq!(spell_digits("0"), "ling");
+        assert_eq!(spell_digits("2025"), "er ling er wu");
+    }
+}
diff --git a/src/tts/zh/money.rs b/src/tts/zh/money.rs
new file
mode 100644
index 0000000..65e394c
--- /dev/null
+++ b/src/tts/zh/money.rs
@@ -0,0 +1,303 @@
+//! Money TN tagger for Mandarin Chinese.
+//!
+//! Converts written currency expressions to spoken Mandarin pinyin:
+//! - "¥100" -> "yi bai yuan"
+//! - "¥5.50" -> "wu yuan wu jiao"
+//! - "$100" -> "yi bai meiyuan"
+//! - "€50" -> "wu shi ouyuan"
+//! - "£20" -> "er shi yingbang"
+
+use super::number_to_words;
+
+/// Scale suffixes recognized after a currency amount.
+/// yi = 亿 (hundred million), wan = 万 (ten thousand)
+const SCALE_SUFFIXES: &[&str] = &["yi", "wan"];
+
+/// Spoken names for one currency: the main unit plus optional sub-units.
+struct Currency {
+    /// Main unit name in pinyin
+    unit: &'static str,
+    /// Sub-unit at 0.1 level (jiao for RMB)
+    sub_unit_tenth: Option<&'static str>,
+    /// Sub-unit at 0.01 level (fen for RMB)
+    _sub_unit_hundredth: Option<&'static str>,
+}
+
+/// Chinese yuan; the only currency here that names its sub-units.
+const RMB: Currency = Currency {
+    unit: "yuan",
+    sub_unit_tenth: Some("jiao"),
+    _sub_unit_hundredth: Some("fen"),
+};
+
+/// Dollar ("meiyuan"); no sub-unit names are defined.
+const DOLLAR: Currency = Currency {
+    unit: "meiyuan",
+    sub_unit_tenth: None,
+    _sub_unit_hundredth: None,
+};
+
+/// Euro ("ouyuan"); no sub-unit names are defined.
+const EURO: Currency = Currency {
+    unit: "ouyuan",
+    sub_unit_tenth: None,
+    _sub_unit_hundredth: None,
+};
+
+/// Pound ("yingbang"); no sub-unit names are defined.
+const POUND: Currency = Currency {
+    unit: "yingbang",
+    sub_unit_tenth: None,
+    _sub_unit_hundredth: None,
+};
+
+/// Parse a written money expression to spoken Mandarin pinyin.
+pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + if trimmed.is_empty() { + return None; + } + + // Try prefix symbol: ¥100, $50, €20, £10 + if let Some(result) = parse_prefix_currency(trimmed) { + return Some(result); + } + + // Try suffix with Chinese character: 100元 + if let Some(result) = parse_chinese_suffix(trimmed) { + return Some(result); + } + + None +} + +fn parse_prefix_currency(input: &str) -> Option { + let (currency, rest) = if let Some(r) = input.strip_prefix('\u{00A5}') { + // ¥ + (&RMB, r) + } else if let Some(r) = input.strip_prefix('$') { + (&DOLLAR, r) + } else if let Some(r) = input.strip_prefix('\u{20AC}') { + // € + (&EURO, r) + } else if let Some(r) = input.strip_prefix('\u{00A3}') { + // £ + (&POUND, r) + } else { + return None; + }; + + let rest = rest.trim(); + if rest.is_empty() { + return None; + } + + // Check for scale suffix: "$2.5 yi" (2.5 hundred million dollars) + let (amount_str, scale) = extract_scale(rest); + + // Without a scale suffix, the amount must be purely numeric + if scale.is_none() + && !amount_str + .chars() + .all(|c| c.is_ascii_digit() || c == '.' 
|| c == ',' || c == ' ') + { + return None; + } + + if let Some(scale_word) = scale { + // With scale: "$2.5 yi" → "er dian wu yi meiyuan" + if amount_str.contains('.') { + let parts: Vec<&str> = amount_str.splitn(2, '.').collect(); + if parts.len() == 2 { + let int_val: i64 = parts[0].parse().ok()?; + let int_words = number_to_words(int_val); + let frac_words = super::spell_digits(parts[1]); + return Some(format!( + "{} dian {} {} {}", + int_words, frac_words, scale_word, currency.unit + )); + } + } else { + // No decimal: "$50 yi" → "wu shi yi meiyuan" + let clean: String = amount_str.chars().filter(|c| c.is_ascii_digit()).collect(); + let n: i64 = clean.parse().ok()?; + let words = number_to_words(n); + return Some(format!("{} {} {}", words, scale_word, currency.unit)); + } + } + + parse_amount(amount_str, currency) +} + +/// Extract scale suffix from the amount string. +fn extract_scale(input: &str) -> (&str, Option<&str>) { + for &scale in SCALE_SUFFIXES { + if let Some(before) = input.strip_suffix(scale) { + let before = before.trim_end(); + if !before.is_empty() + && before + .chars() + .all(|c| c.is_ascii_digit() || c == '.' 
|| c == ',' || c == ' ') + { + return (before, Some(scale)); + } + } + } + (input, None) +} + +fn parse_chinese_suffix(input: &str) -> Option { + // Handle 100元 format + let amount_str = input.strip_suffix('\u{5143}')?; // 元 + let amount_str = amount_str.trim(); + + if amount_str.is_empty() { + return None; + } + + if !amount_str.chars().all(|c| c.is_ascii_digit() || c == '.') { + return None; + } + + parse_amount(amount_str, &RMB) +} + +fn parse_amount(amount_str: &str, currency: &Currency) -> Option { + if amount_str.is_empty() { + return None; + } + + if amount_str.contains('.') { + let parts: Vec<&str> = amount_str.splitn(2, '.').collect(); + if parts.len() == 2 { + let int_clean: String = parts[0].chars().filter(|c| c.is_ascii_digit()).collect(); + let main_val: i64 = if int_clean.is_empty() { + 0 + } else { + int_clean.parse().ok()? + }; + + let frac_str = parts[1].trim(); + + // For RMB, handle jiao and fen + if currency.sub_unit_tenth.is_some() { + let jiao: i64; + let fen: i64; + if frac_str.len() >= 2 { + jiao = frac_str[..1].parse().ok()?; + fen = frac_str[1..2].parse().ok()?; + } else if frac_str.len() == 1 { + jiao = frac_str.parse().ok()?; + fen = 0; + } else { + jiao = 0; + fen = 0; + } + return Some(format_rmb(main_val, jiao, fen)); + } + + // For foreign currencies, say "N dian M M unit" + if main_val == 0 && frac_str == "0" { + return Some(format!("ling {}", currency.unit)); + } + let cents: i64 = if frac_str.is_empty() { + 0 + } else if frac_str.len() == 1 { + frac_str.parse::().ok()? * 10 + } else if frac_str.len() == 2 { + frac_str.parse().ok()? + } else { + frac_str[..2].parse().ok()? 
+ }; + + if cents == 0 { + return Some(format!("{} {}", number_to_words(main_val), currency.unit)); + } + + return Some(format!( + "{} dian {} {}", + number_to_words(main_val), + super::spell_digits(&format!("{:02}", cents)), + currency.unit + )); + } + } + + let clean: String = amount_str.chars().filter(|c| c.is_ascii_digit()).collect(); + let n: i64 = clean.parse().ok()?; + Some(format!("{} {}", number_to_words(n), currency.unit)) +} + +fn format_rmb(yuan: i64, jiao: i64, fen: i64) -> String { + let mut parts: Vec = Vec::new(); + + if yuan > 0 { + parts.push(format!("{} yuan", number_to_words(yuan))); + } + + if jiao > 0 { + parts.push(format!("{} jiao", number_to_words(jiao))); + } + + if fen > 0 { + parts.push(format!("{} fen", number_to_words(fen))); + } + + if parts.is_empty() { + return "ling yuan".to_string(); + } + + parts.join(" ") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_rmb() { + assert_eq!(parse("\u{00A5}100"), Some("yi bai yuan".to_string())); + assert_eq!(parse("\u{00A5}5.50"), Some("wu yuan wu jiao".to_string())); + assert_eq!( + parse("\u{00A5}3.25"), + Some("san yuan er jiao wu fen".to_string()) + ); + assert_eq!(parse("\u{00A5}1"), Some("yi yuan".to_string())); + } + + #[test] + fn test_foreign_currencies() { + assert_eq!(parse("$100"), Some("yi bai meiyuan".to_string())); + assert_eq!(parse("\u{20AC}50"), Some("wu shi ouyuan".to_string())); + assert_eq!(parse("\u{00A3}20"), Some("er shi yingbang".to_string())); + } + + #[test] + fn test_chinese_suffix() { + assert_eq!(parse("100\u{5143}"), Some("yi bai yuan".to_string())); + } + + #[test] + fn test_dollars_and_cents() { + assert_eq!(parse("$5.50"), Some("wu dian wu ling meiyuan".to_string())); + assert_eq!(parse("$1.01"), Some("yi dian ling yi meiyuan".to_string())); + assert_eq!( + parse("$0.99"), + Some("ling dian jiu jiu meiyuan".to_string()) + ); + } + + #[test] + fn test_large_amounts() { + assert_eq!(parse("$2.5 yi"), Some("er dian wu yi meiyuan".to_string())); 
+ assert_eq!(parse("$50 wan"), Some("wu shi wan meiyuan".to_string())); + } + + #[test] + fn test_trailing_dot() { + assert_eq!(parse("$5."), Some("wu meiyuan".to_string())); + assert_eq!(parse("$1."), Some("yi meiyuan".to_string())); + } + + #[test] + fn test_non_money() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("123"), None); + assert_eq!(parse(""), None); + } +} diff --git a/src/tts/zh/ordinal.rs b/src/tts/zh/ordinal.rs new file mode 100644 index 0000000..0bc746b --- /dev/null +++ b/src/tts/zh/ordinal.rs @@ -0,0 +1,87 @@ +//! Ordinal TN tagger for Mandarin Chinese. +//! +//! Converts written ordinal numbers to spoken Mandarin pinyin: +//! - "第1" -> "di yi" +//! - "第2" -> "di er" +//! - "第100" -> "di yi bai" +//! +//! Chinese ordinals are formed by prefixing "di" (第) to the cardinal number. + +use super::number_to_words; + +/// Parse a written ordinal to spoken Mandarin pinyin words. +/// +/// Supports formats: +/// - Chinese style: "第1", "第2", "第100" +/// - English suffix style: "1st", "2nd", "3rd", "4th" (also converted to Chinese ordinals) +pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + + // Try Chinese ordinal prefix: 第N + if let Some(num_str) = trimmed.strip_prefix('\u{7B2C}') { + // 第 = U+7B2C + let num_str = num_str.trim(); + if num_str.is_empty() || !num_str.chars().all(|c| c.is_ascii_digit()) { + return None; + } + let n: i64 = num_str.parse().ok()?; + if n <= 0 { + return None; + } + return Some(format!("di {}", number_to_words(n))); + } + + // Try English ordinal suffixes: 1st, 2nd, 3rd, 4th, etc. 
+ let num_str = if let Some(s) = trimmed.strip_suffix("st") { + s + } else if let Some(s) = trimmed.strip_suffix("nd") { + s + } else if let Some(s) = trimmed.strip_suffix("rd") { + s + } else if let Some(s) = trimmed.strip_suffix("th") { + s + } else { + return None; + }; + + if num_str.is_empty() || !num_str.chars().all(|c| c.is_ascii_digit()) { + return None; + } + + let n: i64 = num_str.parse().ok()?; + if n <= 0 { + return None; + } + + Some(format!("di {}", number_to_words(n))) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_chinese_ordinal() { + assert_eq!(parse("\u{7B2C}1"), Some("di yi".to_string())); + assert_eq!(parse("\u{7B2C}2"), Some("di er".to_string())); + assert_eq!(parse("\u{7B2C}10"), Some("di shi".to_string())); + assert_eq!(parse("\u{7B2C}100"), Some("di yi bai".to_string())); + } + + #[test] + fn test_english_suffix() { + assert_eq!(parse("1st"), Some("di yi".to_string())); + assert_eq!(parse("2nd"), Some("di er".to_string())); + assert_eq!(parse("3rd"), Some("di san".to_string())); + assert_eq!(parse("4th"), Some("di si".to_string())); + assert_eq!(parse("21st"), Some("di er shi yi".to_string())); + } + + #[test] + fn test_non_ordinals() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("0th"), None); + assert_eq!(parse("\u{7B2C}0"), None); + assert_eq!(parse(""), None); + } +} diff --git a/src/tts/zh/telephone.rs b/src/tts/zh/telephone.rs new file mode 100644 index 0000000..54ff8e3 --- /dev/null +++ b/src/tts/zh/telephone.rs @@ -0,0 +1,170 @@ +//! Telephone TN tagger for Mandarin Chinese (pinyin output). +//! +//! Converts written phone numbers to spoken form in pinyin: +//! - "123-456-7890" -> "yi er san, si wu liu, qi ba jiu ling" +//! - "+86-10-1234-5678" -> "jia ba liu, yi ling, yi er san si, wu liu qi ba" +//! - "(010) 1234-5678" -> "ling yi ling, yi er san si, wu liu qi ba" + +/// Parse a written phone number to spoken form in Mandarin Chinese pinyin. 
+pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + if trimmed.is_empty() { + return None; + } + + // Phone numbers contain digits and separators (-, ., space, parens) + // Must have mostly digits + let digit_count = trimmed.chars().filter(|c| c.is_ascii_digit()).count(); + let non_digit_non_sep = trimmed + .chars() + .filter(|c| { + !c.is_ascii_digit() + && *c != '-' + && *c != '.' + && *c != ' ' + && *c != '(' + && *c != ')' + && *c != '+' + }) + .count(); + + // Must have at least 7 digits and no unexpected characters + if digit_count < 7 || non_digit_non_sep > 0 { + return None; + } + + // Must contain at least one separator (-, ., space, parens) to distinguish + // from plain numbers like "1000000" + let has_separator = trimmed + .chars() + .any(|c| c == '-' || c == '.' || c == ' ' || c == '(' || c == ')'); + if !has_separator { + return None; + } + + let mut parts: Vec = Vec::new(); + let mut has_plus = false; + + // Handle leading + + let rest = if let Some(r) = trimmed.strip_prefix('+') { + has_plus = true; + r.trim_start() + } else { + trimmed + }; + + // Split by common separators + let groups = split_phone_groups(rest); + + if has_plus && !groups.is_empty() { + // The first group after + is the country code + let mut first = String::from("jia "); + first.push_str(&spell_digit_group(&groups[0])); + parts.push(first); + for g in &groups[1..] { + parts.push(spell_digit_group(g)); + } + } else { + for g in &groups { + parts.push(spell_digit_group(g)); + } + } + + if parts.is_empty() { + return None; + } + + Some(parts.join(", ")) +} + +/// Split phone number into groups by separators. +fn split_phone_groups(input: &str) -> Vec { + let mut groups: Vec = Vec::new(); + let mut current = String::new(); + + for c in input.chars() { + match c { + '0'..='9' => current.push(c), + '-' | '.' 
| ' ' | '(' | ')' => { + if !current.is_empty() { + groups.push(current.clone()); + current.clear(); + } + } + _ => {} + } + } + + if !current.is_empty() { + groups.push(current); + } + + groups +} + +/// Spell each digit in a group using Chinese pinyin. +fn spell_digit_group(group: &str) -> String { + group + .chars() + .filter_map(|c| { + let word = match c { + '0' => "ling", + '1' => "yi", + '2' => "er", + '3' => "san", + '4' => "si", + '5' => "wu", + '6' => "liu", + '7' => "qi", + '8' => "ba", + '9' => "jiu", + _ => return None, + }; + Some(word) + }) + .collect::>() + .join(" ") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_standard_phone() { + assert_eq!( + parse("123-456-7890"), + Some("yi er san, si wu liu, qi ba jiu ling".to_string()) + ); + } + + #[test] + fn test_with_country_code() { + assert_eq!( + parse("+86-10-1234-5678"), + Some("jia ba liu, yi ling, yi er san si, wu liu qi ba".to_string()) + ); + } + + #[test] + fn test_parentheses() { + assert_eq!( + parse("(010) 1234-5678"), + Some("ling yi ling, yi er san si, wu liu qi ba".to_string()) + ); + } + + #[test] + fn test_dots() { + assert_eq!( + parse("555.123.4567"), + Some("wu wu wu, yi er san, si wu liu qi".to_string()) + ); + } + + #[test] + fn test_non_phone() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("123"), None); + } +} diff --git a/src/tts/zh/time.rs b/src/tts/zh/time.rs new file mode 100644 index 0000000..afa34ba --- /dev/null +++ b/src/tts/zh/time.rs @@ -0,0 +1,156 @@ +//! Time TN tagger for Mandarin Chinese. +//! +//! Converts written time expressions to spoken Mandarin pinyin: +//! - "14:30" -> "shi si dian san shi fen" +//! - "3:05" -> "san dian ling wu fen" +//! - "12:00" -> "shi er dian zheng" +//! +//! Format: HOUR "dian" MINUTES "fen" + +use super::number_to_words; + +/// Parse a written time expression to spoken Mandarin pinyin. 
+pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + + // Try Chinese format: 14时30分 or 14点30分 + if let Some(result) = parse_chinese_format(trimmed) { + return Some(result); + } + + // Try "14:30" colon format + if let Some(result) = parse_colon_format(trimmed) { + return Some(result); + } + + None +} + +fn parse_chinese_format(input: &str) -> Option { + // 时 (shi, U+65F6) or 点 (dian, U+70B9) as hour marker + let shi_char = '\u{65F6}'; // 时 + let dian_char = '\u{70B9}'; // 点 + let fen_char = '\u{5206}'; // 分 + + let hour_sep_pos = input.find(shi_char).or_else(|| input.find(dian_char)); + let hour_sep_pos = hour_sep_pos?; + + let hour_str = &input[..hour_sep_pos]; + if hour_str.is_empty() || !hour_str.chars().all(|c| c.is_ascii_digit()) { + return None; + } + + let hour: u32 = hour_str.parse().ok()?; + if hour > 23 { + return None; + } + + // Find the separator character to know its byte length + let sep_char = input[hour_sep_pos..].chars().next()?; + let after_sep = &input[hour_sep_pos + sep_char.len_utf8()..]; + + let min_str = after_sep.trim_end_matches(fen_char).trim(); + let minute: u32 = if min_str.is_empty() { + 0 + } else { + if !min_str.chars().all(|c| c.is_ascii_digit()) { + return None; + } + let m: u32 = min_str.parse().ok()?; + if m > 59 { + return None; + } + m + }; + + Some(format_time(hour, minute)) +} + +fn parse_colon_format(input: &str) -> Option { + if !input.contains(':') { + return None; + } + + let parts: Vec<&str> = input.splitn(2, ':').collect(); + if parts.len() != 2 { + return None; + } + + let hour_str = parts[0].trim(); + let min_str = parts[1].trim(); + + if !hour_str.chars().all(|c| c.is_ascii_digit()) || hour_str.is_empty() { + return None; + } + if !min_str.chars().all(|c| c.is_ascii_digit()) || min_str.is_empty() { + return None; + } + + let hour: u32 = hour_str.parse().ok()?; + let minute: u32 = min_str.parse().ok()?; + + if hour > 23 || minute > 59 { + return None; + } + + Some(format_time(hour, minute)) +} + +fn 
format_time(hour: u32, minute: u32) -> String { + let hour_words = number_to_words(hour as i64); + + if minute == 0 { + format!("{} dian zheng", hour_words) + } else if minute < 10 { + // "ling" as placeholder for single-digit minutes: san dian ling wu fen + format!( + "{} dian ling {} fen", + hour_words, + number_to_words(minute as i64) + ) + } else { + format!("{} dian {} fen", hour_words, number_to_words(minute as i64)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_colon_format() { + assert_eq!(parse("14:30"), Some("shi si dian san shi fen".to_string())); + assert_eq!(parse("3:05"), Some("san dian ling wu fen".to_string())); + assert_eq!(parse("12:00"), Some("shi er dian zheng".to_string())); + assert_eq!(parse("0:00"), Some("ling dian zheng".to_string())); + } + + #[test] + fn test_chinese_format() { + assert_eq!( + parse("14\u{65F6}30\u{5206}"), + Some("shi si dian san shi fen".to_string()) + ); + assert_eq!( + parse("8\u{70B9}15\u{5206}"), + Some("ba dian shi wu fen".to_string()) + ); + } + + #[test] + fn test_24h() { + assert_eq!(parse("14:00"), Some("shi si dian zheng".to_string())); + assert_eq!( + parse("23:59"), + Some("er shi san dian wu shi jiu fen".to_string()) + ); + } + + #[test] + fn test_invalid() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("25:00"), None); + assert_eq!(parse("12:60"), None); + assert_eq!(parse(""), None); + } +} diff --git a/src/tts/zh/whitelist.rs b/src/tts/zh/whitelist.rs new file mode 100644 index 0000000..a578b20 --- /dev/null +++ b/src/tts/zh/whitelist.rs @@ -0,0 +1,87 @@ +//! Whitelist TN tagger for Mandarin Chinese (pinyin output). +//! +//! Lookup table for common abbreviations with pinyin spoken forms: +//! - "Dr." -> "boshi" +//! - "Mr." -> "xiansheng" +//! - "Mrs." -> "taitai" +//! - "etc." -> "deng deng" + +use lazy_static::lazy_static; +use std::collections::HashMap; + +lazy_static! 
{ + static ref WHITELIST: HashMap<&'static str, &'static str> = { + let mut m = HashMap::new(); + // Titles + m.insert("Dr.", "boshi"); + m.insert("Dr", "boshi"); + m.insert("Mr.", "xiansheng"); + m.insert("Mr", "xiansheng"); + m.insert("Mrs.", "taitai"); + m.insert("Mrs", "taitai"); + m.insert("Ms.", "nvshi"); + m.insert("Ms", "nvshi"); + m.insert("Prof.", "jiaoshou"); + m.insert("St.", "sheng"); + m.insert("Jr.", "xiao"); + m.insert("Sr.", "lao"); + + // Common abbreviations + m.insert("etc.", "deng deng"); + m.insert("vs.", "dui"); + m.insert("vs", "dui"); + m.insert("No.", "hao"); + + // Business terms + m.insert("Inc.", "gongsi"); + m.insert("Ltd.", "youxian gongsi"); + m.insert("Co.", "gongsi"); + + m + }; +} + +/// Parse a whitelist abbreviation to its spoken form in Mandarin Chinese pinyin. +pub fn parse(input: &str) -> Option { + let trimmed = input.trim(); + + // Direct lookup (case-sensitive) + if let Some(&spoken) = WHITELIST.get(trimmed) { + return Some(spoken.to_string()); + } + + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_titles() { + assert_eq!(parse("Dr."), Some("boshi".to_string())); + assert_eq!(parse("Mr."), Some("xiansheng".to_string())); + assert_eq!(parse("Mrs."), Some("taitai".to_string())); + assert_eq!(parse("Ms."), Some("nvshi".to_string())); + } + + #[test] + fn test_abbreviations() { + assert_eq!(parse("etc."), Some("deng deng".to_string())); + assert_eq!(parse("vs."), Some("dui".to_string())); + assert_eq!(parse("No."), Some("hao".to_string())); + } + + #[test] + fn test_business() { + assert_eq!(parse("Inc."), Some("gongsi".to_string())); + assert_eq!(parse("Ltd."), Some("youxian gongsi".to_string())); + assert_eq!(parse("Co."), Some("gongsi".to_string())); + } + + #[test] + fn test_no_match() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("world"), None); + } +} diff --git a/swift/NemoTextProcessing.swift b/swift/NemoTextProcessing.swift index a1a9c79..12a501c 100644 --- 
a/swift/NemoTextProcessing.swift +++ b/swift/NemoTextProcessing.swift @@ -137,6 +137,72 @@ public enum NemoTextProcessing { return String(cString: resultPtr) } + // MARK: - Language-Specific Text Normalization + + /// Normalize written-form text to spoken form for a specific language. + /// + /// Supported languages: "en", "fr", "es", "de", "zh", "hi", "ja". + /// Falls back to English for unrecognized language codes. + /// + /// - Parameters: + /// - input: Written-form text + /// - language: ISO 639-1 language code + /// - Returns: Spoken-form text, or original if no normalization applies + /// + /// Example: + /// ```swift + /// let result = NemoTextProcessing.tnNormalize("123", language: "fr") + /// // result is "cent vingt-trois" + /// ``` + public static func tnNormalize(_ input: String, language: String) -> String { + guard let inputC = input.cString(using: .utf8), + let langC = language.cString(using: .utf8) else { + return input + } + guard let resultPtr = nemo_tn_normalize_lang(inputC, langC) else { + return input + } + defer { nemo_free_string(resultPtr) } + return String(cString: resultPtr) + } + + /// Normalize a full sentence for a specific language, replacing written-form spans with spoken form. + /// + /// - Parameters: + /// - input: Sentence containing written-form spans + /// - language: ISO 639-1 language code + /// - Returns: Sentence with written-form spans replaced with spoken form + public static func tnNormalizeSentence(_ input: String, language: String) -> String { + guard let inputC = input.cString(using: .utf8), + let langC = language.cString(using: .utf8) else { + return input + } + guard let resultPtr = nemo_tn_normalize_sentence_lang(inputC, langC) else { + return input + } + defer { nemo_free_string(resultPtr) } + return String(cString: resultPtr) + } + + /// Normalize a full sentence for a specific language with a configurable max span size. 
+ /// + /// - Parameters: + /// - input: Sentence containing written-form spans + /// - language: ISO 639-1 language code + /// - maxSpanTokens: Maximum consecutive tokens per normalizable span (default 16) + /// - Returns: Sentence with written-form spans replaced with spoken form + public static func tnNormalizeSentence(_ input: String, language: String, maxSpanTokens: UInt32) -> String { + guard let inputC = input.cString(using: .utf8), + let langC = language.cString(using: .utf8) else { + return input + } + guard let resultPtr = nemo_tn_normalize_sentence_with_max_span_lang(inputC, langC, maxSpanTokens) else { + return input + } + defer { nemo_free_string(resultPtr) } + return String(cString: resultPtr) + } + // MARK: - Custom Rules /// Add a custom spoken→written normalization rule. diff --git a/swift/include/nemo_text_processing.h b/swift/include/nemo_text_processing.h index 479ee1e..b8ec478 100644 --- a/swift/include/nemo_text_processing.h +++ b/swift/include/nemo_text_processing.h @@ -100,6 +100,39 @@ char* nemo_tn_normalize_sentence(const char* input); */ char* nemo_tn_normalize_sentence_with_max_span(const char* input, uint32_t max_span_tokens); +/** + * Text Normalization: convert written-form text to spoken form for a specific language. + * + * Supported language codes: "en", "fr", "es", "de", "zh", "hi", "ja". + * Falls back to English for unrecognized codes. + * + * @param input Null-terminated UTF-8 string of written text + * @param lang Null-terminated language code (e.g. "fr", "de") + * @return Newly allocated string with spoken form, or NULL on error. + * Must be freed with nemo_free_string(). + */ +char* nemo_tn_normalize_lang(const char* input, const char* lang); + +/** + * Text Normalization: normalize a full sentence for a specific language. + * + * @param input Null-terminated UTF-8 string + * @param lang Null-terminated language code (e.g. "fr", "de") + * @return Newly allocated string, must be freed with nemo_free_string(). 
+ */ +char* nemo_tn_normalize_sentence_lang(const char* input, const char* lang); + +/** + * Text Normalization: normalize a full sentence for a specific language + * with configurable max span size. + * + * @param input Null-terminated UTF-8 string + * @param lang Null-terminated language code (e.g. "fr", "de") + * @param max_span_tokens Maximum number of consecutive tokens per span (default 16) + * @return Newly allocated string, must be freed with nemo_free_string(). + */ +char* nemo_tn_normalize_sentence_with_max_span_lang(const char* input, const char* lang, uint32_t max_span_tokens); + /** * Free a string allocated by nemo_normalize or nemo_normalize_sentence. * diff --git a/tests/multilang_tn_tests.rs b/tests/multilang_tn_tests.rs new file mode 100644 index 0000000..b85400c --- /dev/null +++ b/tests/multilang_tn_tests.rs @@ -0,0 +1,245 @@ +//! Multi-language text normalization integration tests. +//! +//! Tests the language dispatch API: tn_normalize_lang() and tn_normalize_sentence_lang(). +//! Ensures all 6 languages are wired up and producing correct output through the public API. 
+ +use text_processing_rs::{tn_normalize_lang, tn_normalize_sentence_lang}; + +// ── French ─────────────────────────────────────────────────────────── + +#[test] +fn test_fr_cardinal() { + assert_eq!(tn_normalize_lang("123", "fr"), "cent vingt-trois"); + assert_eq!(tn_normalize_lang("0", "fr"), "zero"); + assert_eq!(tn_normalize_lang("71", "fr"), "soixante et onze"); + assert_eq!(tn_normalize_lang("80", "fr"), "quatre-vingts"); + assert_eq!(tn_normalize_lang("99", "fr"), "quatre-vingt-dix-neuf"); +} + +#[test] +fn test_fr_money() { + assert_eq!( + tn_normalize_lang("5,50 \u{20ac}", "fr"), + "cinq euros et cinquante centimes" + ); + assert_eq!(tn_normalize_lang("$1", "fr"), "un dollar"); +} + +#[test] +fn test_fr_time() { + assert_eq!(tn_normalize_lang("14:30", "fr"), "quatorze heures trente"); + assert_eq!(tn_normalize_lang("0:00", "fr"), "minuit"); + assert_eq!(tn_normalize_lang("12:00", "fr"), "midi"); +} + +#[test] +fn test_fr_sentence() { + assert_eq!( + tn_normalize_sentence_lang("Il a 123 chats", "fr"), + "Il a cent vingt-trois chats" + ); +} + +// ── Spanish ────────────────────────────────────────────────────────── + +#[test] +fn test_es_cardinal() { + assert_eq!(tn_normalize_lang("123", "es"), "ciento veintitres"); + assert_eq!(tn_normalize_lang("21", "es"), "veintiuno"); + assert_eq!(tn_normalize_lang("100", "es"), "cien"); + assert_eq!(tn_normalize_lang("500", "es"), "quinientos"); +} + +#[test] +fn test_es_money() { + assert_eq!(tn_normalize_lang("$5", "es"), "cinco dolares"); + assert_eq!(tn_normalize_lang("$1", "es"), "un dolar"); +} + +#[test] +fn test_es_time() { + assert_eq!(tn_normalize_lang("14:30", "es"), "catorce treinta"); + assert_eq!(tn_normalize_lang("0:00", "es"), "medianoche"); + assert_eq!(tn_normalize_lang("12:00", "es"), "mediodia"); +} + +#[test] +fn test_es_sentence() { + assert_eq!( + tn_normalize_sentence_lang("Tengo 21 gatos", "es"), + "Tengo veintiuno gatos" + ); +} + +// ── German 
─────────────────────────────────────────────────────────── + +#[test] +fn test_de_cardinal() { + assert_eq!(tn_normalize_lang("21", "de"), "einundzwanzig"); + assert_eq!(tn_normalize_lang("123", "de"), "einhundertdreiundzwanzig"); + assert_eq!( + tn_normalize_lang("2025", "de"), + "zweitausendfuenfundzwanzig" + ); +} + +#[test] +fn test_de_money() { + assert_eq!( + tn_normalize_lang("5,50 \u{20ac}", "de"), + "fuenf euro und fuenfzig cent" + ); +} + +#[test] +fn test_de_time() { + assert_eq!(tn_normalize_lang("14:30", "de"), "vierzehn uhr dreissig"); + assert_eq!(tn_normalize_lang("0:00", "de"), "mitternacht"); + assert_eq!(tn_normalize_lang("12:00", "de"), "mittag"); +} + +#[test] +fn test_de_sentence() { + assert_eq!( + tn_normalize_sentence_lang("Ich habe 123 Katzen", "de"), + "Ich habe einhundertdreiundzwanzig Katzen" + ); +} + +// ── Mandarin Chinese ───────────────────────────────────────────────── + +#[test] +fn test_zh_cardinal() { + assert_eq!(tn_normalize_lang("123", "zh"), "yi bai er shi san"); + assert_eq!(tn_normalize_lang("10000", "zh"), "yi wan"); + assert_eq!(tn_normalize_lang("100000000", "zh"), "yi yi"); +} + +#[test] +fn test_zh_money() { + assert_eq!(tn_normalize_lang("\u{00a5}100", "zh"), "yi bai yuan"); + assert_eq!(tn_normalize_lang("$50", "zh"), "wu shi meiyuan"); +} + +#[test] +fn test_zh_time() { + assert_eq!(tn_normalize_lang("14:30", "zh"), "shi si dian san shi fen"); + assert_eq!(tn_normalize_lang("12:00", "zh"), "shi er dian zheng"); +} + +#[test] +fn test_zh_sentence() { + assert_eq!( + tn_normalize_sentence_lang("price is $50 ok", "zh"), + "price is wu shi meiyuan ok" + ); +} + +// ── Hindi ──────────────────────────────────────────────────────────── + +#[test] +fn test_hi_cardinal() { + assert_eq!(tn_normalize_lang("123", "hi"), "ek sau teis"); + assert_eq!(tn_normalize_lang("100000", "hi"), "ek lakh"); + assert_eq!(tn_normalize_lang("10000000", "hi"), "ek crore"); +} + +#[test] +fn test_hi_money() { + 
assert_eq!(tn_normalize_lang("\u{20b9}100", "hi"), "ek sau rupaye"); +} + +#[test] +fn test_hi_time() { + assert_eq!( + tn_normalize_lang("14:30", "hi"), + "chaudah baj kar tees minat" + ); +} + +#[test] +fn test_hi_sentence() { + assert_eq!( + tn_normalize_sentence_lang("price 100 rupees", "hi"), + "price ek sau rupees" + ); +} + +// ── Japanese ───────────────────────────────────────────────────────── + +#[test] +fn test_ja_cardinal() { + assert_eq!(tn_normalize_lang("123", "ja"), "hyaku ni juu san"); + assert_eq!(tn_normalize_lang("10000", "ja"), "ichi man"); + assert_eq!(tn_normalize_lang("300", "ja"), "sanbyaku"); +} + +#[test] +fn test_ja_money() { + assert_eq!(tn_normalize_lang("\u{00a5}100", "ja"), "hyaku en"); + assert_eq!(tn_normalize_lang("$5", "ja"), "go doru"); +} + +#[test] +fn test_ja_time() { + assert_eq!(tn_normalize_lang("14:30", "ja"), "juu yo ji san juppun"); + assert_eq!(tn_normalize_lang("9:00", "ja"), "ku ji"); +} + +#[test] +fn test_ja_sentence() { + assert_eq!( + tn_normalize_sentence_lang("price is $5 ok", "ja"), + "price is go doru ok" + ); +} + +// ── Cross-language dispatch ────────────────────────────────────────── + +#[test] +fn test_same_input_different_languages() { + // "123" should produce different output per language + let results: Vec<(&str, String)> = ["en", "fr", "es", "de", "zh", "hi", "ja"] + .iter() + .map(|lang| (*lang, tn_normalize_lang("123", lang))) + .collect(); + + assert_eq!(results[0].1, "one hundred twenty three"); // en + assert_eq!(results[1].1, "cent vingt-trois"); // fr + assert_eq!(results[2].1, "ciento veintitres"); // es + assert_eq!(results[3].1, "einhundertdreiundzwanzig"); // de + assert_eq!(results[4].1, "yi bai er shi san"); // zh + assert_eq!(results[5].1, "ek sau teis"); // hi + assert_eq!(results[6].1, "hyaku ni juu san"); // ja +} + +#[test] +fn test_unknown_lang_falls_back_to_english() { + assert_eq!(tn_normalize_lang("123", "xx"), "one hundred twenty three"); + assert_eq!(tn_normalize_lang("123", 
""), "one hundred twenty three"); +} + +#[test] +fn test_sentence_lang_passthrough() { + // Non-normalizable text should pass through unchanged for all languages + for lang in &["fr", "es", "de", "zh", "hi", "ja"] { + assert_eq!( + tn_normalize_sentence_lang("hello world", lang), + "hello world", + "passthrough failed for lang={}", + lang + ); + } +} + +#[test] +fn test_sentence_lang_empty() { + for lang in &["fr", "es", "de", "zh", "hi", "ja"] { + assert_eq!( + tn_normalize_sentence_lang("", lang), + "", + "empty failed for lang={}", + lang + ); + } +}