diff --git a/src/asr/de/cardinal.rs b/src/asr/de/cardinal.rs new file mode 100644 index 0000000..dfd3104 --- /dev/null +++ b/src/asr/de/cardinal.rs @@ -0,0 +1,447 @@ +//! Cardinal number tagger for German. +//! +//! Converts spoken German number words to digits: +//! - "einhundert" → "100" +//! - "einundzwanzig" → "21" (reversed tens) +//! - "minus fünfundzwanzigtausendsiebenunddreißig" → "-25037" +//! - "eine million" → "1000000" + +use lazy_static::lazy_static; +use std::collections::HashMap; + +lazy_static! { + /// Single digit and special number words + static ref ONES: HashMap<&'static str, i64> = { + let mut m = HashMap::new(); + m.insert("null", 0); + m.insert("eins", 1); + m.insert("ein", 1); + m.insert("eine", 1); + m.insert("einer", 1); + m.insert("zwei", 2); + m.insert("drei", 3); + m.insert("vier", 4); + m.insert("fünf", 5); + m.insert("sechs", 6); + m.insert("sieben", 7); + m.insert("acht", 8); + m.insert("neun", 9); + m.insert("zehn", 10); + m.insert("elf", 11); + m.insert("zwölf", 12); + m.insert("dreizehn", 13); + m.insert("vierzehn", 14); + m.insert("fünfzehn", 15); + m.insert("sechzehn", 16); + m.insert("siebzehn", 17); + m.insert("achtzehn", 18); + m.insert("neunzehn", 19); + m + }; + + /// Tens + static ref TENS: HashMap<&'static str, i64> = { + let mut m = HashMap::new(); + m.insert("zwanzig", 20); + m.insert("dreißig", 30); + m.insert("dreissig", 30); + m.insert("vierzig", 40); + m.insert("fünfzig", 50); + m.insert("sechzig", 60); + m.insert("siebzig", 70); + m.insert("achtzig", 80); + m.insert("neunzig", 90); + m + }; + + /// Scale words (long scale for German) + static ref SCALES: HashMap<&'static str, i128> = { + let mut m = HashMap::new(); + m.insert("hundert", 100); + m.insert("tausend", 1_000); + m.insert("million", 1_000_000); + m.insert("millionen", 1_000_000); + m.insert("milliarde", 1_000_000_000); + m.insert("milliarden", 1_000_000_000); + m.insert("billion", 1_000_000_000_000); + m.insert("billionen", 1_000_000_000_000); + 
m.insert("billiarde", 1_000_000_000_000_000); + m.insert("billiarden", 1_000_000_000_000_000); + m.insert("trillion", 1_000_000_000_000_000_000); + m.insert("trillionen", 1_000_000_000_000_000_000); + m + }; + + /// Small numbers that pass through as words (0-9) + static ref PASSTHROUGH: Vec<&'static str> = vec![ + "null", "eins", "ein", "eine", "einer", "zwei", "drei", + "vier", "fünf", "sechs", "sieben", "acht", "neun", + ]; +} + +/// Parse spoken German cardinal number to string representation. +pub fn parse(input: &str) -> Option { + let input_lower = input.to_lowercase(); + let input_trim = input_lower.trim(); + + if input_trim.is_empty() { + return None; + } + + // Pass-through single small numbers (0-9) + if PASSTHROUGH.contains(&input_trim) { + return Some(input_trim.to_string()); + } + + // Don't parse space-separated sequences of plain digit/ones words + // without any scale words (hundert, tausend, million, etc.) or "und" + // This prevents catching phone number digit sequences like + // "null vier eins eins eins zwei drei vier" + if input_trim.contains(' ') && !contains_structure_word(input_trim) { + return None; + } + + // Check for negative + let (is_negative, rest) = if input_trim.starts_with("minus ") { + (true, input_trim.strip_prefix("minus ")?) 
+ } else { + (false, input_trim) + }; + + let num = words_to_number(rest)?; + + if is_negative { + Some(format!("-{}", num)) + } else { + Some(num.to_string()) + } +} + +/// Check if input contains structure words that indicate a compound number +/// (not just a list of digit words) +fn contains_structure_word(input: &str) -> bool { + let structure_words = [ + "hundert", + "tausend", + "million", + "millionen", + "milliarde", + "milliarden", + "billion", + "billionen", + "billiarde", + "billiarden", + "trillion", + "trillionen", + "und", + "minus", + ]; + let tokens: Vec<&str> = input.split_whitespace().collect(); + tokens + .iter() + .any(|t| structure_words.contains(t) || contains_compound_structure(t)) +} + +/// Check if a compound word contains scale words +fn contains_compound_structure(word: &str) -> bool { + let scale_fragments = [ + "hundert", "tausend", "million", "milliard", "billion", "billiard", "trillion", "und", + ]; + // Only check if the word is longer than any known simple word + if word.len() <= 9 { + // "neunzehn" is 8 chars, "sechzehn" is 8 + return false; + } + scale_fragments.iter().any(|&f| word.contains(f)) +} + +/// Convert German number words to a number. +/// Handles both spaced and compound forms. 
+/// +/// Uses a multi-level accumulator: +/// - `result`: flushed value from million+ scale words +/// - `thousands`: value accumulated at the thousands level +/// - `sub`: current ones/tens/hundreds accumulator +pub fn words_to_number(input: &str) -> Option { + let normalized = decompose_compound(input); + let normalized = normalized.replace(" und ", " ").replace(" ", " "); + + let tokens: Vec<&str> = normalized.split_whitespace().collect(); + if tokens.is_empty() { + return None; + } + + // Check if it's just a single pass-through word + if tokens.len() == 1 { + if let Some(&val) = ONES.get(tokens[0]) { + if val == 0 { + return None; // "null" should not return 0 from words_to_number + } + return Some(val as i128); + } + if let Some(&val) = TENS.get(tokens[0]) { + return Some(val as i128); + } + return None; + } + + let mut result: i128 = 0; // million+ level + let mut thousands: i128 = 0; // thousands level + let mut sub: i128 = 0; // ones/tens/hundreds accumulator + + for token in &tokens { + if let Some(&scale) = SCALES.get(token) { + if scale == 100 { + // hundert multiplies sub or assumes 1 + if sub == 0 { + sub = 100; + } else { + sub *= 100; + } + } else if scale == 1000 { + // tausend: flush sub into thousands + if sub == 0 { + sub = 1; + } + thousands += sub * 1000; + sub = 0; + } else { + // million, milliarde, billion, etc. + // flush sub + thousands into multiplier for this scale + let multiplier = thousands + sub; + let multiplier = if multiplier == 0 { 1 } else { multiplier }; + result += multiplier * scale; + thousands = 0; + sub = 0; + } + } else if let Some(&val) = ONES.get(token) { + sub += val as i128; + } else if let Some(&val) = TENS.get(token) { + sub += val as i128; + } else { + return None; // Unknown word + } + } + + result += thousands + sub; + + if result == 0 { + None + } else { + Some(result) + } +} + +/// Public wrapper for decompose_compound, used by date parser for year patterns. 
+pub fn decompose_compound_public(input: &str) -> String { + decompose_compound(input) +} + +/// Decompose German compound number words into space-separated tokens. +/// +/// E.g., "einhundertzwei" → "ein hundert zwei" +/// "fünfundzwanzigtausendsiebenunddreißig" → "fünf und zwanzig tausend sieben und dreißig" +fn decompose_compound(input: &str) -> String { + // First, normalize the input by replacing hyphens with spaces + let input = input.replace('-', " "); + + // Process each space-separated token + let tokens: Vec<&str> = input.split_whitespace().collect(); + let mut result_parts: Vec = Vec::new(); + + for token in tokens { + // Check if the token is already a known word + if is_known_word(token) { + result_parts.push(token.to_string()); + continue; + } + + // Try to decompose the compound word + if let Some(decomposed) = decompose_single_compound(token) { + result_parts.push(decomposed); + } else { + result_parts.push(token.to_string()); + } + } + + result_parts.join(" ") +} + +/// Check if a token is a known number word +fn is_known_word(token: &str) -> bool { + ONES.contains_key(token) + || TENS.contains_key(token) + || SCALES.contains_key(token) + || token == "und" + || token == "minus" +} + +/// Decompose a single compound German number word. 
+fn decompose_single_compound(word: &str) -> Option { + let mut remaining = word.to_string(); + let mut parts: Vec = Vec::new(); + + while !remaining.is_empty() { + let mut found = false; + + // Try scale words first (longest match) + let scale_words = [ + "trillionen", + "trillion", + "billiarden", + "billiarde", + "billionen", + "billion", + "milliarden", + "milliarde", + "millionen", + "million", + "tausend", + "hundert", + ]; + + for &sw in &scale_words { + if remaining.starts_with(sw) { + parts.push(sw.to_string()); + remaining = remaining[sw.len()..].to_string(); + found = true; + break; + } + } + if found { + continue; + } + + // Try "und" connector + if remaining.starts_with("und") { + parts.push("und".to_string()); + remaining = remaining[3..].to_string(); + continue; + } + + // Try teens and special words (longest first) + let teen_words = [ + "neunzehn", + "achtzehn", + "siebzehn", + "sechzehn", + "fünfzehn", + "vierzehn", + "dreizehn", + "zwölf", + "elf", + ]; + for &tw in &teen_words { + if remaining.starts_with(tw) { + parts.push(tw.to_string()); + remaining = remaining[tw.len()..].to_string(); + found = true; + break; + } + } + if found { + continue; + } + + // Try tens (longest first) + let tens_words = [ + "neunzig", "achtzig", "siebzig", "sechzig", "fünfzig", "vierzig", "dreißig", + "dreissig", "zwanzig", + ]; + for &tw in &tens_words { + if remaining.starts_with(tw) { + parts.push(tw.to_string()); + remaining = remaining[tw.len()..].to_string(); + found = true; + break; + } + } + if found { + continue; + } + + // Try ones (check longer words first to avoid partial matches) + let ones_words = [ + "sieben", "einer", "eine", "eins", "ein", "neun", "acht", "fünf", "vier", "drei", + "zwei", "sechs", "zehn", "null", + ]; + for &ow in &ones_words { + if remaining.starts_with(ow) { + parts.push(ow.to_string()); + remaining = remaining[ow.len()..].to_string(); + found = true; + break; + } + } + if found { + continue; + } + + // Unknown character sequence 
- not a valid compound number + return None; + } + + if parts.len() > 1 { + Some(parts.join(" ")) + } else if parts.len() == 1 { + Some(parts[0].clone()) + } else { + None + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_passthrough() { + assert_eq!(parse("null"), Some("null".to_string())); + assert_eq!(parse("eins"), Some("eins".to_string())); + assert_eq!(parse("ein"), Some("ein".to_string())); + assert_eq!(parse("eine"), Some("eine".to_string())); + assert_eq!(parse("einer"), Some("einer".to_string())); + assert_eq!(parse("zwei"), Some("zwei".to_string())); + assert_eq!(parse("neun"), Some("neun".to_string())); + } + + #[test] + fn test_teens() { + assert_eq!(parse("zehn"), Some("10".to_string())); + assert_eq!(parse("elf"), Some("11".to_string())); + assert_eq!(parse("zwölf"), Some("12".to_string())); + assert_eq!(parse("achtzehn"), Some("18".to_string())); + } + + #[test] + fn test_tens() { + assert_eq!(parse("zwanzig"), Some("20".to_string())); + assert_eq!(parse("dreißig"), Some("30".to_string())); + assert_eq!(parse("neunzig"), Some("90".to_string())); + } + + #[test] + fn test_hundreds() { + assert_eq!(parse("einhundert"), Some("100".to_string())); + assert_eq!(parse("ein hundert"), Some("100".to_string())); + assert_eq!(parse("einhundertzwei"), Some("102".to_string())); + } + + #[test] + fn test_compound() { + assert_eq!(parse("einundzwanzig"), Some("21".to_string())); + assert_eq!(parse("eintausend"), Some("1000".to_string())); + assert_eq!(parse("eintausendzwanzig"), Some("1020".to_string())); + } + + #[test] + fn test_negative() { + assert_eq!(parse("minus sechzig"), Some("-60".to_string())); + } + + #[test] + fn test_large() { + assert_eq!(parse("eine million"), Some("1000000".to_string())); + assert_eq!(parse("zwei millionen drei"), Some("2000003".to_string())); + } +} diff --git a/src/asr/de/date.rs b/src/asr/de/date.rs new file mode 100644 index 0000000..cd014e6 --- /dev/null +++ b/src/asr/de/date.rs @@ -0,0 +1,275 @@ +//! 
Date tagger for German. +//! +//! Converts spoken German date expressions to written form: +//! - "vierundzwanzigster juli zwei tausend dreizehn" → "24. Jul. 2013" +//! - "neunzehn achtzig" → "1980" +//! - "januar zweitausendneun" → "Jan. 2009" +//! - "vierzehnter januar" → "14. Jan." + +use super::cardinal; + +const MONTHS: [(&str, &str); 12] = [ + ("januar", "Jan."), + ("februar", "Feb."), + ("märz", "Mär."), + ("april", "Apr."), + ("mai", "Mai"), + ("juni", "Jun."), + ("juli", "Jul."), + ("august", "Aug."), + ("september", "Sep."), + ("oktober", "Okt."), + ("november", "Nov."), + ("dezember", "Dez."), +]; + +/// Parse spoken German date expression to written form. +pub fn parse(input: &str) -> Option { + let input_lower = input.to_lowercase(); + let input_trim = input_lower.trim(); + + // Try full date: "vierundzwanzigster juli zwei tausend dreizehn" + if let Some(result) = parse_full_date(input_trim) { + return Some(result); + } + + // Try day + month: "vierzehnter januar" + if let Some(result) = parse_day_month(input_trim) { + return Some(result); + } + + // Try month + year: "januar zweitausendneun" + if let Some(result) = parse_month_year(input_trim) { + return Some(result); + } + + // Try year patterns: "neunzehn achtzig" → 1980 + if let Some(result) = parse_year_pattern(input_trim) { + return Some(result); + } + + None +} + +/// Parse full date: "Nter MONAT JAHR" +fn parse_full_date(input: &str) -> Option { + for &(month_name, month_abbr) in &MONTHS { + if let Some(pos) = input.find(month_name) { + let before = input[..pos].trim(); + let after = input[pos + month_name.len()..].trim(); + + // Parse day (ordinal) before month + let day = parse_ordinal_day(before)?; + if day < 1 || day > 31 { + return None; + } + + // Parse year after month + if after.is_empty() { + return None; // This is day+month, handled by parse_day_month + } + + let year = parse_year(after)?; + + return Some(format!("{}. 
{} {}", day, month_abbr, year)); + } + } + None +} + +/// Parse day + month: "vierzehnter januar" → "14. Jan." +fn parse_day_month(input: &str) -> Option { + for &(month_name, month_abbr) in &MONTHS { + if input.ends_with(month_name) { + let before = input[..input.len() - month_name.len()].trim(); + let day = parse_ordinal_day(before)?; + if day < 1 || day > 31 { + return None; + } + return Some(format!("{}. {}", day, month_abbr)); + } + } + None +} + +/// Parse month + year: "januar zweitausendneun" → "Jan. 2009" +fn parse_month_year(input: &str) -> Option { + for &(month_name, month_abbr) in &MONTHS { + if input.starts_with(month_name) { + let after = input[month_name.len()..].trim(); + if after.is_empty() { + continue; + } + // Reject compound: "januarzweitausendneun" (no space) + if !input.contains(' ') { + return None; + } + let year = parse_year(after)?; + return Some(format!("{} {}", month_abbr, year)); + } + } + None +} + +/// Parse year patterns: +/// - "neunzehn achtzig" → 1980 +/// - "neunzehnhundertachtzig" → 1980 +/// - "zwei tausend zwanzig" → 2020 +/// - "zwanzig zwanzig" → 2020 +fn parse_year_pattern(input: &str) -> Option { + // Reject if contains "achtziger" etc. 
(decade reference, not year) + if input.ends_with("iger") || input.ends_with("er") { + // Check if it ends with a decade suffix + let decade_suffixes = [ + "achtziger", + "siebziger", + "sechziger", + "fünfziger", + "vierziger", + "dreißiger", + "zwanziger", + "neunziger", + ]; + for &suffix in &decade_suffixes { + if input.ends_with(suffix) { + // This is "neunzehn achtziger" → "19 achtziger" + let before = input[..input.len() - suffix.len()].trim(); + if !before.is_empty() { + let num = cardinal::words_to_number(before)?; + return Some(format!("{} {}", num, suffix)); + } + return None; + } + } + } + + let year = parse_year(input)?; + Some(year.to_string()) +} + +/// Parse a year value from German words +fn parse_year(input: &str) -> Option { + // Try direct cardinal parsing first + if let Some(num) = cardinal::words_to_number(input) { + if num >= 1000 && num <= 9999 { + return Some(num); + } + } + + // Try "CENTURY DECADE" pattern: "neunzehn achtzig" → 1980 + // Also handles compound form: "neunzehnachtzig" → decompose → "neunzehn achtzig" + // And spaced compound decades: "neunzehn vierundneunzig" → 1994 + + // First try with original whitespace-split tokens + let tokens: Vec<&str> = input.split_whitespace().collect(); + if tokens.len() == 2 { + if let Some(century) = cardinal::words_to_number(tokens[0]) { + if let Some(decade) = cardinal::words_to_number(tokens[1]) { + if century >= 10 && century <= 99 && decade >= 0 && decade <= 99 { + let year = century * 100 + decade; + if year >= 1000 && year <= 9999 { + return Some(year); + } + } + } + } + } + + // Try compound form: "neunzehnachtzig" → decompose → "neunzehn achtzig" + if tokens.len() == 1 { + let decomposed = cardinal::decompose_compound_public(input); + let dtokens: Vec<&str> = decomposed.split_whitespace().collect(); + if dtokens.len() == 2 { + if let Some(century) = cardinal::words_to_number(dtokens[0]) { + if let Some(decade) = cardinal::words_to_number(dtokens[1]) { + if century >= 10 && century <= 
99 && decade >= 0 && decade <= 99 { + let year = century * 100 + decade; + if year >= 1000 && year <= 9999 { + return Some(year); + } + } + } + } + } + } + + None +} + +/// Parse ordinal day number from German ordinal word. +/// "erster" → 1, "vierundzwanzigster" → 24, "dreißigster" → 30 +fn parse_ordinal_day(input: &str) -> Option { + // Strip ordinal suffix + let ordinal_suffixes = [ + "ster", "sten", "stem", "stes", "ste", "ter", "ten", "tem", "tes", "te", + ]; + + for &suffix in &ordinal_suffixes { + if input.ends_with(suffix) { + let stem = &input[..input.len() - suffix.len()]; + // Reconstruct the cardinal form + let cardinal_form = reconstruct_cardinal_from_ordinal(stem); + return cardinal::words_to_number(&cardinal_form); + } + } + + None +} + +/// Reconstruct cardinal form from ordinal stem. +/// "er" → "eins" (from "erster"), "vierundzwanzig" stays, etc. +fn reconstruct_cardinal_from_ordinal(stem: &str) -> String { + match stem { + "er" | "ers" => "eins".to_string(), + "zwei" => "zwei".to_string(), + "drit" => "drei".to_string(), + "vier" => "vier".to_string(), + "fünf" => "fünf".to_string(), + "sechs" => "sechs".to_string(), + "sieb" => "sieben".to_string(), + "ach" => "acht".to_string(), + "neun" => "neun".to_string(), + "zehn" => "zehn".to_string(), + "elf" => "elf".to_string(), + "zwölf" => "zwölf".to_string(), + _ => { + // For compound ordinals, the stem is already the cardinal form + // e.g., "vierundzwanzig" from "vierundzwanzigster" + // But we need to handle "hundert" → "hundert" (from "hundertste") + stem.to_string() + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_full_date() { + assert_eq!( + parse("vierundzwanzigster juli zwei tausend dreizehn"), + Some("24. Jul. 2013".to_string()) + ); + } + + #[test] + fn test_day_month() { + assert_eq!(parse("vierzehnter januar"), Some("14. Jan.".to_string())); + assert_eq!(parse("erster januar"), Some("1. 
Jan.".to_string())); + } + + #[test] + fn test_year() { + assert_eq!(parse("neunzehn achtzig"), Some("1980".to_string())); + assert_eq!(parse("zwei tausend zwanzig"), Some("2020".to_string())); + } + + #[test] + fn test_month_year() { + assert_eq!( + parse("januar zweitausendneun"), + Some("Jan. 2009".to_string()) + ); + } +} diff --git a/src/asr/de/decimal.rs b/src/asr/de/decimal.rs new file mode 100644 index 0000000..cf3cd7d --- /dev/null +++ b/src/asr/de/decimal.rs @@ -0,0 +1,188 @@ +//! Decimal number tagger for German. +//! +//! Converts spoken German decimal numbers to written form: +//! - "eins komma zwei millionen" → "1,2 millionen" +//! - "minus sechzig komma zwei vier null null" → "-60,2400" +//! - "acht hundert achtzehn komma drei null drei" → "818,303" + +use super::cardinal; + +/// Parse spoken German decimal number to written form. +pub fn parse(input: &str) -> Option { + let input_lower = input.to_lowercase(); + let input_trim = input_lower.trim(); + + if !input_trim.contains("komma") { + // Check for scale-only patterns: "eine million" → "1 million", etc. + return parse_scale_only(input_trim); + } + + // Check for negative + let (is_negative, rest) = if input_trim.starts_with("minus ") { + (true, input_trim.strip_prefix("minus ")?) 
+ } else { + (false, input_trim) + }; + + // Split on "komma" + let parts: Vec<&str> = rest.splitn(2, "komma").collect(); + if parts.len() != 2 { + return None; + } + + let integer_part = parts[0].trim(); + let decimal_rest = parts[1].trim(); + + // Parse integer part + let int_value = if integer_part.is_empty() || integer_part == "null" { + "0".to_string() + } else { + let num = cardinal::words_to_number(integer_part)?; + num.to_string() + }; + + // Check for scale suffix in decimal part + let scale_words = [ + "millionen", + "million", + "milliarden", + "milliarde", + "billionen", + "billion", + "billiarden", + "billiarde", + "trillionen", + "trillion", + "tausend", + ]; + + let mut scale_suffix = None; + let mut decimal_digits_str = decimal_rest.to_string(); + + for &sw in &scale_words { + if decimal_rest.ends_with(sw) { + let before = decimal_rest[..decimal_rest.len() - sw.len()].trim(); + decimal_digits_str = before.to_string(); + scale_suffix = Some(sw); + break; + } + } + + // Parse decimal digits + let decimal_digits = parse_decimal_digits(&decimal_digits_str)?; + + let sign = if is_negative { "-" } else { "" }; + + if let Some(scale) = scale_suffix { + Some(format!( + "{}{},{} {}", + sign, int_value, decimal_digits, scale + )) + } else { + Some(format!("{}{},{}", sign, int_value, decimal_digits)) + } +} + +/// Parse scale-only patterns: "eine million" → "1 million" +fn parse_scale_only(input: &str) -> Option { + let scale_patterns = [ + ("millionen", "millionen"), + ("million", "million"), + ("milliarden", "milliarden"), + ("milliarde", "milliarde"), + ("billionen", "billionen"), + ("billion", "billion"), + ]; + + for &(spoken, written) in &scale_patterns { + if input.ends_with(spoken) { + let num_part = input[..input.len() - spoken.len()].trim(); + if num_part.is_empty() { + continue; + } + let num = cardinal::words_to_number(num_part)?; + return Some(format!("{} {}", num, written)); + } + } + + None +} + +/// Parse decimal digit words to digit string. 
+/// "zwei vier null null" → "2400" +/// "drei null drei" → "303" +fn parse_decimal_digits(input: &str) -> Option { + let digit_map = [ + ("null", "0"), + ("eins", "1"), + ("ein", "1"), + ("zwei", "2"), + ("drei", "3"), + ("vier", "4"), + ("fünf", "5"), + ("sechs", "6"), + ("sieben", "7"), + ("acht", "8"), + ("neun", "9"), + ]; + + let tokens: Vec<&str> = input.split_whitespace().collect(); + if tokens.is_empty() { + return None; + } + + let mut result = String::new(); + for token in &tokens { + let mut found = false; + for &(word, digit) in &digit_map { + if token == &word { + result.push_str(digit); + found = true; + break; + } + } + if !found { + return None; + } + } + + if result.is_empty() { + None + } else { + Some(result) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic_decimal() { + assert_eq!( + parse("acht hundert achtzehn komma drei null drei"), + Some("818,303".to_string()) + ); + } + + #[test] + fn test_negative() { + assert_eq!( + parse("minus sechzig komma zwei vier null null"), + Some("-60,2400".to_string()) + ); + } + + #[test] + fn test_scale() { + assert_eq!( + parse("eins komma zwei millionen"), + Some("1,2 millionen".to_string()) + ); + } + + #[test] + fn test_scale_only() { + assert_eq!(parse("eine million"), Some("1 million".to_string())); + } +} diff --git a/src/asr/de/electronic.rs b/src/asr/de/electronic.rs new file mode 100644 index 0000000..450dd25 --- /dev/null +++ b/src/asr/de/electronic.rs @@ -0,0 +1,115 @@ +//! Electronic tagger for German. +//! +//! Converts spoken German email/URL descriptions to written form: +//! - "a b c at g mail punkt com" → "abc@gmail.com" +//! - "h t t p s doppelpunkt slash slash w w w punkt a b c punkt com" → "https://www.abc.com" + +/// Parse spoken German electronic address to written form. 
+pub fn parse(input: &str) -> Option { + let input_lower = input.to_lowercase(); + let input_trim = input_lower.trim(); + + // Must contain "at" (email) or "doppelpunkt" or "punkt" (URL) + if !input_trim.contains(" at ") + && !input_trim.contains("doppelpunkt") + && !input_trim.contains(" punkt ") + { + return None; + } + + let tokens: Vec<&str> = input_trim.split_whitespace().collect(); + if tokens.len() < 3 { + return None; + } + + // Convert tokens to characters/symbols + let mut result = String::new(); + let mut i = 0; + + while i < tokens.len() { + let token = tokens[i]; + match token { + "at" => result.push('@'), + "punkt" => result.push('.'), + "bindestrich" => result.push('-'), + "unterstrich" => result.push('_'), + "doppelpunkt" => result.push(':'), + "slash" => result.push('/'), + "fragezeichen" => result.push('?'), + "gleichheitszeichen" => result.push('='), + "tilde" => result.push('~'), + _ => { + // Single letter + if token.len() == 1 && token.chars().all(|c| c.is_ascii_alphabetic()) { + result.push_str(token); + } else if let Some(digit) = word_to_digit(token) { + result.push_str(digit); + } else { + // Multi-char token that's not a keyword - treat as literal + result.push_str(token); + } + } + } + i += 1; + } + + if result.is_empty() { + None + } else { + Some(result) + } +} + +/// Convert German digit word to digit string +fn word_to_digit(word: &str) -> Option<&'static str> { + match word { + "null" => Some("0"), + "eins" | "ein" | "eine" => Some("1"), + "zwei" => Some("2"), + "drei" => Some("3"), + "vier" => Some("4"), + "fünf" => Some("5"), + "sechs" => Some("6"), + "sieben" => Some("7"), + "acht" => Some("8"), + "neun" => Some("9"), + _ => None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_email() { + assert_eq!( + parse("a b c at g mail punkt com"), + Some("abc@gmail.com".to_string()) + ); + assert_eq!( + parse("a b c at a b c punkt com"), + Some("abc@abc.com".to_string()) + ); + } + + #[test] + fn 
test_email_with_digits() { + assert_eq!( + parse("a eins b zwei at a b c punkt com"), + Some("a1b2@abc.com".to_string()) + ); + } + + #[test] + fn test_url() { + assert_eq!( + parse("h t t p s doppelpunkt slash slash w w w punkt a b c punkt com"), + Some("https://www.abc.com".to_string()) + ); + assert_eq!( + parse("w w w punkt a b c punkt com"), + Some("www.abc.com".to_string()) + ); + } +} diff --git a/src/asr/de/fraction.rs b/src/asr/de/fraction.rs new file mode 100644 index 0000000..d7716c8 --- /dev/null +++ b/src/asr/de/fraction.rs @@ -0,0 +1,224 @@ +//! Fraction tagger for German. +//! +//! Converts spoken German fractions to written form: +//! - "ein halb" → "1/2" +//! - "ein drittel" → "1/3" +//! - "ein ein halb" → "1 1/2" +//! - "minus ein zwei und zwanzigstel" → "-1/22" + +use super::cardinal; + +/// Parse spoken German fraction to written form. +pub fn parse(input: &str) -> Option { + let input_lower = input.to_lowercase(); + let input_trim = input_lower.trim(); + + // Check for negative + let (is_negative, rest) = if input_trim.starts_with("minus ") { + (true, input_trim.strip_prefix("minus ")?) + } else { + (false, input_trim) + }; + + let sign = if is_negative { "-" } else { "" }; + + // Try simple fraction first: "ein halb" → "1/2" + // This also handles compound denominators: "ein zwei und zwanzigstel" → "1/22" + // and "ein ein hundertstel" → "1/100" (compound denom "ein hundertstel" = 100) + if let Some(result) = parse_simple_fraction(rest) { + return Some(format!("{}{}", sign, result)); + } + + // Try mixed fraction: "ein ein halb" → "1 1/2" + if let Some(result) = parse_mixed_fraction(rest) { + return Some(format!("{}{}", sign, result)); + } + + None +} + +/// Parse mixed fraction: "ein ein halb" → "1 1/2" +/// Only matches when the fraction part uses a simple (single-word) denominator. 
+/// Compound denominators like "ein hundertstel" are left to parse_simple_fraction +/// so that "ein ein hundertstel" parses as "1/100" (numer=1, denom="ein hundertstel"=100). +fn parse_mixed_fraction(input: &str) -> Option { + let tokens: Vec<&str> = input.split_whitespace().collect(); + if tokens.len() < 3 { + return None; + } + + // Only try mixed when the last token is a simple denominator word + let last = *tokens.last()?; + if parse_denominator(last).is_none() { + return None; + } + + // The fraction part is exactly 2 tokens: "NUMER DENOM" + // E.g., "ein halb", "zwei drittel" + if tokens.len() >= 3 { + let frac_part = tokens[tokens.len() - 2..].join(" "); + if let Some(frac) = parse_simple_fraction(&frac_part) { + let whole_part = tokens[..tokens.len() - 2].join(" "); + let whole = cardinal::words_to_number(&whole_part)?; + return Some(format!("{} {}", whole, frac)); + } + } + + None +} + +/// Parse simple fraction: "ein halb" → "1/2", "vier halbe" → "4/2" +fn parse_simple_fraction(input: &str) -> Option { + let tokens: Vec<&str> = input.split_whitespace().collect(); + if tokens.is_empty() { + return None; + } + + let last = *tokens.last()?; + let last_idx = tokens.len() - 1; + + // Try compound denominator FIRST (handles "ein hundertstel", "zwei und zwanzigstel") + // This takes priority because "hundertstel" as a simple denom = 100, but + // "ein hundertstel" as compound denom = 100 with the "ein" being part of the denom + if last.ends_with("stel") + || last.ends_with("halb") + || last.ends_with("halbe") + || last.ends_with("halbes") + || last.ends_with("halber") + || last.ends_with("halben") + { + // Try compound denominators with increasing scope + for j in 1..=last_idx { + let denom_str = tokens[j..].join(" "); + if let Some(denom) = parse_compound_denominator(&denom_str) { + let numer_tokens = &tokens[..j]; + if numer_tokens.is_empty() { + continue; + } + let numer_str = numer_tokens.join(" "); + if let Some(numer) = parse_numerator(&numer_str) { + 
return Some(format!("{}/{}", numer, denom)); + } + } + } + } + + // Simple denominator: last token is a known fraction word. + // Only accept single-token numerators here to avoid "ein ein" → 2 misparse. + // Multi-token numerators with simple denoms go through mixed fraction instead. + if let Some(denom) = parse_denominator(last) { + if last_idx == 1 { + // Exactly one numerator token + let numer_str = tokens[0]; + if let Some(numer) = parse_numerator(numer_str) { + return Some(format!("{}/{}", numer, denom)); + } + } + } + + None +} + +/// Parse a numerator (number word or "null") +fn parse_numerator(input: &str) -> Option { + if input == "null" { + return Some(0); + } + cardinal::words_to_number(input) +} + +/// Parse a denominator word to its numeric value +fn parse_denominator(word: &str) -> Option { + match word { + "halb" | "halbe" | "halbes" | "halber" | "halben" | "halbem" => Some(2), + "drittel" | "drittels" => Some(3), + "viertel" | "viertels" => Some(4), + "fünftel" | "fünftels" => Some(5), + "sechstel" | "sechstels" => Some(6), + "siebtel" | "siebtels" => Some(7), + "achtel" | "achtels" => Some(8), + "neuntel" | "neuntels" => Some(9), + "zehntel" | "zehntels" => Some(10), + "elftel" | "elftels" => Some(11), + "zwölftel" | "zwölftels" => Some(12), + "dreizehntel" => Some(13), + "vierzehntel" => Some(14), + "fünfzehntel" => Some(15), + "sechzehntel" => Some(16), + "siebzehntel" => Some(17), + "achtzehntel" => Some(18), + "neunzehntel" => Some(19), + "zwanzigstel" => Some(20), + "dreißigstel" | "dreissigstel" => Some(30), + "vierzigstel" => Some(40), + "fünfzigstel" => Some(50), + "sechzigstel" => Some(60), + "siebzigstel" => Some(70), + "achtzigstel" => Some(80), + "neunzigstel" => Some(90), + "hundertstel" => Some(100), + "nulltel" => Some(0), + _ => None, + } +} + +/// Parse compound denominator: "zwei und zwanzigstel" → 22 +/// Only handles multi-token denominators. Single-token denominators +/// are handled by parse_denominator in the simple path. 
+fn parse_compound_denominator(input: &str) -> Option { + let tokens: Vec<&str> = input.split_whitespace().collect(); + if tokens.len() <= 1 { + return None; + } + + // Pattern: "X und Ystel" → reconstruct number + // E.g., "zwei und zwanzigstel" → "zwei und zwanzig" → 22 + let last = *tokens.last()?; + + // Try to extract the base number from the -stel suffix + if let Some(stem) = last.strip_suffix("stel") { + // Reconstruct: everything before last token + stem + let mut num_parts: Vec<&str> = tokens[..tokens.len() - 1].to_vec(); + num_parts.push(stem); + let num_str = num_parts.join(" "); + return cardinal::words_to_number(&num_str); + } + + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_simple_fractions() { + assert_eq!(parse("ein halb"), Some("1/2".to_string())); + assert_eq!(parse("ein drittel"), Some("1/3".to_string())); + assert_eq!(parse("ein viertel"), Some("1/4".to_string())); + assert_eq!(parse("zwei neuntel"), Some("2/9".to_string())); + } + + #[test] + fn test_mixed() { + assert_eq!(parse("ein ein halb"), Some("1 1/2".to_string())); + } + + #[test] + fn test_compound_denom() { + assert_eq!(parse("ein zwei und zwanzigstel"), Some("1/22".to_string())); + } + + #[test] + fn test_negative() { + assert_eq!( + parse("minus ein zwei und zwanzigstel"), + Some("-1/22".to_string()) + ); + } + + #[test] + fn test_null() { + assert_eq!(parse("null nulltel"), Some("0/0".to_string())); + } +} diff --git a/src/asr/de/measure.rs b/src/asr/de/measure.rs new file mode 100644 index 0000000..333149d --- /dev/null +++ b/src/asr/de/measure.rs @@ -0,0 +1,253 @@ +//! Measure tagger for German. +//! +//! Converts spoken German measurements to written form: +//! - "zwei hundert kilometer pro stunde" → "200 km/h" +//! - "minus sechs und sechzig kilogramm" → "-66 kg" +//! - "eins komma eins zentimeter" → "1,1 cm" +//! 
/// Unit definition.
///
/// NOTE(review): no function in this module reads this struct; the parsers
/// work on the tuple tables below. Kept for compatibility.
struct Unit {
    spoken: &'static [&'static str],
    symbol: &'static str,
}

/// Units whose spoken form itself contains "pro": (spoken form, written symbol).
const COMPOUND_UNITS: &[(&str, &str)] = &[
    ("kilometer pro stunde", "km/h"),
    ("meter pro sekunde", "m/s"),
];

/// Area/volume units: (spoken form, written symbol, reserved suffix).
///
/// The third field is always empty and is ignored by every consumer.
const MODIFIER_UNITS: &[(&str, &str, &str)] = &[
    ("quadrat kilometer", "km²", ""),
    ("quadrat meter", "m²", ""),
    ("kubik zentimeter", "cm³", ""),
    ("kubik meter", "m³", ""),
];

/// Plain units: (spoken form, written symbol).
///
/// Units are matched as suffixes of the input and the first hit wins, so the
/// table is sorted by descending length: "kilometer" must be seen before
/// "meter", "kilogramm" before "gramm", and so on.
const SIMPLE_UNITS: &[(&str, &str)] = &[
    ("kilowattstunden", "kwh"),
    ("kilowattstunde", "kwh"),
    ("mikrometer", "μm"),
    ("millimeter", "mm"),
    ("zentimeter", "cm"),
    ("milliliter", "ml"),
    ("milligramm", "mg"),
    ("kilometer", "km"),
    ("millivolt", "mv"),
    ("kilogramm", "kg"),
    ("sekunden", "s"),
    ("minuten", "min"),
    ("stunden", "h"),
    ("sekunde", "s"),
    ("tonnen", "t"),
    ("stunde", "h"),
    ("minute", "min"),
    ("meter", "m"),
    ("gramm", "g"),
    ("tonne", "t"),
    ("liter", "l"),
    ("hertz", "hz"),
    ("volt", "v"),
    ("watt", "w"),
    ("fuss", "ft"),
    ("fuß", "ft"),
];
+pub fn parse(input: &str) -> Option { + let input_lower = input.to_lowercase(); + let input_trim = input_lower.trim(); + + // Try "pro MODIFIER UNIT" pattern: "X pro quadrat kilometer" + if let Some(result) = parse_per_unit(input_trim) { + return Some(result); + } + + // Try compound units: "X kilometer pro stunde" + if let Some(result) = parse_compound_unit(input_trim) { + return Some(result); + } + + // Try modifier units: "X quadrat kilometer", "X kubik meter" + if let Some(result) = parse_modifier_unit(input_trim) { + return Some(result); + } + + // Try simple unit + if let Some(result) = parse_simple_unit(input_trim) { + return Some(result); + } + + None +} + +/// Parse "X pro MODIFIER UNIT" → "X /UNIT²" +fn parse_per_unit(input: &str) -> Option { + if !input.contains(" pro ") { + return None; + } + + let parts: Vec<&str> = input.splitn(2, " pro ").collect(); + if parts.len() != 2 { + return None; + } + + let num_part = parts[0].trim(); + let unit_part = parts[1].trim(); + + // Parse unit with modifier + let unit_symbol = parse_unit_symbol(unit_part)?; + let num_value = parse_number_value(num_part)?; + + Some(format!("{} /{}", num_value, unit_symbol)) +} + +/// Parse compound unit: "X kilometer pro stunde" +fn parse_compound_unit(input: &str) -> Option { + for &(spoken, symbol) in COMPOUND_UNITS { + if input.ends_with(spoken) { + let num_part = input[..input.len() - spoken.len()].trim(); + let num_value = parse_number_value(num_part)?; + return Some(format!("{} {}", num_value, symbol)); + } + } + None +} + +/// Parse modifier unit: "X quadrat kilometer", "X kubik meter" +fn parse_modifier_unit(input: &str) -> Option { + for &(spoken, symbol, _) in MODIFIER_UNITS { + if input.ends_with(spoken) { + let num_part = input[..input.len() - spoken.len()].trim(); + let num_value = parse_number_value(num_part)?; + return Some(format!("{} {}", num_value, symbol)); + } + } + None +} + +/// Parse simple unit: "X kilogramm" +fn parse_simple_unit(input: &str) -> Option { + 
// Handle negative + let (is_negative, rest) = if input.starts_with("minus ") { + (true, input.strip_prefix("minus ")?) + } else { + (false, input) + }; + + for &(spoken, symbol) in SIMPLE_UNITS { + if rest.ends_with(spoken) { + let num_part = rest[..rest.len() - spoken.len()].trim(); + let sign = if is_negative { "-" } else { "" }; + let num_value = parse_number_value(num_part)?; + return Some(format!("{}{} {}", sign, num_value, symbol)); + } + } + + None +} + +/// Parse unit symbol from spoken form +fn parse_unit_symbol(input: &str) -> Option { + // Check modifier units + for &(spoken, symbol, _) in MODIFIER_UNITS { + if input == spoken { + return Some(symbol.to_string()); + } + } + + // Check simple units + for &(spoken, symbol) in SIMPLE_UNITS { + if input == spoken { + return Some(symbol.to_string()); + } + } + + None +} + +/// Parse number value - handles cardinal, decimal, fraction, and scale +fn parse_number_value(input: &str) -> Option { + if input.is_empty() { + return None; + } + + // Check for fraction: "ein halb", "ein ein halb" + if let Some(frac_result) = fraction::parse(input) { + return Some(frac_result); + } + + // Check for decimal: contains "komma" + if input.contains("komma") { + return decimal::parse(input); + } + + // Check for scale: "eine million", "neunzig millionen" + let scale_words = ["millionen", "million", "milliarden", "milliarde"]; + for &sw in &scale_words { + if input.ends_with(sw) { + let num_part = input[..input.len() - sw.len()].trim(); + if let Some(num) = cardinal::words_to_number(num_part) { + return Some(format!("{} {}", num, sw)); + } + } + } + + // Cardinal number + if let Some(num) = cardinal::words_to_number(input) { + return Some(num.to_string()); + } + + // Try as "null" → 0 + if input == "null" { + return Some("0".to_string()); + } + + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_simple() { + assert_eq!(parse("zwei hundert meter"), Some("200 m".to_string())); + assert_eq!(parse("neunzig 
gramm"), Some("90 g".to_string())); + } + + #[test] + fn test_compound() { + assert_eq!( + parse("zwei hundert kilometer pro stunde"), + Some("200 km/h".to_string()) + ); + } + + #[test] + fn test_negative() { + assert_eq!( + parse("minus sechs und sechzig kilogramm"), + Some("-66 kg".to_string()) + ); + } + + #[test] + fn test_per_unit() { + assert_eq!( + parse("sechs und fünfzig komma drei pro quadrat kilometer"), + Some("56,3 /km²".to_string()) + ); + } + + #[test] + fn test_fraction_measure() { + assert_eq!(parse("ein halb fuß"), Some("1/2 ft".to_string())); + } +} diff --git a/src/asr/de/mod.rs b/src/asr/de/mod.rs new file mode 100644 index 0000000..df8cead --- /dev/null +++ b/src/asr/de/mod.rs @@ -0,0 +1,20 @@ +//! Inverse Text Normalization taggers for German. +//! +//! Converts spoken-form German to written form: +//! - "einhundert" → "100" +//! - "zwei euro und zwanzig cent" → "€2,20" +//! - "vierundzwanzigster juli zwei tausend dreizehn" → "24. Jul. 2013" + +pub mod cardinal; +pub mod date; +pub mod decimal; +pub mod electronic; +pub mod fraction; +pub mod measure; +pub mod money; +pub mod ordinal; +pub mod punctuation; +pub mod telephone; +pub mod time; +pub mod whitelist; +pub mod word; diff --git a/src/asr/de/money.rs b/src/asr/de/money.rs new file mode 100644 index 0000000..2687b32 --- /dev/null +++ b/src/asr/de/money.rs @@ -0,0 +1,408 @@ +//! Money tagger for German. +//! +//! Converts spoken German currency expressions to written form: +//! - "zwei dollar" → "$2" +//! - "zwei euro und zwanzig cent" → "€2,20" +//! - "zwei pfund und ein penny" → "£2,01" +//! 
- "zwei millionen euro" → "€2 millionen" + +use super::cardinal; + +struct Currency { + names: &'static [&'static str], + symbol: &'static str, + prefix: bool, // true = $X, false = X € + cent_names: &'static [&'static str], + cent_singular: &'static str, +} + +const CURRENCIES: &[Currency] = &[ + Currency { + names: &["dollar", "dollars"], + symbol: "$", + prefix: true, + cent_names: &["cent", "cents"], + cent_singular: "cent", + }, + Currency { + names: &["euro", "euros"], + symbol: "€", + prefix: false, + cent_names: &["cent", "cents"], + cent_singular: "cent", + }, + Currency { + names: &["pfund"], + symbol: "£", + prefix: false, + cent_names: &["pence", "penny"], + cent_singular: "penny", + }, +]; + +/// Parse spoken German money expression to written form. +pub fn parse(input: &str) -> Option { + let input_lower = input.to_lowercase(); + let input_trim = input_lower.trim(); + + // Reject compound words without spaces (e.g., "zweidollarzwanzig") + if !input_trim.contains(' ') { + // Allow single-word currency amounts that are just "X dollar" etc. 
+ // Actually for no-space cases like "zweidollarzwanzig", reject + for cur in CURRENCIES { + for &name in cur.names { + if input_trim.contains(name) && input_trim != name { + return None; + } + } + } + return None; + } + + // Try scale patterns first: "zwei millionen euro" → "€2 millionen" + if let Some(result) = parse_scale_money(input_trim) { + return Some(result); + } + + // Try decimal money: "zwei komma null null dollar" → "$2,00" + if let Some(result) = parse_decimal_money(input_trim) { + return Some(result); + } + + // Try "X CURRENCY und Y SUBCURRENCY" pattern + if let Some(result) = parse_with_subcurrency(input_trim) { + return Some(result); + } + + // Try "X CURRENCY Y" (implied cents) + if let Some(result) = parse_implied_cents(input_trim) { + return Some(result); + } + + // Try simple "X CURRENCY" + if let Some(result) = parse_simple_money(input_trim) { + return Some(result); + } + + // Try cent-only: "ein cent" → "€0,01" + if let Some(result) = parse_cents_only(input_trim) { + return Some(result); + } + + None +} + +/// Parse scale money: "zwei millionen euro" → "€2 millionen" +fn parse_scale_money(input: &str) -> Option { + let scale_words = [ + "millionen", + "million", + "milliarden", + "milliarde", + "billionen", + "billion", + ]; + + for cur in CURRENCIES { + for &cur_name in cur.names { + if input.ends_with(cur_name) { + let before = input[..input.len() - cur_name.len()].trim(); + // Check if before contains a scale word + for &sw in &scale_words { + if before.ends_with(sw) { + let num_part = before[..before.len() - sw.len()].trim(); + + // Check for "komma" (decimal scale money) + if num_part.contains("komma") { + let parts: Vec<&str> = num_part.splitn(2, "komma").collect(); + if parts.len() == 2 { + let int_part = parts[0].trim(); + let dec_part = parts[1].trim(); + let int_val = cardinal::words_to_number(int_part)?; + let dec_digits = parse_decimal_digits(dec_part)?; + return Some(format!( + "{}{}", + format_with_symbol( + cur, + 
&format!("{},{} {}", int_val, dec_digits, sw) + ), + "" + )); + } + } + + let num = cardinal::words_to_number(num_part)?; + return Some(format_with_symbol(cur, &format!("{} {}", num, sw))); + } + } + } + } + } + + None +} + +/// Parse decimal money: "zwei komma null null dollar" → "$2,00" +fn parse_decimal_money(input: &str) -> Option { + if !input.contains("komma") { + return None; + } + + for cur in CURRENCIES { + for &cur_name in cur.names { + if input.ends_with(cur_name) { + let before = input[..input.len() - cur_name.len()].trim(); + let parts: Vec<&str> = before.splitn(2, "komma").collect(); + if parts.len() != 2 { + continue; + } + let int_part = parts[0].trim(); + let dec_part = parts[1].trim(); + + let int_val = cardinal::words_to_number(int_part)?; + let dec_digits = parse_decimal_digits(dec_part)?; + + return Some(format_with_symbol( + cur, + &format!("{},{}", int_val, dec_digits), + )); + } + } + } + + None +} + +/// Parse with subcurrency: "zwei euro und zwanzig cent" → "€2,20" +fn parse_with_subcurrency(input: &str) -> Option { + for cur in CURRENCIES { + for ¢_name in cur.cent_names { + // "X CURRENCY und Y SUBCURRENCY" + if input.ends_with(cent_name) { + let before_cent = input[..input.len() - cent_name.len()].trim(); + // Check for "und" separator + if let Some(und_pos) = before_cent.rfind(" und ") { + let cent_part = before_cent[und_pos + 5..].trim(); + let main_part = before_cent[..und_pos].trim(); + + // Parse cent amount + let cent_val = cardinal::words_to_number(cent_part)?; + + // Check if cents >= 100 (special case) + if cent_val >= 100 { + // "zwei pfund und ein hundert penny" → "£2 und 100 penny" + for &cur_name in cur.names { + if main_part.ends_with(cur_name) { + let num_part = main_part[..main_part.len() - cur_name.len()].trim(); + let main_val = cardinal::words_to_number(num_part)?; + return Some(format_with_symbol( + cur, + &format!("{} und {} {}", main_val, cent_val, cent_name), + )); + } + } + continue; + } + + // Find main currency 
+ for &cur_name in cur.names { + if main_part.ends_with(cur_name) { + let num_part = main_part[..main_part.len() - cur_name.len()].trim(); + let main_val = cardinal::words_to_number(num_part)?; + return Some(format_with_symbol( + cur, + &format!("{},{:02}", main_val, cent_val), + )); + } + } + } + + // "X CURRENCY Y SUBCURRENCY" (without "und") + for &cur_name in cur.names { + let pattern = format!("{} ", cur_name); + if let Some(pos) = before_cent.find(&pattern) { + let num_part = before_cent[..pos].trim(); + let cent_str = before_cent[pos + pattern.len()..].trim(); + + let main_val = cardinal::words_to_number(num_part)?; + let cent_val = cardinal::words_to_number(cent_str)?; + + return Some(format_with_symbol( + cur, + &format!("{},{:02}", main_val, cent_val), + )); + } + } + } + } + } + + None +} + +/// Parse implied cents: "zwei dollar zwanzig" → "$2,20" +fn parse_implied_cents(input: &str) -> Option { + for cur in CURRENCIES { + for &cur_name in cur.names { + let pattern = format!(" {} ", cur_name); + if let Some(pos) = input.find(&pattern) { + let num_part = &input[..pos]; + let cent_part = &input[pos + pattern.len()..]; + + let main_val = cardinal::words_to_number(num_part)?; + let cent_val = cardinal::words_to_number(cent_part)?; + + return Some(format_with_symbol( + cur, + &format!("{},{:02}", main_val, cent_val), + )); + } + } + } + + None +} + +/// Parse simple money: "zwei dollar" → "$2" +fn parse_simple_money(input: &str) -> Option { + for cur in CURRENCIES { + for &cur_name in cur.names { + if input.ends_with(cur_name) { + let num_part = input[..input.len() - cur_name.len()].trim(); + if num_part.is_empty() { + continue; + } + let num = cardinal::words_to_number(num_part)?; + return Some(format_with_symbol(cur, &num.to_string())); + } + } + } + + None +} + +/// Parse cents-only: "ein cent" → "€0,01" +/// Note: bare "cent" defaults to euro (€) +fn parse_cents_only(input: &str) -> Option { + // Check each currency's cent names + // But bare "cent" 
(without matching a specific currency) defaults to euro + for ¢_name in &["cent", "cents"] { + if input.ends_with(cent_name) { + let num_part = input[..input.len() - cent_name.len()].trim(); + if num_part.is_empty() { + continue; + } + let num = cardinal::words_to_number(num_part)?; + + if num >= 100 { + // "einhundert cent" → "100 cent" + return Some(format!("{} {}", num, cent_name)); + } + + // Default to euro for bare cent + return Some(format!("€0,{:02}", num)); + } + } + + // Check for pence/penny → £ + for ¢_name in &["pence", "penny"] { + if input.ends_with(cent_name) { + let num_part = input[..input.len() - cent_name.len()].trim(); + if num_part.is_empty() { + continue; + } + let num = cardinal::words_to_number(num_part)?; + + if num >= 100 { + return Some(format!("{} {}", num, cent_name)); + } + + return Some(format!("£0,{:02}", num)); + } + } + + None +} + +/// Format amount with currency symbol +fn format_with_symbol(cur: &Currency, amount: &str) -> String { + // German ITN convention: symbol always prefixes the amount + format!("{}{}", cur.symbol, amount) +} + +/// Parse decimal digit words: "null null" → "00", "null eins" → "01" +fn parse_decimal_digits(input: &str) -> Option { + let digit_map = [ + ("null", "0"), + ("eins", "1"), + ("ein", "1"), + ("zwei", "2"), + ("drei", "3"), + ("vier", "4"), + ("fünf", "5"), + ("sechs", "6"), + ("sieben", "7"), + ("acht", "8"), + ("neun", "9"), + ]; + + let tokens: Vec<&str> = input.split_whitespace().collect(); + if tokens.is_empty() { + return None; + } + + let mut result = String::new(); + for token in &tokens { + let mut found = false; + for &(word, digit) in &digit_map { + if token == &word { + result.push_str(digit); + found = true; + break; + } + } + if !found { + return None; + } + } + + Some(result) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_simple() { + assert_eq!(parse("zwei dollar"), Some("$2".to_string())); + assert_eq!(parse("ein dollar"), Some("$1".to_string())); + } + + 
#[test] + fn test_with_cents() { + assert_eq!( + parse("zwei euro und zwanzig cent"), + Some("€2,20".to_string()) + ); + assert_eq!( + parse("zwei dollar und zwanzig cent"), + Some("$2,20".to_string()) + ); + } + + #[test] + fn test_cents_only() { + assert_eq!(parse("ein cent"), Some("€0,01".to_string())); + assert_eq!(parse("zwanzig cent"), Some("€0,20".to_string())); + } + + #[test] + fn test_scale() { + assert_eq!(parse("eine million dollar"), Some("$1 million".to_string())); + assert_eq!( + parse("zwei millionen euro"), + Some("€2 millionen".to_string()) + ); + } +} diff --git a/src/asr/de/ordinal.rs b/src/asr/de/ordinal.rs new file mode 100644 index 0000000..4a717b1 --- /dev/null +++ b/src/asr/de/ordinal.rs @@ -0,0 +1,134 @@ +//! Ordinal number tagger for German. +//! +//! Converts spoken German ordinal words to written form: +//! - "ein hundertste" → "100." +//! - "erster" → "erster" (pass-through for small ordinals) +//! - "dem ein tausendstem" → "dem 1000." + +use super::cardinal; + +/// Small ordinals that pass through as words (1-9) +const SMALL_ORDINALS: &[&str] = &[ + "nullte", "nullter", "nulltem", "nulltes", "erste", "erster", "erstem", "erstes", "zweite", + "zweiter", "zweitem", "zweites", "dritte", "dritter", "drittem", "drittes", "vierte", + "vierter", "viertem", "viertes", "fünfte", "fünfter", "fünftem", "fünftes", "sechste", + "sechster", "sechstem", "sechstes", "siebte", "siebter", "siebtem", "siebtes", "achte", + "achter", "achtem", "achtes", "neunte", "neunter", "neuntem", "neuntes", +]; + +/// Parse spoken German ordinal to written form. +pub fn parse(input: &str) -> Option { + let input_lower = input.to_lowercase(); + let input_trim = input_lower.trim(); + + // Pass-through small ordinals + if SMALL_ORDINALS.contains(&input_trim) { + return Some(input_trim.to_string()); + } + + // Check for prefix words: "dem ein tausendstem" → "dem 1000." 
+ let (prefix, ordinal_part) = extract_prefix(input_trim); + + // Try to parse the ordinal + if let Some(num) = parse_ordinal_number(ordinal_part) { + if let Some(p) = prefix { + return Some(format!("{} {}.", p, num)); + } + return Some(format!("{}.", num)); + } + + None +} + +/// Extract prefix words (like "dem") from ordinal expression +fn extract_prefix(input: &str) -> (Option<&str>, &str) { + let prefixes = [ + "dem ", "der ", "des ", "die ", "das ", "den ", "am ", "im ", "vom ", "zum ", "beim ", + ]; + + for prefix in &prefixes { + if input.starts_with(prefix) { + let rest = &input[prefix.len()..]; + let p = input[..prefix.len() - 1].trim(); + return (Some(p), rest); + } + } + + (None, input) +} + +/// Parse ordinal number from German ordinal word. +/// Returns the cardinal number if >= 10, None for small numbers. +fn parse_ordinal_number(input: &str) -> Option { + // Strip ordinal suffix + let ordinal_suffixes = ["stem", "stes", "ster", "ste", "tem", "tes", "ter", "te"]; + + for &suffix in &ordinal_suffixes { + if input.ends_with(suffix) { + let stem = &input[..input.len() - suffix.len()]; + let cardinal = reconstruct_cardinal(stem); + if let Some(num) = cardinal::words_to_number(&cardinal) { + if num >= 10 { + return Some(num); + } + } + } + } + + None +} + +/// Reconstruct cardinal form from ordinal stem. 
/// Turn an ordinal stem (the ordinal word minus its ending) back into the
/// cardinal number words.
///
/// Most stems already are the cardinal ("zwanzig", "ein hundert"). A few
/// ordinals are irregular and leave a shortened stem, which is repaired here:
///
/// - "er"/"ers" (from "erste") → "eins"
/// - "drit" (from "dritte") → "drei"
/// - "sieb" (from "siebte") → "sieben"
/// - "ach" (from "achte") → "acht"
///
/// The repair is applied to the end of the stem, so compound ordinals such as
/// "ein hundert drit" (from "ein hundert dritte") become "ein hundert drei".
fn reconstruct_cardinal(stem: &str) -> String {
    // Tried in order; "ers" must come before "er".
    const IRREGULAR_ENDINGS: [(&str, &str); 5] = [
        ("ers", "eins"),
        ("er", "eins"),
        ("drit", "drei"),
        ("sieb", "sieben"),
        ("ach", "acht"),
    ];

    // "vier" ends in "er" but is already a regular cardinal.
    if stem.ends_with("vier") {
        return stem.to_string();
    }

    for &(irregular, regular) in &IRREGULAR_ENDINGS {
        if let Some(head) = stem.strip_suffix(irregular) {
            return format!("{}{}", head, regular);
        }
    }

    stem.to_string()
}
{ + /// Punctuation mappings (longer patterns first) + static ref PUNCTUATION: Vec<(&'static str, &'static str)> = vec![ + ("fragezeichen", "?"), + ("ausrufezeichen", "!"), + ("doppelpunkt", ":"), + ("semikolon", ";"), + ("punkt", "."), + ("komma", ","), + ("bindestrich", "-"), + ("gedankenstrich", "—"), + ("anführungszeichen", "\""), + ]; +} + +/// Parse spoken German punctuation to symbol. +pub fn parse(input: &str) -> Option { + let input_lower = input.to_lowercase(); + let input_trim = input_lower.trim(); + + for &(spoken, symbol) in PUNCTUATION.iter() { + if input_trim == spoken { + return Some(symbol.to_string()); + } + } + + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_punctuation() { + assert_eq!(parse("punkt"), Some(".".to_string())); + assert_eq!(parse("komma"), Some(",".to_string())); + assert_eq!(parse("fragezeichen"), Some("?".to_string())); + } +} diff --git a/src/asr/de/telephone.rs b/src/asr/de/telephone.rs new file mode 100644 index 0000000..62fecb1 --- /dev/null +++ b/src/asr/de/telephone.rs @@ -0,0 +1,85 @@ +//! Telephone tagger for German. +//! +//! Converts spoken German phone number to written form: +//! - "null vier eins eins eins zwei drei vier eins zwei drei vier" → "(0411) 1234-1234" + +use super::cardinal; + +/// Parse spoken German telephone number to written form. 
/// Parse a spoken German telephone number (a sequence of digit words) into
/// its written form.
///
/// Every whitespace-separated token must be a digit word and there must be at
/// least seven of them, otherwise `None` is returned.
///
/// Formatting:
/// - 10 or more digits: the first four are the area code, the rest is split
///   in half — "(0411) 1234-1234".
/// - 7 to 9 digits: the digits are split in half — "123-4567".
pub fn parse(input: &str) -> Option<String> {
    const MIN_DIGITS: usize = 7;
    const AREA_CODE_DIGITS: usize = 4;
    const MIN_DIGITS_WITH_AREA_CODE: usize = 10;

    let lowered = input.to_lowercase();

    // One digit per token; any non-digit word rejects the whole input.
    let digits = lowered
        .split_whitespace()
        .map(word_to_digit)
        .collect::<Option<String>>()?;

    if digits.len() < MIN_DIGITS {
        return None;
    }

    if digits.len() >= MIN_DIGITS_WITH_AREA_CODE {
        let (area, rest) = digits.split_at(AREA_CODE_DIGITS);
        let (first, second) = rest.split_at(rest.len() / 2);
        return Some(format!("({}) {}-{}", area, first, second));
    }

    let (first, second) = digits.split_at(digits.len() / 2);
    Some(format!("{}-{}", first, second))
}

/// Convert a German digit word to its digit.
///
/// Besides the standard words this accepts "ein"/"eine" for 1 and "zwo", the
/// variant of "zwei" commonly used when dictating numbers.
fn word_to_digit(word: &str) -> Option<&'static str> {
    let digit = match word {
        "null" => "0",
        "eins" | "ein" | "eine" => "1",
        "zwei" | "zwo" => "2",
        "drei" => "3",
        "vier" => "4",
        "fünf" => "5",
        "sechs" => "6",
        "sieben" => "7",
        "acht" => "8",
        "neun" => "9",
        _ => return None,
    };
    Some(digit)
}
+1,408 @@ +//! Time tagger for German. +//! +//! Converts spoken German time expressions to written form: +//! - "acht uhr" → "8 Uhr" +//! - "acht uhr sieben" → "08:07 Uhr" +//! - "halb zwölf" → "11:30 Uhr" +//! - "viertel vor zwölf" → "11:45 Uhr" +//! - "null uhr null minuten null sekunden" → "00:00:00 Uhr" + +use super::cardinal; + +/// Parse spoken German time expression to written form. +pub fn parse(input: &str) -> Option { + let input_lower = input.to_lowercase(); + let input_trim = input_lower.trim(); + + // Try HH:MM:SS pattern: "X uhr Y minuten Z sekunden" + if let Some(result) = parse_hms(input_trim) { + return Some(result); + } + + // Try "halb X" pattern (half past = X-1:30) + if let Some(result) = parse_halb(input_trim) { + return Some(result); + } + + // Try "viertel vor/nach X" patterns + if let Some(result) = parse_viertel(input_trim) { + return Some(result); + } + + // Try "N vor X" / "N nach X" patterns + if let Some(result) = parse_vor_nach(input_trim) { + return Some(result); + } + + // Try compound "Xuhr" patterns (no space before uhr) + if let Some(result) = parse_compound_uhr(input_trim) { + return Some(result); + } + + // Try standard "X uhr [Y]" pattern + if let Some(result) = parse_standard_uhr(input_trim) { + return Some(result); + } + + None +} + +/// Parse HH:MM:SS: "null uhr null minuten null sekunden" → "00:00:00 Uhr" +fn parse_hms(input: &str) -> Option { + if !input.contains("minuten") && !input.contains("minute") { + return None; + } + if !input.contains("sekunden") && !input.contains("sekunde") { + return None; + } + + // Extract timezone at end + let (time_part, tz) = extract_timezone(input); + + // Split by "uhr", "minuten/minute", "sekunden/sekunde" + let uhr_pos = time_part.find(" uhr ")?; + let hour_str = &time_part[..uhr_pos]; + + let after_uhr = &time_part[uhr_pos + 5..]; + + // Find minuten/minute + let min_end = if let Some(p) = after_uhr.find(" minuten") { + p + } else if let Some(p) = after_uhr.find(" minute") { + p + } 
else { + return None; + }; + + let min_str = after_uhr[..min_end].trim(); + let after_min_keyword = if after_uhr[min_end..].starts_with(" minuten ") { + &after_uhr[min_end + 9..] + } else if after_uhr[min_end..].starts_with(" minute ") { + &after_uhr[min_end + 8..] + } else { + return None; + }; + + // Find sekunden/sekunde + let sec_end = if let Some(p) = after_min_keyword.find(" sekunden") { + p + } else if let Some(p) = after_min_keyword.find(" sekunde") { + p + } else { + after_min_keyword.len() + }; + + let sec_str = after_min_keyword[..sec_end].trim(); + + let hour = parse_time_number(hour_str)?; + let min = parse_time_number(min_str)?; + let sec = parse_time_number(sec_str)?; + + let result = format!("{:02}:{:02}:{:02} Uhr", hour, min, sec); + if let Some(tz_str) = tz { + Some(format!("{} {}", result, tz_str)) + } else { + Some(result) + } +} + +/// Parse "halb X" → "(X-1):30 Uhr" +fn parse_halb(input: &str) -> Option { + if !input.starts_with("halb ") { + return None; + } + let rest = input.strip_prefix("halb ")?; + let hour = cardinal::words_to_number(rest)? as i64; + let actual_hour = (hour - 1 + 24) % 24; + Some(format!("{:02}:{:02} Uhr", actual_hour, 30)) +} + +/// Parse "viertel vor/nach X" +fn parse_viertel(input: &str) -> Option { + if input.starts_with("viertel vor ") { + let rest = input.strip_prefix("viertel vor ")?; + let (hour_part, modifier) = extract_time_modifier(rest); + let hour = cardinal::words_to_number(hour_part.trim())? as i64; + let actual_hour = (hour - 1 + 24) % 24; + let result = format!("{:02}:{:02} Uhr", actual_hour, 45); + return Some(append_modifier(&result, modifier)); + } + if input.starts_with("viertel nach ") { + let rest = input.strip_prefix("viertel nach ")?; + let (hour_part, modifier) = extract_time_modifier(rest); + let (time_part, tz) = extract_timezone(hour_part.trim()); + let hour = cardinal::words_to_number(time_part.trim())? 
as i64; + let result = format!("{:02}:{:02} Uhr", hour, 15); + let result = append_modifier(&result, modifier); + if let Some(tz_str) = tz { + return Some(format!("{} {}", result, tz_str)); + } + return Some(result); + } + None +} + +/// Extract time modifier (nachts, mittags, morgens, abends) from end +fn extract_time_modifier(input: &str) -> (&str, Option<&str>) { + let modifiers = ["nachts", "mittags", "morgens", "abends"]; + for &m in &modifiers { + if input.ends_with(m) { + let before = input[..input.len() - m.len()].trim(); + return (before, Some(m)); + } + } + (input, None) +} + +fn append_modifier(base: &str, modifier: Option<&str>) -> String { + if let Some(m) = modifier { + format!("{} {}", base, m) + } else { + base.to_string() + } +} + +/// Parse "N vor X" / "N nach X" +fn parse_vor_nach(input: &str) -> Option { + // "drei vor zwölf" → "11:57 Uhr" + if let Some(pos) = input.find(" vor ") { + let min_str = &input[..pos]; + let hour_str = &input[pos + 5..]; + let minutes = cardinal::words_to_number(min_str)? as i64; + let hour = cardinal::words_to_number(hour_str)? as i64; + let actual_hour = (hour - 1 + 24) % 24; + let actual_min = 60 - minutes; + return Some(format!("{:02}:{:02} Uhr", actual_hour, actual_min)); + } + + // "drei nach zwölf" → "12:03 Uhr" + if let Some(pos) = input.find(" nach ") { + let min_str = &input[..pos]; + let hour_str = &input[pos + 6..]; + let (time_part, tz) = extract_timezone(hour_str); + let minutes = cardinal::words_to_number(min_str)? as i64; + let hour = cardinal::words_to_number(time_part.trim())? 
as i64; + let result = format!("{:02}:{:02} Uhr", hour, minutes); + if let Some(tz_str) = tz { + return Some(format!("{} {}", result, tz_str)); + } + return Some(result); + } + + None +} + +/// Parse compound "Xuhr" pattern (no space before uhr) +/// "vierundzwanziguhr" → "24 Uhr" +/// "vierundzwanziguhrzweiundzwanzig" → "24:22 Uhr" +/// "vierundzwanziguhrzweiundzwanzigest" → "24:22 Uhr est" +/// "vierundzwanziguhrzweiundzwanzig e s t" → "24:22 Uhr est" +fn parse_compound_uhr(input: &str) -> Option { + // Extract timezone first (space-separated letters at end) + let (main_part, tz) = extract_timezone(input); + + // Look for "uhr" embedded in the string (not as a separate word) + if main_part.contains(" uhr") || !main_part.contains("uhr") { + return None; + } + + let uhr_pos = main_part.find("uhr")?; + let hour_str = &main_part[..uhr_pos]; + let after_uhr = &main_part[uhr_pos + 3..]; + + let hour = cardinal::words_to_number(hour_str)? as i64; + + if after_uhr.is_empty() { + let result = format!("{} Uhr", hour); + if let Some(tz_str) = tz { + return Some(format!("{} {}", result, tz_str)); + } + return Some(result); + } + + // Try to parse minutes, potentially with appended timezone + // "zweiundzwanzigest" → try "zweiundzwanzig" + "est" + if let Some(minutes) = cardinal::words_to_number(after_uhr) { + let minutes = minutes as i64; + if minutes <= 59 { + let result = format!("{:02}:{:02} Uhr", hour, minutes); + if let Some(tz_str) = tz { + return Some(format!("{} {}", result, tz_str)); + } + return Some(result); + } + } + + // Try stripping common timezone suffixes from the end of after_uhr + let tz_suffixes = ["est", "pst", "cst", "mst", "cet", "gmt", "utc"]; + for &tz_suffix in &tz_suffixes { + if after_uhr.ends_with(tz_suffix) { + let min_str = &after_uhr[..after_uhr.len() - tz_suffix.len()]; + if let Some(minutes) = cardinal::words_to_number(min_str) { + let minutes = minutes as i64; + if minutes <= 59 { + return Some(format!("{:02}:{:02} Uhr {}", hour, minutes, 
tz_suffix)); + } + } + } + } + + None +} + +/// Parse standard "X uhr [Y]" pattern +fn parse_standard_uhr(input: &str) -> Option { + let (time_part, tz) = extract_timezone(input); + + // Check for modifier: "mittags", "nachts" + let modifiers = ["mittags", "nachts", "morgens", "abends"]; + let mut modifier = None; + let mut cleaned = time_part.to_string(); + for &m in &modifiers { + if cleaned.ends_with(m) { + modifier = Some(m); + cleaned = cleaned[..cleaned.len() - m.len()].trim().to_string(); + break; + } + } + + if !cleaned.contains(" uhr") { + return None; + } + + // Split on " uhr" + let parts: Vec<&str> = cleaned.splitn(2, " uhr").collect(); + if parts.len() != 2 { + return None; + } + + let hour_str = parts[0].trim(); + let min_str = parts[1].trim(); + + let hour = parse_time_number_or_zero(hour_str)?; + + let result = if min_str.is_empty() { + format!("{} Uhr", hour) + } else { + let minutes = parse_time_number_or_zero(min_str)?; + if minutes > 59 { + return None; + } + format!("{:02}:{:02} Uhr", hour, minutes) + }; + + let result = if let Some(m) = modifier { + format!("{} {}", result, m) + } else { + result + }; + + if let Some(tz_str) = tz { + Some(format!("{} {}", result, tz_str)) + } else { + Some(result) + } +} + +/// Parse a time number (handles "null" → 0, "ein/eine" → 1, etc.) 
/// Split a spelled-out timezone (two or more trailing single letters) off a
/// spoken time phrase.
///
/// - "vierundzwanziguhrzweiundzwanzig e s t" → ("vierundzwanziguhrzweiundzwanzig", Some("est"))
/// - "acht uhr" → ("acht uhr", None)
///
/// Returns the trimmed time phrase and the letters joined into one string.
/// When fewer than two trailing single-letter tokens are present, or when no
/// time phrase would remain in front of them, the input is returned unchanged
/// with `None`.
///
/// Glued abbreviations ("...zweiundzwanzigest") are not handled here.
fn extract_timezone(input: &str) -> (&str, Option<String>) {
    let mut time_part = input.trim_end();
    let mut letters: Vec<&str> = Vec::new();

    // Peel single ASCII letters off the end, one whitespace-delimited token at
    // a time. Working on the slice itself keeps the offsets right regardless
    // of how much whitespace separates the tokens.
    while let Some(token) = time_part.split_whitespace().next_back() {
        let is_letter = token.len() == 1 && token.chars().all(|c| c.is_ascii_alphabetic());
        if !is_letter {
            break;
        }
        letters.push(token);
        // `time_part` has no trailing whitespace, so it ends exactly with `token`.
        time_part = time_part[..time_part.len() - token.len()].trim_end();
    }

    // An input made up only of letters ("e s t") has no time phrase to attach
    // a zone to; the previous length arithmetic underflowed on it.
    if letters.len() < 2 || time_part.is_empty() {
        return (input, None);
    }

    // Letters were collected back to front.
    let tz: String = letters.iter().rev().copied().collect();
    (time_part.trim_start(), Some(tz))
}
+/// Supports prefix matching: "doktor dao" → "Dr. dao" +/// and middle-of-sentence: "ich mag essen zum beispiel eis" → "ich mag essen z.B. eis" +/// and suffix matching: "Chanel nummer fünf" → "Chanel Nr. fünf" +pub fn parse(input: &str) -> Option { + let input_trim = input.trim(); + let input_lower = input_trim.to_lowercase(); + + for &(spoken, abbreviated) in WHITELIST.iter() { + // Exact match (case-insensitive) + if input_lower == spoken { + return Some(abbreviated.to_string()); + } + + // Prefix match: "doktor dao" → "Dr. dao" + if input_lower.starts_with(spoken) { + let after = &input_lower[spoken.len()..]; + if after.starts_with(' ') { + // Use original case for the rest + let rest = &input_trim[spoken.len()..].trim_start(); + return Some(format!("{} {}", abbreviated, rest)); + } + } + + // Middle match: find spoken phrase with word boundaries in the middle + let pattern = format!(" {} ", spoken); + if let Some(pos) = input_lower.find(&pattern) { + // Use original case for before/after + let before = &input_trim[..pos]; + let after = &input_trim[pos + pattern.len()..]; + if after.is_empty() { + return Some(format!("{} {}", before, abbreviated)); + } else { + return Some(format!("{} {} {}", before, abbreviated, after)); + } + } + + // End match with rest after: " spoken rest_word" + // E.g., "Chanel nummer fünf" → pattern " nummer " found as middle match above + // But also handle: "X spoken" at end + let end_pattern = format!(" {}", spoken); + if input_lower.ends_with(&end_pattern) { + let before = &input_trim[..input_trim.len() - end_pattern.len()]; + return Some(format!("{} {}", before, abbreviated)); + } + } + + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_titles() { + assert_eq!(parse("doktor dao"), Some("Dr. dao".to_string())); + assert_eq!(parse("mister dao"), Some("Mr. dao".to_string())); + assert_eq!(parse("miss smith"), Some("Ms. smith".to_string())); + assert_eq!(parse("misses smith"), Some("Mrs. 
smith".to_string())); + } + + #[test] + fn test_phrases() { + assert_eq!(parse("zum beispiel"), Some("z.B.".to_string())); + } + + #[test] + fn test_contextual() { + assert_eq!( + parse("ich mag essen zum beispiel eis"), + Some("ich mag essen z.B. eis".to_string()) + ); + } +} diff --git a/src/asr/de/word.rs b/src/asr/de/word.rs new file mode 100644 index 0000000..b6bd339 --- /dev/null +++ b/src/asr/de/word.rs @@ -0,0 +1,14 @@ +//! Word tagger for German. +//! +//! Handles pass-through words and special cases: +//! - "yahoo!" → "yahoo!" (pass-through) +//! - "zwanzig!" → "20 !" (cardinal + punctuation) +//! - Regular words pass through unchanged + +/// Parse is not used directly - word handling is done in the normalize pipeline. +/// This module exists for symmetry with the French implementation. +pub fn parse(_input: &str) -> Option { + // Word tagger doesn't actively transform anything. + // Pass-through and "cardinal!" patterns are handled by normalize_lang_de. + None +} diff --git a/src/asr/cardinal.rs b/src/asr/en/cardinal.rs similarity index 100% rename from src/asr/cardinal.rs rename to src/asr/en/cardinal.rs diff --git a/src/asr/date.rs b/src/asr/en/date.rs similarity index 100% rename from src/asr/date.rs rename to src/asr/en/date.rs diff --git a/src/asr/decimal.rs b/src/asr/en/decimal.rs similarity index 100% rename from src/asr/decimal.rs rename to src/asr/en/decimal.rs diff --git a/src/asr/electronic.rs b/src/asr/en/electronic.rs similarity index 100% rename from src/asr/electronic.rs rename to src/asr/en/electronic.rs diff --git a/src/asr/measure.rs b/src/asr/en/measure.rs similarity index 100% rename from src/asr/measure.rs rename to src/asr/en/measure.rs diff --git a/src/asr/en/mod.rs b/src/asr/en/mod.rs new file mode 100644 index 0000000..370911e --- /dev/null +++ b/src/asr/en/mod.rs @@ -0,0 +1,19 @@ +//! Inverse Text Normalization taggers for English. +//! +//! Converts spoken-form text to written English: +//! - "two hundred" → "200" +//! 
- "five dollars and fifty cents" → "$5.50" +//! - "january fifth twenty twenty five" → "January 5, 2025" + +pub mod cardinal; +pub mod date; +pub mod decimal; +pub mod electronic; +pub mod measure; +pub mod money; +pub mod ordinal; +pub mod punctuation; +pub mod telephone; +pub mod time; +pub mod whitelist; +pub mod word; diff --git a/src/asr/money.rs b/src/asr/en/money.rs similarity index 100% rename from src/asr/money.rs rename to src/asr/en/money.rs diff --git a/src/asr/ordinal.rs b/src/asr/en/ordinal.rs similarity index 100% rename from src/asr/ordinal.rs rename to src/asr/en/ordinal.rs diff --git a/src/asr/punctuation.rs b/src/asr/en/punctuation.rs similarity index 100% rename from src/asr/punctuation.rs rename to src/asr/en/punctuation.rs diff --git a/src/asr/telephone.rs b/src/asr/en/telephone.rs similarity index 100% rename from src/asr/telephone.rs rename to src/asr/en/telephone.rs diff --git a/src/asr/time.rs b/src/asr/en/time.rs similarity index 100% rename from src/asr/time.rs rename to src/asr/en/time.rs diff --git a/src/asr/whitelist.rs b/src/asr/en/whitelist.rs similarity index 100% rename from src/asr/whitelist.rs rename to src/asr/en/whitelist.rs diff --git a/src/asr/word.rs b/src/asr/en/word.rs similarity index 100% rename from src/asr/word.rs rename to src/asr/en/word.rs diff --git a/src/asr/es/cardinal.rs b/src/asr/es/cardinal.rs new file mode 100644 index 0000000..45885e8 --- /dev/null +++ b/src/asr/es/cardinal.rs @@ -0,0 +1,434 @@ +//! Cardinal number tagger for Spanish. +//! +//! Converts spoken Spanish number words to digits: +//! - "doscientos cincuenta y uno" → "251" +//! - "un millón ciento cincuenta y seis mil" → "1156000" +//! - "menos veintitrés" → "-23" +//! - "mil millones uno" → "1000000001" + +use lazy_static::lazy_static; +use std::collections::HashMap; + +lazy_static! 
{ + /// Numbers 0-29 (including veinti- compounds) + static ref ONES: HashMap<&'static str, i64> = { + let mut m = HashMap::new(); + m.insert("cero", 0); + m.insert("uno", 1); + m.insert("un", 1); + m.insert("una", 1); + m.insert("dos", 2); + m.insert("tres", 3); + m.insert("cuatro", 4); + m.insert("cinco", 5); + m.insert("seis", 6); + m.insert("siete", 7); + m.insert("ocho", 8); + m.insert("nueve", 9); + m.insert("diez", 10); + m.insert("once", 11); + m.insert("doce", 12); + m.insert("trece", 13); + m.insert("catorce", 14); + m.insert("quince", 15); + m.insert("dieciséis", 16); + m.insert("diecisiete", 17); + m.insert("dieciocho", 18); + m.insert("diecinueve", 19); + m.insert("veinte", 20); + m.insert("veintiún", 21); + m.insert("veintiuno", 21); + m.insert("veintiuna", 21); + m.insert("veintidós", 22); + m.insert("veintitrés", 23); + m.insert("veinticuatro", 24); + m.insert("veinticinco", 25); + m.insert("veintiséis", 26); + m.insert("veintisiete", 27); + m.insert("veintiocho", 28); + m.insert("veintinueve", 29); + m + }; + + /// Tens (30-90) + static ref TENS: HashMap<&'static str, i64> = { + let mut m = HashMap::new(); + m.insert("treinta", 30); + m.insert("cuarenta", 40); + m.insert("cincuenta", 50); + m.insert("sesenta", 60); + m.insert("setenta", 70); + m.insert("ochenta", 80); + m.insert("noventa", 90); + m + }; + + /// Hundreds + static ref HUNDREDS: HashMap<&'static str, i64> = { + let mut m = HashMap::new(); + m.insert("cien", 100); + m.insert("ciento", 100); + m.insert("doscientos", 200); + m.insert("doscientas", 200); + m.insert("trescientos", 300); + m.insert("trescientas", 300); + m.insert("cuatrocientos", 400); + m.insert("cuatrocientas", 400); + m.insert("quinientos", 500); + m.insert("quinientas", 500); + m.insert("seiscientos", 600); + m.insert("seiscientas", 600); + m.insert("setecientos", 700); + m.insert("setecientas", 700); + m.insert("ochocientos", 800); + m.insert("ochocientas", 800); + m.insert("novecientos", 900); + 
m.insert("novecientas", 900); + m + }; + + /// Scale words (Spanish long scale) + static ref SCALES: HashMap<&'static str, i128> = { + let mut m = HashMap::new(); + m.insert("mil", 1_000); + m.insert("millón", 1_000_000); + m.insert("millones", 1_000_000); + m.insert("millardo", 1_000_000_000); + m.insert("millardos", 1_000_000_000); + m.insert("billón", 1_000_000_000_000); + m.insert("billones", 1_000_000_000_000); + m.insert("trillón", 1_000_000_000_000_000_000); + m.insert("trillones", 1_000_000_000_000_000_000); + m.insert("cuatrillón", 1_000_000_000_000_000_000_000_000); + m.insert("cuatrillones", 1_000_000_000_000_000_000_000_000); + m + }; + + /// Small numbers that pass through as words (0-9) + static ref PASSTHROUGH: Vec<&'static str> = vec![ + "cero", "uno", "una", "dos", "tres", "cuatro", + "cinco", "seis", "siete", "ocho", "nueve", + ]; +} + +/// Parse spoken Spanish cardinal number to string representation. +pub fn parse(input: &str) -> Option { + let input_lower = input.to_lowercase(); + let input_trim = input_lower.trim(); + + if input_trim.is_empty() { + return None; + } + + // Handle ", X" passthrough patterns + if input_trim.starts_with(", ") { + let rest = &input_trim[2..]; + if PASSTHROUGH.contains(&rest) { + return Some(format!(", {}", rest)); + } + // Try to parse as a number + if let Some(num) = words_to_number(rest) { + return Some(format!(", {}", num)); + } + return None; + } + + // Handle "entre X y Y" pattern + if input_trim.starts_with("entre ") && input_trim.contains(" y ") { + return parse_entre(input_trim); + } + + // Pass-through single small numbers (0-9) + if PASSTHROUGH.contains(&input_trim) { + return Some(input_trim.to_string()); + } + + // Don't parse space-separated sequences that look like phone digit sequences. + // Require at least one "heavy" structural word (hundreds, scales) for long inputs, + // or any structural word for shorter inputs. 
/// Hundreds and scale words: the "heavy" structure that long inputs must
/// contain to be read as one number rather than a digit sequence.
const HEAVY_STRUCTURE_WORDS: &[&str] = &[
    "cien",
    "ciento",
    "doscientos",
    "doscientas",
    "trescientos",
    "trescientas",
    "cuatrocientos",
    "cuatrocientas",
    "quinientos",
    "quinientas",
    "seiscientos",
    "seiscientas",
    "setecientos",
    "setecientas",
    "ochocientos",
    "ochocientas",
    "novecientos",
    "novecientas",
    "mil",
    "millón",
    "millones",
    "millardo",
    "millardos",
    "billón",
    "billones",
    "trillón",
    "trillones",
    "cuatrillón",
    "cuatrillones",
];

/// Lighter structural cues: connectors/prefixes plus teens, veinti- compounds
/// and tens.
const LIGHT_STRUCTURE_WORDS: &[&str] = &[
    "y",
    "menos",
    "entre",
    "diez",
    "once",
    "doce",
    "trece",
    "catorce",
    "quince",
    "dieciséis",
    "diecisiete",
    "dieciocho",
    "diecinueve",
    "veinte",
    "veintiún",
    "veintiuno",
    "veintiuna",
    "veintidós",
    "veintitrés",
    "veinticuatro",
    "veinticinco",
    "veintiséis",
    "veintisiete",
    "veintiocho",
    "veintinueve",
    "treinta",
    "cuarenta",
    "cincuenta",
    "sesenta",
    "setenta",
    "ochenta",
    "noventa",
];

/// Check if input contains structure words that indicate a compound number
/// (not just a list of digit words).
///
/// Tokens are compared whole; any heavy or light structure word counts.
fn contains_structure_word(input: &str) -> bool {
    input
        .split_whitespace()
        .any(|token| HEAVY_STRUCTURE_WORDS.contains(&token) || LIGHT_STRUCTURE_WORDS.contains(&token))
}

/// Check if input contains "heavy" structure words: hundreds or scale words.
/// These are required for longer multi-word inputs to distinguish from phone numbers.
fn contains_heavy_structure(input: &str) -> bool {
    input
        .split_whitespace()
        .any(|token| HEAVY_STRUCTURE_WORDS.contains(&token))
}
+pub fn words_to_number(input: &str) -> Option { + let input_trim = input.trim(); + if input_trim.is_empty() { + return None; + } + + // Handle "mil millones" as a special compound scale (= 10^9) + // Replace "mil millones" with a placeholder before tokenizing + let normalized = input_trim + .replace("mil trillones", "MIL_TRILLONES") + .replace("mil billones", "MIL_BILLONES") + .replace("mil millones", "MIL_MILLONES"); + + let tokens: Vec<&str> = normalized.split_whitespace().collect(); + if tokens.is_empty() { + return None; + } + + // Filter out "y" connectors (but keep the structure) + let tokens: Vec<&str> = tokens.iter().filter(|&&t| t != "y").copied().collect(); + + if tokens.is_empty() { + return None; + } + + let mut result: i128 = 0; + let mut sub: i128 = 0; // current accumulator below scale + + for &token in &tokens { + // Check for special compound scales + if token == "MIL_MILLONES" { + let multiplier = if sub == 0 { 1 } else { sub }; + result += multiplier * 1_000_000_000; + sub = 0; + continue; + } + if token == "MIL_BILLONES" { + let multiplier = if sub == 0 { 1 } else { sub }; + result += multiplier * 1_000_000_000_000_000; + sub = 0; + continue; + } + if token == "MIL_TRILLONES" { + let multiplier = if sub == 0 { 1 } else { sub }; + result += multiplier * 1_000_000_000_000_000_000_000; + sub = 0; + continue; + } + + if let Some(&scale) = SCALES.get(token) { + if scale == 1000 { + // "mil": flush sub as multiplier for thousands + if sub == 0 { + sub = 1; + } + sub *= 1000; + } else { + // millón+: flush sub as multiplier for this scale + let multiplier = if sub == 0 { 1 } else { sub }; + result += multiplier * scale; + sub = 0; + } + } else if let Some(&val) = HUNDREDS.get(token) { + sub += val as i128; + } else if let Some(&val) = ONES.get(token) { + sub += val as i128; + } else if let Some(&val) = TENS.get(token) { + sub += val as i128; + } else { + return None; // Unknown token + } + } + + result += sub; + + if result == 0 { + // Only return 0 
if input was literally "cero" + if input_trim == "cero" { + return Some(0); + } + return None; + } + + Some(result) +} + +/// Convert a single digit word to its numeric value. +/// Used by electronic and telephone taggers. +pub fn word_to_digit(word: &str) -> Option { + match word { + "cero" => Some(0), + "uno" | "un" | "una" => Some(1), + "dos" => Some(2), + "tres" => Some(3), + "cuatro" => Some(4), + "cinco" => Some(5), + "seis" => Some(6), + "siete" => Some(7), + "ocho" => Some(8), + "nueve" => Some(9), + _ => None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_passthrough() { + assert_eq!(parse("cero"), Some("cero".to_string())); + assert_eq!(parse("uno"), Some("uno".to_string())); + assert_eq!(parse("nueve"), Some("nueve".to_string())); + } + + #[test] + fn test_basic() { + assert_eq!(parse("diez"), Some("10".to_string())); + assert_eq!(parse("cien"), Some("100".to_string())); + assert_eq!(parse("doscientos cincuenta y uno"), Some("251".to_string())); + } + + #[test] + fn test_negative() { + assert_eq!(parse("menos veintitrés"), Some("-23".to_string())); + } + + #[test] + fn test_large() { + assert_eq!(parse("mil millones uno"), Some("1000000001".to_string())); + } +} diff --git a/src/asr/es/date.rs b/src/asr/es/date.rs new file mode 100644 index 0000000..7e9c8b7 --- /dev/null +++ b/src/asr/es/date.rs @@ -0,0 +1,206 @@ +//! Date tagger for Spanish. +//! +//! Converts spoken Spanish date expressions to written form: +//! - "primero de enero" → "1 de enero" +//! - "siglo diecinueve" → "siglo xix" +//! - "doscientos tres antes de cristo" → "203 a. c." 
+ +use super::cardinal; + +const MONTHS: [&str; 12] = [ + "enero", + "febrero", + "marzo", + "abril", + "mayo", + "junio", + "julio", + "agosto", + "septiembre", + "octubre", + "noviembre", + "diciembre", +]; + +const DAYS_OF_WEEK: [&str; 7] = [ + "lunes", + "martes", + "miércoles", + "jueves", + "viernes", + "sábado", + "domingo", +]; + +/// Parse spoken Spanish date expression to written form. +pub fn parse(input: &str) -> Option { + let input_lower = input.to_lowercase(); + let input_trim = input_lower.trim(); + + // Try "siglo X" → "siglo xix" + if let Some(result) = parse_siglo(input_trim) { + return Some(result); + } + + // Try "X antes de cristo" → "X a. c." + if let Some(result) = parse_antes_de_cristo(input_trim) { + return Some(result); + } + + // Try full date: "DAY de MONTH de YEAR" + if let Some(result) = parse_full_date(input_trim) { + return Some(result); + } + + // Try day+month with optional prefix: "[el/DOW] DAY de MONTH" + if let Some(result) = parse_day_month(input_trim) { + return Some(result); + } + + None +} + +/// Parse "siglo X" → "siglo xix" +fn parse_siglo(input: &str) -> Option { + if !input.starts_with("siglo ") { + return None; + } + let rest = &input[6..]; + let num = cardinal::words_to_number(rest)?; + let roman = to_roman(num as i64)?; + Some(format!("siglo {}", roman.to_lowercase())) +} + +/// Parse "X antes de cristo" → "X a. c." +fn parse_antes_de_cristo(input: &str) -> Option { + if !input.ends_with(" antes de cristo") { + return None; + } + let before = input[..input.len() - 16].trim(); + let num = cardinal::words_to_number(before)?; + Some(format!("{} a. 
/// Render `num` as an uppercase Roman numeral using subtractive notation.
///
/// Only 1..=3999 is representable; anything else yields `None`.
fn to_roman(num: i64) -> Option<String> {
    // Value/glyph pairs in descending order, subtractive forms included.
    const NUMERALS: [(i64, &str); 13] = [
        (1000, "M"),
        (900, "CM"),
        (500, "D"),
        (400, "CD"),
        (100, "C"),
        (90, "XC"),
        (50, "L"),
        (40, "XL"),
        (10, "X"),
        (9, "IX"),
        (5, "V"),
        (4, "IV"),
        (1, "I"),
    ];

    if !(1..=3999).contains(&num) {
        return None;
    }

    let mut left = num;
    let mut roman = String::new();
    for &(value, glyph) in NUMERALS.iter() {
        // Emit as many copies of this glyph as fit, then carry the rest on.
        let copies = (left / value) as usize;
        roman.push_str(&glyph.repeat(copies));
        left %= value;
    }
    Some(roman)
}
+pub fn parse(input: &str) -> Option { + let input_lower = input.to_lowercase(); + let input_trim = input_lower.trim(); + + // Check for negative + let (is_negative, rest) = if input_trim.starts_with("menos ") { + (true, input_trim.strip_prefix("menos ")?) + } else { + (false, input_trim) + }; + + let sign = if is_negative { "-" } else { "" }; + + // Try "X coma Y [scale]" pattern + if let Some(result) = parse_coma(rest) { + return Some(format!("{}{}", sign, result)); + } + + // Try "X punto Y [scale]" pattern + if let Some(result) = parse_punto(rest) { + return Some(format!("{}{}", sign, result)); + } + + // Try "punto Y" pattern (no integer part) + if let Some(result) = parse_punto_only(rest) { + return Some(format!("{}{}", sign, result)); + } + + // Try scale-only: "un millón" → "1 millón", "dos millones" → "2 millones" + if let Some(result) = parse_scale_only(rest) { + return Some(format!("{}{}", sign, result)); + } + + // Try "NUMBER scale" → "N scale" (e.g., "mil ochocientos veinticuatro millones" → "1824 millones") + if let Some(result) = parse_number_scale(rest) { + return Some(format!("{}{}", sign, result)); + } + + None +} + +/// Parse "X coma Y [scale]" +fn parse_coma(input: &str) -> Option { + let coma_pos = input.find(" coma ")?; + let int_part = &input[..coma_pos]; + let after_coma = &input[coma_pos + 6..]; + + let int_val = parse_integer_part(int_part)?; + + // Check for scale suffix + let (dec_str, scale) = extract_scale_suffix(after_coma); + + let dec_digits = parse_decimal_part(dec_str.trim())?; + + let result = if let Some(sw) = scale { + format!("{},{} {}", int_val, dec_digits, sw) + } else { + format!("{},{}", int_val, dec_digits) + }; + + Some(result) +} + +/// Parse "X punto Y [scale]" +fn parse_punto(input: &str) -> Option { + let punto_pos = input.find(" punto ")?; + let int_part = &input[..punto_pos]; + let after_punto = &input[punto_pos + 7..]; + + let int_val = parse_integer_part(int_part)?; + + // Check for scale suffix + let (dec_str, 
scale) = extract_scale_suffix(after_punto); + + let dec_digits = parse_decimal_part(dec_str.trim())?; + + let result = if let Some(sw) = scale { + format!("{}.{} {}", int_val, dec_digits, sw) + } else { + format!("{}.{}", int_val, dec_digits) + }; + + Some(result) +} + +/// Parse "punto Y" (no integer part) +fn parse_punto_only(input: &str) -> Option { + if !input.starts_with("punto ") { + return None; + } + let after = &input[6..]; + let dec_digits = parse_decimal_part(after.trim())?; + Some(format!(".{}", dec_digits)) +} + +/// Parse scale-only: "un millón" → "1 millón" +fn parse_scale_only(input: &str) -> Option { + let tokens: Vec<&str> = input.split_whitespace().collect(); + if tokens.len() != 2 { + return None; + } + let num_word = tokens[0]; + let scale_word = tokens[1]; + + if !SCALE_WORDS.contains(&scale_word) { + return None; + } + + let num = parse_integer_part(num_word)?; + Some(format!("{} {}", num, scale_word)) +} + +/// Parse "NUMBER scale" → "N scale" +fn parse_number_scale(input: &str) -> Option { + for &sw in SCALE_WORDS { + if input.ends_with(sw) { + let before = input[..input.len() - sw.len()].trim(); + if before.is_empty() { + continue; + } + // Must have multiple tokens (not just "un millón" which is handled above) + if !before.contains(' ') { + continue; + } + let num = cardinal::words_to_number(before)?; + return Some(format!("{} {}", num, sw)); + } + } + None +} + +/// Parse the integer part of a decimal +fn parse_integer_part(input: &str) -> Option { + let trimmed = input.trim(); + if trimmed == "cero" { + return Some(0); + } + cardinal::words_to_number(trimmed) +} + +/// Parse decimal digits from Spanish words. +/// Handles mixed individual digits and compound numbers: +/// "catorce quince noventa y dos sesenta y cinco tres" → "141592653" +/// +/// Each group is parsed as the largest compound number possible +/// (hundreds+tens+units, tens+units, teens, or single digits) +/// and its string representation is concatenated. 
/// Tokens are consumed greedily from left to right. At each position the
/// parser tries, in order: a hundreds word with an optional tail, a tens word
/// with an optional "y UNIT", and finally a single word in 0..=29. The decimal
/// rendering of each group is appended to the output.
///
/// Returns `None` for empty input or as soon as a token fits none of the
/// groups (including a dangling "y").
fn parse_decimal_part(input: &str) -> Option<String> {
    let tokens: Vec<&str> = input.split_whitespace().collect();
    if tokens.is_empty() {
        return None;
    }

    let mut result = String::new();
    // Index of the next unconsumed token.
    let mut i = 0;

    while i < tokens.len() {
        let t = tokens[i];

        // Try hundreds: "ciento cuarenta y uno" → 141, "novecientos veintiséis" → 926
        if let Some(hundred_base) = try_parse_hundred(t) {
            let mut val = hundred_base;
            // `j` runs ahead of `i` over the tokens absorbed into this group.
            let mut j = i + 1;

            if j < tokens.len() {
                // "ciento cuarenta y uno" — tens word follows
                if let Some(&tv) = lazy_static_tens(tokens[j]) {
                    val += tv;
                    j += 1;
                    // Check for "y UNIT"
                    if j + 1 < tokens.len() && tokens[j] == "y" {
                        if let Some(uv) = try_parse_unit(tokens[j + 1]) {
                            val += uv;
                            j += 2;
                        }
                    }
                }
                // "novecientos veintiséis" — compound teen/veinti- follows
                else if let Some(sv) = try_parse_single(tokens[j]) {
                    // "cero" (0) is not absorbed; it stays a digit of its own.
                    if sv >= 1 && sv <= 29 {
                        val += sv;
                        j += 1;
                    }
                }
                // "ciento y uno" — "y" directly follows hundreds
                else if tokens[j] == "y" && j + 1 < tokens.len() {
                    if let Some(uv) = try_parse_unit(tokens[j + 1]) {
                        val += uv;
                        j += 2;
                    }
                }
            }

            result.push_str(&val.to_string());
            // Resume after the last token that was absorbed into the group.
            i = j;
            continue;
        }

        // Try "TENS y UNIT": "treinta y tres" → 33, "noventa y dos" → 92
        if let Some(&tens_val) = lazy_static_tens(t) {
            if i + 2 < tokens.len() && tokens[i + 1] == "y" {
                if let Some(unit_val) = try_parse_unit(tokens[i + 2]) {
                    let compound = tens_val + unit_val;
                    result.push_str(&compound.to_string());
                    i += 3;
                    continue;
                }
            }
            // Tens alone: "treinta" → 30
            result.push_str(&tens_val.to_string());
            i += 1;
            continue;
        }

        // Single digit, teen, or veinti- compound
        if let Some(val) = try_parse_single(t) {
            result.push_str(&val.to_string());
            i += 1;
            continue;
        }

        // Token fits no group: the whole decimal part is rejected.
        return None;
    }

    if result.is_empty() {
        None
    } else {
        Some(result)
    }
}
{ + static ref TENS_MAP: HashMap<&'static str, i64> = { + let mut m = HashMap::new(); + m.insert("treinta", 30); + m.insert("cuarenta", 40); + m.insert("cincuenta", 50); + m.insert("sesenta", 60); + m.insert("setenta", 70); + m.insert("ochenta", 80); + m.insert("noventa", 90); + m + }; + } + TENS_MAP.get(word) +} + +fn try_parse_unit(word: &str) -> Option { + match word { + "uno" | "un" | "una" => Some(1), + "dos" => Some(2), + "tres" => Some(3), + "cuatro" => Some(4), + "cinco" => Some(5), + "seis" => Some(6), + "siete" => Some(7), + "ocho" => Some(8), + "nueve" => Some(9), + _ => None, + } +} + +fn try_parse_hundred(word: &str) -> Option { + match word { + "ciento" | "cien" => Some(100), + "doscientos" | "doscientas" => Some(200), + "trescientos" | "trescientas" => Some(300), + "cuatrocientos" | "cuatrocientas" => Some(400), + "quinientos" | "quinientas" => Some(500), + "seiscientos" | "seiscientas" => Some(600), + "setecientos" | "setecientas" => Some(700), + "ochocientos" | "ochocientas" => Some(800), + "novecientos" | "novecientas" => Some(900), + _ => None, + } +} + +fn try_parse_single(word: &str) -> Option { + match word { + "cero" => Some(0), + "uno" | "un" | "una" => Some(1), + "dos" => Some(2), + "tres" => Some(3), + "cuatro" => Some(4), + "cinco" => Some(5), + "seis" => Some(6), + "siete" => Some(7), + "ocho" => Some(8), + "nueve" => Some(9), + "diez" => Some(10), + "once" => Some(11), + "doce" => Some(12), + "trece" => Some(13), + "catorce" => Some(14), + "quince" => Some(15), + "dieciséis" => Some(16), + "diecisiete" => Some(17), + "dieciocho" => Some(18), + "diecinueve" => Some(19), + "veinte" => Some(20), + "veintiún" | "veintiuno" => Some(21), + "veintidós" => Some(22), + "veintitrés" => Some(23), + "veinticuatro" => Some(24), + "veinticinco" => Some(25), + "veintiséis" => Some(26), + "veintisiete" => Some(27), + "veintiocho" => Some(28), + "veintinueve" => Some(29), + _ => None, + } +} + +/// Extract scale suffix from end of string +fn 
extract_scale_suffix(input: &str) -> (&str, Option<&str>) { + let trimmed = input.trim(); + for &sw in SCALE_WORDS { + if trimmed.ends_with(sw) { + let before = trimmed[..trimmed.len() - sw.len()].trim(); + return (before, Some(sw)); + } + } + (trimmed, None) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_coma() { + assert_eq!(parse("uno coma dos seis"), Some("1,26".to_string())); + } + + #[test] + fn test_negative() { + assert_eq!(parse("menos uno coma dos seis"), Some("-1,26".to_string())); + } + + #[test] + fn test_punto() { + assert_eq!(parse("uno punto treinta y tres"), Some("1.33".to_string())); + } + + #[test] + fn test_scale() { + assert_eq!(parse("un millón"), Some("1 millón".to_string())); + assert_eq!(parse("dos millones"), Some("2 millones".to_string())); + } +} diff --git a/src/asr/es/electronic.rs b/src/asr/es/electronic.rs new file mode 100644 index 0000000..e5cb3b5 --- /dev/null +++ b/src/asr/es/electronic.rs @@ -0,0 +1,139 @@ +//! Electronic tagger for Spanish. +//! +//! Converts spoken Spanish email/URL tokens to written form: +//! - "a b c arroba g mail punto com" → "abc@gmail.com" +//! - "hache te te pe ese dos puntos barra barra ..." → "https://..." + +use super::cardinal; + +/// Parse spoken Spanish electronic address to written form. 
+pub fn parse(input: &str) -> Option { + let input_lower = input.to_lowercase(); + let input_trim = input_lower.trim(); + + if !input_trim.contains("arroba") + && !input_trim.contains("punto") + && !input_trim.contains("barra") + { + return None; + } + + // Require "arroba" or multiple delimiters (punto/barra) to avoid matching + // decimal expressions like "uno punto treinta y tres" as electronic + if !input_trim.contains("arroba") { + let delim_count = input_trim.matches("punto").count() + + input_trim.matches("barra").count() + + input_trim.matches("dos puntos").count(); + if delim_count < 2 { + return None; + } + } + + let tokens: Vec<&str> = input_trim.split_whitespace().collect(); + if tokens.len() < 3 { + return None; + } + + let mut result = String::new(); + let mut i = 0; + + while i < tokens.len() { + let t = tokens[i]; + + // Multi-word tokens + if t == "doble" && i + 1 < tokens.len() && tokens[i + 1] == "ve" { + result.push('w'); + i += 2; + continue; + } + if t == "dos" && i + 1 < tokens.len() && tokens[i + 1] == "puntos" { + result.push(':'); + i += 2; + continue; + } + if t == "signo" + && i + 2 < tokens.len() + && tokens[i + 1] == "de" + && tokens[i + 2] == "interrogación" + { + result.push('?'); + i += 3; + continue; + } + if t == "signo" && i + 1 < tokens.len() && tokens[i + 1] == "igual" { + result.push('='); + i += 2; + continue; + } + + // Single-word special tokens + match t { + "arroba" => result.push('@'), + "punto" => result.push('.'), + "barra" => result.push('/'), + "guion" | "guión" => result.push('-'), + "hache" => result.push('h'), + "te" => result.push('t'), + "pe" => result.push('p'), + "ese" => result.push('s'), + "efe" => result.push('f'), + "ene" => result.push('n'), + "eme" => result.push('m'), + "ele" => result.push('l'), + "ere" => result.push('r'), + "ce" => result.push('c'), + "de" => result.push('d'), + "ge" => result.push('g'), + "jota" => result.push('j'), + "ka" => result.push('k'), + "cu" => result.push('q'), + "equis" 
=> result.push('x'), + "ye" | "i griega" => result.push('y'), + "zeta" => result.push('z'), + _ => { + // Single letter (a-z) + if t.len() == 1 && t.chars().all(|c| c.is_ascii_alphabetic()) { + result.push_str(t); + } + // Digit word + else if let Some(digit) = cardinal::word_to_digit(t) { + result.push_str(&digit.to_string()); + } + // Multi-char word that's not a special token → append as-is + else if t.len() > 1 { + // Could be a domain part like "gmail", "nvidia", "com", "edu", "gob" + result.push_str(t); + } + } + } + + i += 1; + } + + if result.is_empty() || result == input_trim { + return None; + } + + Some(result) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_email() { + assert_eq!( + parse("a b c arroba g mail punto com"), + Some("abc@gmail.com".to_string()) + ); + } + + #[test] + fn test_url() { + assert_eq!( + parse("doble ve doble ve doble ve punto n vidia punto com"), + Some("www.nvidia.com".to_string()) + ); + } +} diff --git a/src/asr/es/fraction.rs b/src/asr/es/fraction.rs new file mode 100644 index 0000000..710cda3 --- /dev/null +++ b/src/asr/es/fraction.rs @@ -0,0 +1,214 @@ +//! Fraction tagger for Spanish. +//! +//! Converts spoken Spanish fractions to written form: +//! - "ocho tercios" → "8/3" +//! - "dos y dos tercios" → "2 2/3" +//! - "menos diez veinteavos" → "-10/20" + +use super::cardinal; + +/// Parse spoken Spanish fraction to written form. +pub fn parse(input: &str) -> Option { + let input_lower = input.to_lowercase(); + let input_trim = input_lower.trim(); + + // Passthrough small fractions + match input_trim { + "medio" | "media" | "un medio" | "una media" => return Some(input_trim.to_string()), + "un cuarto" | "una cuarta" => return Some(input_trim.to_string()), + "un tercio" => return Some(input_trim.to_string()), + _ => {} + } + + // Check for negative + let (is_negative, rest) = if input_trim.starts_with("menos ") { + (true, input_trim.strip_prefix("menos ")?) 
+ } else { + (false, input_trim) + }; + + let sign = if is_negative { "-" } else { "" }; + + // Try mixed fraction: "dos y dos tercios" → "2 2/3" + if let Some(result) = parse_mixed_fraction(rest) { + return Some(format!("{}{}", sign, result)); + } + + // Try simple fraction: "ocho tercios" → "8/3" + if let Some(result) = parse_simple_fraction(rest) { + return Some(format!("{}{}", sign, result)); + } + + None +} + +/// Parse mixed fraction: "dos y dos tercios" → "2 2/3" +/// Pattern: "WHOLE y NUMER DENOM" +fn parse_mixed_fraction(input: &str) -> Option { + // Look for " y " separator for mixed fractions + // "cuatro y un quinto" → whole=4, frac=1/5 + let y_pos = input.find(" y ")?; + let whole_part = &input[..y_pos]; + let frac_part = &input[y_pos + 3..]; + + // Try parsing frac_part as a simple fraction + let frac = parse_simple_fraction(frac_part)?; + + // Parse whole part as a number + let whole = cardinal::words_to_number(whole_part)?; + + Some(format!("{} {}", whole, frac)) +} + +/// Parse simple fraction: "ocho tercios" → "8/3" +fn parse_simple_fraction(input: &str) -> Option { + let tokens: Vec<&str> = input.split_whitespace().collect(); + if tokens.len() < 2 { + return None; + } + + let last = *tokens.last()?; + + // Parse denominator + let denom = parse_denominator(last)?; + + // Parse numerator + let numer_str = tokens[..tokens.len() - 1].join(" "); + let numer = parse_numerator(&numer_str)?; + + Some(format!("{}/{}", numer, denom)) +} + +/// Parse numerator +fn parse_numerator(input: &str) -> Option { + let trimmed = input.trim(); + if trimmed == "un" || trimmed == "una" || trimmed == "uno" { + return Some(1); + } + cardinal::words_to_number(trimmed) +} + +/// Parse denominator word to numeric value +fn parse_denominator(word: &str) -> Option { + match word { + "medio" | "media" | "medios" | "medias" => Some(2), + "tercio" | "tercios" => Some(3), + "cuarto" | "cuartos" | "cuarta" | "cuartas" => Some(4), + "quinto" | "quintos" | "quinta" | "quintas" => 
Some(5), + "sexto" | "sextos" => Some(6), + "séptimo" | "séptimos" => Some(7), + "octavo" | "octavos" => Some(8), + "noveno" | "novenos" => Some(9), + "décimo" | "décimos" => Some(10), + "onceavo" | "onceavos" => Some(11), + "doceavo" | "doceavos" => Some(12), + "treceavo" | "treceavos" => Some(13), + "catorceavo" | "catorceavos" => Some(14), + "quinceavo" | "quinceavos" => Some(15), + "dieciseisavo" | "dieciseisavos" => Some(16), + "diecisieteavo" | "diecisieteavos" => Some(17), + "dieciochoavo" | "dieciochoavos" => Some(18), + "diecinueveavo" | "diecinueveavos" => Some(19), + "veinteavo" | "veinteavos" => Some(20), + "vigésimo" | "vigésimos" => Some(20), + "treintavo" | "treintavos" => Some(30), + "cuarentavo" | "cuarentavos" => Some(40), + "cincuentavo" | "cincuentavos" => Some(50), + _ => parse_compound_denominator(word), + } +} + +/// Parse compound denominator like "cientounavos" → 101, "cuarentiunavo" → 41 +fn parse_compound_denominator(word: &str) -> Option { + // Try stripping -avo/-avos/-ava/-avas suffix + let stem = if let Some(s) = word.strip_suffix("avos") { + s + } else if let Some(s) = word.strip_suffix("avo") { + s + } else if let Some(s) = word.strip_suffix("avas") { + s + } else if let Some(s) = word.strip_suffix("ava") { + s + } else { + return None; + }; + + // Try to parse the stem as a number + // "cientoun" → "ciento un" → 101 + // "cuarentiun" → "cuarenta y un" → 41 + parse_denom_stem(stem) +} + +/// Parse a denominator stem to a number +fn parse_denom_stem(stem: &str) -> Option { + // Common compound patterns + match stem { + "cientoun" => Some(101), + "cuarentiun" => Some(41), + "treintaiun" | "treintaun" => Some(31), + _ => { + // Try splitting compound forms + // "cientoun" already handled above + // Try "ciento" + rest + if stem.starts_with("ciento") { + let rest = &stem[6..]; + let unit = parse_denom_unit(rest)?; + return Some(100 + unit); + } + if stem.starts_with("cien") && stem.len() > 4 { + let rest = &stem[4..]; + let unit = 
parse_denom_unit(rest)?; + return Some(100 + unit); + } + None + } + } +} + +fn parse_denom_unit(s: &str) -> Option { + match s { + "un" | "uno" | "una" => Some(1), + "dos" => Some(2), + "tres" => Some(3), + "cuatro" => Some(4), + "cinco" => Some(5), + "seis" => Some(6), + "siete" => Some(7), + "ocho" => Some(8), + "nueve" => Some(9), + _ => None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_simple() { + assert_eq!(parse("ocho tercios"), Some("8/3".to_string())); + assert_eq!(parse("dos quintos"), Some("2/5".to_string())); + } + + #[test] + fn test_passthrough() { + assert_eq!(parse("medio"), Some("medio".to_string())); + assert_eq!(parse("un cuarto"), Some("un cuarto".to_string())); + } + + #[test] + fn test_mixed() { + assert_eq!(parse("dos y dos tercios"), Some("2 2/3".to_string())); + } + + #[test] + fn test_negative() { + assert_eq!(parse("menos diez veinteavos"), Some("-10/20".to_string())); + } + + #[test] + fn test_compound_denom() { + assert_eq!(parse("once cientounavos"), Some("11/101".to_string())); + assert_eq!(parse("un cuarentiunavo"), Some("1/41".to_string())); + } +} diff --git a/src/asr/es/measure.rs b/src/asr/es/measure.rs new file mode 100644 index 0000000..660c32d --- /dev/null +++ b/src/asr/es/measure.rs @@ -0,0 +1,293 @@ +//! Measure tagger for Spanish. +//! +//! Converts spoken Spanish measurements to written form: +//! - "doscientos metros" → "200 m" +//! - "dos metros y medio" → "2 1/2 m" +//! 
- "dos más dos es igual a cuatro" → "2 + 2 = 4" + +use super::cardinal; +use super::decimal; +use super::fraction; + +struct UnitMapping { + spoken: &'static [&'static str], + written: &'static str, +} + +const UNITS: &[UnitMapping] = &[ + UnitMapping { + spoken: &["kilómetros por hora", "kilómetro por hora"], + written: "kph", + }, + UnitMapping { + spoken: &["millas por hora", "milla por hora"], + written: "mph", + }, + UnitMapping { + spoken: &["metros por hora", "metro por hora"], + written: "m/h", + }, + UnitMapping { + spoken: &["metros cúbicos", "metro cúbico"], + written: "m³", + }, + UnitMapping { + spoken: &["kilómetros", "kilómetro"], + written: "km", + }, + UnitMapping { + spoken: &["centímetros", "centímetro"], + written: "cm", + }, + UnitMapping { + spoken: &["milímetros", "milímetro"], + written: "mm", + }, + UnitMapping { + spoken: &["metros", "metro"], + written: "m", + }, + UnitMapping { + spoken: &["kilogramos", "kilogramo", "kilos", "kilo"], + written: "kg", + }, + UnitMapping { + spoken: &["gramos", "gramo"], + written: "g", + }, + UnitMapping { + spoken: &["litros", "litro"], + written: "l", + }, + UnitMapping { + spoken: &["mililitros", "mililitro"], + written: "ml", + }, + UnitMapping { + spoken: &["horas", "hora"], + written: "h", + }, + UnitMapping { + spoken: &["segundos", "segundo"], + written: "s", + }, + UnitMapping { + spoken: &["minutos", "minuto"], + written: "min", + }, + UnitMapping { + spoken: &["grados farenheit", "grado farenheit"], + written: "° F", + }, + UnitMapping { + spoken: &["grados celsius", "grado celsius"], + written: "° C", + }, + UnitMapping { + spoken: &["grados", "grado"], + written: "°", + }, + UnitMapping { + spoken: &["por ciento", "porciento"], + written: "%", + }, + UnitMapping { + spoken: &["millas", "milla"], + written: "mi", + }, +]; + +/// Parse spoken Spanish measurement to written form. 
+pub fn parse(input: &str) -> Option { + let input_lower = input.to_lowercase(); + let input_trim = input_lower.trim(); + + // Try math expression: "dos más dos es igual a cuatro" + if let Some(result) = parse_math(input_trim) { + return Some(result); + } + + // Try fraction + unit: "dos metros y medio" → "2 1/2 m" + if let Some(result) = parse_fraction_measure(input_trim) { + return Some(result); + } + + // Try "tres quintos de metro" → "3/5 m" + if let Some(result) = parse_fraction_de_unit(input_trim) { + return Some(result); + } + + // Try decimal + unit: "sesenta coma dos cuatro cero cero kilogramos" + if let Some(result) = parse_decimal_measure(input_trim) { + return Some(result); + } + + // Try simple: "doscientos metros" → "200 m" + if let Some(result) = parse_simple_measure(input_trim) { + return Some(result); + } + + None +} + +/// Parse math expression: "dos más dos es igual a cuatro" → "2 + 2 = 4" +fn parse_math(input: &str) -> Option { + if !input.contains(" es igual a ") { + return None; + } + let parts: Vec<&str> = input.splitn(2, " es igual a ").collect(); + if parts.len() != 2 { + return None; + } + + let left = parts[0].trim(); + let right = parts[1].trim(); + + // Parse right side + let right_val = cardinal::words_to_number(right)?; + + // Parse left side: "X más Y" or "X menos Y" or "X por Y" + if let Some(pos) = left.find(" más ") { + let a = cardinal::words_to_number(&left[..pos])?; + let b = cardinal::words_to_number(&left[pos + 5..])?; + return Some(format!("{} + {} = {}", a, b, right_val)); + } + if let Some(pos) = left.find(" menos ") { + let a = cardinal::words_to_number(&left[..pos])?; + let b = cardinal::words_to_number(&left[pos + 7..])?; + return Some(format!("{} - {} = {}", a, b, right_val)); + } + + None +} + +/// Parse fraction + unit: "dos metros y medio" → "2 1/2 m" +/// Also: "menos tres y medio metros por hora" → "-3 1/2 m/h" +fn parse_fraction_measure(input: &str) -> Option { + // Check for negative + let (sign, rest) = if 
input.starts_with("menos ") { + ("-", &input[6..]) + } else { + ("", input) + }; + + for unit in UNITS { + for &spoken in unit.spoken { + // "X UNIT y medio" → "X 1/2 UNIT" + let patterns = [ + (format!(" {} y medio", spoken), "1/2"), + (format!(" {} y media", spoken), "1/2"), + ]; + for (pattern, frac) in &patterns { + if rest.ends_with(pattern.as_str()) { + let before = rest[..rest.len() - pattern.len()].trim(); + let num = cardinal::words_to_number(before)?; + return Some(format!("{}{} {} {}", sign, num, frac, unit.written)); + } + } + + // "X y medio UNIT" → "X 1/2 UNIT" + if rest.ends_with(spoken) { + let before = rest[..rest.len() - spoken.len()].trim(); + if before.ends_with(" y medio") || before.ends_with(" y media") { + let num_part = if before.ends_with(" y medio") { + &before[..before.len() - 8] + } else { + &before[..before.len() - 8] + }; + let num = cardinal::words_to_number(num_part.trim())?; + return Some(format!("{}{} 1/2 {}", sign, num, unit.written)); + } + } + } + } + None +} + +/// Parse "tres quintos de metro" → "3/5 m" +fn parse_fraction_de_unit(input: &str) -> Option { + for unit in UNITS { + for &spoken in unit.spoken { + let de_pattern = format!(" de {}", spoken); + if input.ends_with(&de_pattern) { + let before = input[..input.len() - de_pattern.len()].trim(); + if let Some(frac) = fraction::parse(before) { + return Some(format!("{} {}", frac, unit.written)); + } + } + } + } + None +} + +/// Parse decimal + unit: "sesenta coma dos cuatro cero cero kilogramos" → "60,2400 kg" +fn parse_decimal_measure(input: &str) -> Option { + if !input.contains(" coma ") { + return None; + } + + for unit in UNITS { + for &spoken in unit.spoken { + if input.ends_with(spoken) { + let before = input[..input.len() - spoken.len()].trim(); + if let Some(dec_result) = decimal::parse(before) { + return Some(format!("{} {}", dec_result, unit.written)); + } + } + } + } + None +} + +/// Parse simple measure: "doscientos metros" → "200 m" +fn 
parse_simple_measure(input: &str) -> Option { + // Check for negative + let (sign, rest) = if input.starts_with("menos ") { + ("-", &input[6..]) + } else { + ("", input) + }; + + for unit in UNITS { + for &spoken in unit.spoken { + if rest.ends_with(spoken) { + let before = rest[..rest.len() - spoken.len()].trim(); + if before.is_empty() { + continue; + } + // Handle "una hora" → "1 h" (feminine) + let num = if before == "una" || before == "un" { + 1 + } else { + cardinal::words_to_number(before)? as i64 + }; + return Some(format!("{}{} {}", sign, num, unit.written)); + } + } + } + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_simple() { + assert_eq!(parse("doscientos metros"), Some("200 m".to_string())); + assert_eq!(parse("una hora"), Some("1 h".to_string())); + } + + #[test] + fn test_fraction() { + assert_eq!(parse("dos metros y medio"), Some("2 1/2 m".to_string())); + } + + #[test] + fn test_math() { + assert_eq!( + parse("dos más dos es igual a cuatro"), + Some("2 + 2 = 4".to_string()) + ); + } +} diff --git a/src/asr/es/mod.rs b/src/asr/es/mod.rs new file mode 100644 index 0000000..8361ec8 --- /dev/null +++ b/src/asr/es/mod.rs @@ -0,0 +1,20 @@ +//! Inverse Text Normalization taggers for Spanish. +//! +//! Converts spoken-form Spanish to written form: +//! - "doscientos cincuenta y uno" → "251" +//! - "doce dólares y cinco centavos" → "$12,05" +//! - "primero de enero" → "1 de enero" + +pub mod cardinal; +pub mod date; +pub mod decimal; +pub mod electronic; +pub mod fraction; +pub mod measure; +pub mod money; +pub mod ordinal; +pub mod punctuation; +pub mod telephone; +pub mod time; +pub mod whitelist; +pub mod word; diff --git a/src/asr/es/money.rs b/src/asr/es/money.rs new file mode 100644 index 0000000..de011b9 --- /dev/null +++ b/src/asr/es/money.rs @@ -0,0 +1,434 @@ +//! Money tagger for Spanish. +//! +//! Converts spoken Spanish currency expressions to written form: +//! - "doce dólares y cinco centavos" → "$12,05" +//! 
- "veinticinco céntimos" → "€0,25" +//! - "diez pesetas" → "₧10" + +use super::cardinal; + +struct Currency { + names: &'static [&'static str], + symbol: &'static str, + cent_names: &'static [&'static str], +} + +const CURRENCIES: &[Currency] = &[ + Currency { + names: &["dólares estadounidenses", "dólares americanos"], + symbol: "US$", + cent_names: &["centavos", "centavo"], + }, + Currency { + names: &["pesos mexicanos", "peso mexicano"], + symbol: "Mex$", + cent_names: &["centavos", "centavo"], + }, + Currency { + names: &["dólar", "dólares"], + symbol: "$", + cent_names: &["centavos", "centavo", "céntimos", "céntimo"], + }, + Currency { + names: &["euro", "euros"], + symbol: "€", + cent_names: &["centavos", "centavo", "céntimos", "céntimo"], + }, + Currency { + names: &["peso", "pesos"], + symbol: "$", + cent_names: &["centavos", "centavo"], + }, + Currency { + names: &["yen", "yenes"], + symbol: "¥", + cent_names: &["centavos", "centavo"], + }, + Currency { + names: &["peseta", "pesetas"], + symbol: "₧", + cent_names: &[], + }, + Currency { + names: &["colón", "colones"], + symbol: "₡", + cent_names: &[], + }, + Currency { + names: &["won", "wones"], + symbol: "₩", + cent_names: &["chon", "chones"], + }, + Currency { + names: &["quetzal", "quetzales"], + symbol: "q", + cent_names: &[], + }, +]; + +/// Parse spoken Spanish money expression to written form. 
+pub fn parse(input: &str) -> Option { + let input_lower = input.to_lowercase(); + let input_trim = input_lower.trim(); + + if !input_trim.contains(' ') { + return None; + } + + // Try "dos dólares y sesenta y tres dólares" → "$2 y $63" (two amounts) + if let Some(result) = parse_two_amounts(input_trim) { + return Some(result); + } + + // Try scale money: "nueve punto cinco millones de pesos" → "$9.5 millones" + if let Some(result) = parse_scale_money(input_trim) { + return Some(result); + } + + // Try full scale: "catorce millones quinientos mil pesos mexicanos" → "Mex$14500000" + if let Some(result) = parse_full_scale_money(input_trim) { + return Some(result); + } + + // Try "X CURRENCY y/con Y centavos" + if let Some(result) = parse_with_subcurrency(input_trim) { + return Some(result); + } + + // Try "X CURRENCY Y [centavos]" (implied or explicit cents) + if let Some(result) = parse_implied_cents(input_trim) { + return Some(result); + } + + // Try "X CURRENCY con Y" + if let Some(result) = parse_con_amount(input_trim) { + return Some(result); + } + + // Try simple: "un dólar" → "$1" + if let Some(result) = parse_simple(input_trim) { + return Some(result); + } + + // Try cent-only: "veinticinco centavos" → "$0,25" + if let Some(result) = parse_cents_only(input_trim) { + return Some(result); + } + + // Try chon: "un chon" → "₩0,01" + if let Some(result) = parse_subunit_only(input_trim) { + return Some(result); + } + + None +} + +/// Parse two separate amounts: "dos dólares y sesenta y tres dólares" +fn parse_two_amounts(input: &str) -> Option { + for cur in CURRENCIES { + for &name in cur.names { + // Look for "X NAME y ... 
NAME" pattern + let pattern = format!(" {} y ", name); + if let Some(pos) = input.find(&pattern) { + let first_part = &input[..pos]; + let second_part = &input[pos + pattern.len()..]; + + // Second part should end with same currency + if second_part.ends_with(name) { + let second_num = second_part[..second_part.len() - name.len()].trim(); + let first_val = cardinal::words_to_number(first_part)?; + let second_val = cardinal::words_to_number(second_num)?; + return Some(format!( + "{}{} y {}{}", + cur.symbol, first_val, cur.symbol, second_val + )); + } + } + } + } + None +} + +/// Parse scale money: "nueve punto cinco millones de pesos" → "$9.5 millones" +fn parse_scale_money(input: &str) -> Option { + let scale_words = ["millones", "millón", "billones", "billón"]; + + for cur in CURRENCIES { + for &name in cur.names { + // Check for "de CURRENCY" at end + let de_pattern = format!("de {}", name); + if input.ends_with(&de_pattern) { + let before = input[..input.len() - de_pattern.len()].trim(); + // Check for scale word + for &sw in &scale_words { + if before.ends_with(sw) { + let num_part = before[..before.len() - sw.len()].trim(); + // Try "punto" decimal + if num_part.contains(" punto ") { + let parts: Vec<&str> = num_part.splitn(2, " punto ").collect(); + let int_val = cardinal::words_to_number(parts[0].trim())?; + let dec_digits = parse_decimal_digits(parts[1].trim())?; + return Some(format!( + "{}{}.{} {}", + cur.symbol, int_val, dec_digits, sw + )); + } + let num = cardinal::words_to_number(num_part)?; + return Some(format!("{}{} {}", cur.symbol, num, sw)); + } + } + } + } + } + None +} + +/// Parse full-scale money: "catorce millones quinientos mil pesos mexicanos" → "Mex$14500000" +fn parse_full_scale_money(input: &str) -> Option { + for cur in CURRENCIES { + for &name in cur.names { + if input.ends_with(name) { + let before = input[..input.len() - name.len()].trim(); + if before.is_empty() { + continue; + } + // Must contain a scale word to be full-scale + 
let has_scale = ["millones", "millón", "mil", "billones", "billón"] + .iter() + .any(|&sw| before.contains(sw)); + if !has_scale { + continue; + } + let num = cardinal::words_to_number(before)?; + if num >= 1000 { + return Some(format!("{}{}", cur.symbol, num)); + } + } + } + } + None +} + +/// Parse with subcurrency: "doce dólares y cinco centavos" → "$12,05" +fn parse_with_subcurrency(input: &str) -> Option { + for cur in CURRENCIES { + for ¢_name in cur.cent_names { + if !input.ends_with(cent_name) { + continue; + } + let before_cent = input[..input.len() - cent_name.len()].trim(); + + // Try "X CURRENCY y Y" + for &cur_name in cur.names { + // "y" separator + let y_pattern = format!("{} y ", cur_name); + if let Some(pos) = before_cent.find(&y_pattern) { + let main_part = &before_cent[..pos]; + let cent_part = &before_cent[pos + y_pattern.len()..]; + + let main_val = cardinal::words_to_number(main_part)?; + let cent_val = cardinal::words_to_number(cent_part.trim())?; + + return Some(format!("{}{},{:02}", cur.symbol, main_val, cent_val)); + } + + // "con" separator + let con_pattern = format!("{} con ", cur_name); + if let Some(pos) = before_cent.find(&con_pattern) { + let main_part = &before_cent[..pos]; + let cent_part = &before_cent[pos + con_pattern.len()..]; + + let main_val = cardinal::words_to_number(main_part)?; + let cent_val = cardinal::words_to_number(cent_part.trim())?; + + return Some(format!("{}{},{:02}", cur.symbol, main_val, cent_val)); + } + + // No separator: "veintinueve dólares cincuenta centavos" + let space_pattern = format!("{} ", cur_name); + if let Some(pos) = before_cent.find(&space_pattern) { + let main_part = &before_cent[..pos]; + let cent_part = &before_cent[pos + space_pattern.len()..]; + + let main_val = cardinal::words_to_number(main_part)?; + let cent_val = cardinal::words_to_number(cent_part.trim())?; + + return Some(format!("{}{},{:02}", cur.symbol, main_val, cent_val)); + } + } + } + } + None +} + +/// Parse implied cents: 
"setenta y cinco dólares sesenta y tres" → "$75,63" +fn parse_implied_cents(input: &str) -> Option { + for cur in CURRENCIES { + for &cur_name in cur.names { + let pattern = format!(" {} ", cur_name); + if let Some(pos) = input.find(&pattern) { + let main_part = &input[..pos]; + let cent_part = &input[pos + pattern.len()..]; + + // cent_part should not end with a currency name + let is_subcurrency = cur.cent_names.iter().any(|&c| cent_part.ends_with(c)); + if is_subcurrency { + continue; + } + + let main_val = cardinal::words_to_number(main_part)?; + let cent_val = cardinal::words_to_number(cent_part)?; + + return Some(format!("{}{},{:02}", cur.symbol, main_val, cent_val)); + } + } + } + None +} + +/// Parse "X CURRENCY con Y" +fn parse_con_amount(input: &str) -> Option { + for cur in CURRENCIES { + for &cur_name in cur.names { + let pattern = format!(" {} con ", cur_name); + if let Some(pos) = input.find(&pattern) { + let main_part = &input[..pos]; + let cent_part = &input[pos + pattern.len()..]; + + let main_val = cardinal::words_to_number(main_part)?; + let cent_val = cardinal::words_to_number(cent_part)?; + + return Some(format!("{}{},{:02}", cur.symbol, main_val, cent_val)); + } + } + } + None +} + +/// Parse simple: "un dólar" → "$1" +fn parse_simple(input: &str) -> Option { + for cur in CURRENCIES { + for &cur_name in cur.names { + if input.ends_with(cur_name) { + let before = input[..input.len() - cur_name.len()].trim(); + if before.is_empty() { + continue; + } + let num = cardinal::words_to_number(before)?; + return Some(format!("{}{}", cur.symbol, num)); + } + } + } + None +} + +/// Parse cents-only: "veinticinco centavos" → "$0,25" +fn parse_cents_only(input: &str) -> Option { + // "centavos" defaults to dollar + for ¢_name in &["centavos", "centavo"] { + if input.ends_with(cent_name) { + let before = input[..input.len() - cent_name.len()].trim(); + if before.is_empty() { + continue; + } + let num = cardinal::words_to_number(before)?; + return 
Some(format!("$0,{:02}", num)); + } + } + // "céntimos" defaults to euro + for ¢_name in &["céntimos", "céntimo"] { + if input.ends_with(cent_name) { + let before = input[..input.len() - cent_name.len()].trim(); + if before.is_empty() { + continue; + } + let num = cardinal::words_to_number(before)?; + return Some(format!("€0,{:02}", num)); + } + } + None +} + +/// Parse subunit-only: "un chon" → "₩0,01" +fn parse_subunit_only(input: &str) -> Option { + for cur in CURRENCIES { + for ¢_name in cur.cent_names { + if input.ends_with(cent_name) { + let before = input[..input.len() - cent_name.len()].trim(); + if before.is_empty() { + continue; + } + let num = cardinal::words_to_number(before)?; + return Some(format!("{}0,{:02}", cur.symbol, num)); + } + } + } + None +} + +/// Parse decimal digits +fn parse_decimal_digits(input: &str) -> Option { + let digit_map = [ + ("cero", "0"), + ("uno", "1"), + ("un", "1"), + ("dos", "2"), + ("tres", "3"), + ("cuatro", "4"), + ("cinco", "5"), + ("seis", "6"), + ("siete", "7"), + ("ocho", "8"), + ("nueve", "9"), + ]; + + let tokens: Vec<&str> = input.split_whitespace().collect(); + let mut result = String::new(); + for token in &tokens { + let mut found = false; + for &(word, digit) in &digit_map { + if token == &word { + result.push_str(digit); + found = true; + break; + } + } + if !found { + // Try as a compound number + if let Some(num) = cardinal::words_to_number(token) { + result.push_str(&num.to_string()); + } else { + return None; + } + } + } + Some(result) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_simple() { + assert_eq!(parse("un dólar"), Some("$1".to_string())); + } + + #[test] + fn test_with_cents() { + assert_eq!( + parse("doce dólares y cinco centavos"), + Some("$12,05".to_string()) + ); + } + + #[test] + fn test_centimos() { + assert_eq!(parse("veinticinco céntimos"), Some("€0,25".to_string())); + } + + #[test] + fn test_pesetas() { + assert_eq!(parse("diez pesetas"), 
Some("₧10".to_string())); + } +} diff --git a/src/asr/es/ordinal.rs b/src/asr/es/ordinal.rs new file mode 100644 index 0000000..ebda994 --- /dev/null +++ b/src/asr/es/ordinal.rs @@ -0,0 +1,193 @@ +//! Ordinal number tagger for Spanish. +//! +//! Converts spoken Spanish ordinals to written form: +//! - "primero" → "primero" (small ordinals stay as words) +//! - "décimo" → "10.º" +//! - "vigésimo primero" → "21.º" + +use lazy_static::lazy_static; +use std::collections::HashMap; + +lazy_static! { + /// Small ordinals that pass through as words + static ref PASSTHROUGH: Vec<&'static str> = vec![ + "primero", "primera", "primer", "segundo", "segunda", + "tercero", "tercera", "tercer", "cuarto", "cuarta", + "quinto", "quinta", "sexto", "sexta", + "séptimo", "séptima", "octavo", "octava", + "noveno", "novena", + ]; + + /// Ordinal word → (value, gender) mappings + /// Gender: 'm' = masculine, 'f' = feminine, 'r' = abbreviated masculine (ᵉʳ) + static ref ORDINALS: HashMap<&'static str, (i64, char)> = { + let mut m = HashMap::new(); + // Tens ordinals + m.insert("décimo", (10, 'm')); + m.insert("décima", (10, 'f')); + m.insert("undécimo", (11, 'm')); + m.insert("undécima", (11, 'f')); + m.insert("duodécimo", (12, 'm')); + m.insert("duodécima", (12, 'f')); + m.insert("decimotercero", (13, 'm')); + m.insert("decimotercera", (13, 'f')); + m.insert("decimocuarto", (14, 'm')); + m.insert("decimoquinto", (15, 'm')); + m.insert("decimosexto", (16, 'm')); + m.insert("decimoséptimo", (17, 'm')); + m.insert("decimoctavo", (18, 'm')); + m.insert("decimonoveno", (19, 'm')); + m.insert("vigésimo", (20, 'm')); + m.insert("vigésima", (20, 'f')); + m.insert("vigesimosegundo", (22, 'm')); + m.insert("vigesimosegunda", (22, 'f')); + m.insert("vigesimoctavo", (28, 'm')); + m.insert("trigésimo", (30, 'm')); + m.insert("trigésima", (30, 'f')); + m.insert("cuadragésimo", (40, 'm')); + m.insert("quincuagésimo", (50, 'm')); + m.insert("sexagésimo", (60, 'm')); + m.insert("septuagésimo", (70, 
'm')); + m.insert("octogésimo", (80, 'm')); + m.insert("nonagésimo", (90, 'm')); + m.insert("centésimo", (100, 'm')); + m.insert("centésima", (100, 'f')); + // Compound forms that don't split + m.insert("decimoprimero", (11, 'm')); + m.insert("decimoprimera", (11, 'f')); + m.insert("decimoprimer", (11, 'r')); + m + }; + + /// Small ordinal components for compound ordinals + static ref ORDINAL_UNITS: HashMap<&'static str, (i64, char)> = { + let mut m = HashMap::new(); + m.insert("primero", (1, 'm')); + m.insert("primera", (1, 'f')); + m.insert("primer", (1, 'r')); + m.insert("segundo", (2, 'm')); + m.insert("segunda", (2, 'f')); + m.insert("tercero", (3, 'm')); + m.insert("tercera", (3, 'f')); + m.insert("tercer", (3, 'r')); + m.insert("cuarto", (4, 'm')); + m.insert("cuarta", (4, 'f')); + m.insert("quinto", (5, 'm')); + m.insert("quinta", (5, 'f')); + m.insert("sexto", (6, 'm')); + m.insert("sexta", (6, 'f')); + m.insert("séptimo", (7, 'm')); + m.insert("séptima", (7, 'f')); + m.insert("octavo", (8, 'm')); + m.insert("octava", (8, 'f')); + m.insert("noveno", (9, 'm')); + m.insert("novena", (9, 'f')); + m.insert("undécimo", (11, 'm')); + m.insert("undécima", (11, 'f')); + m + }; +} + +/// Parse spoken Spanish ordinal to written form. 
+pub fn parse(input: &str) -> Option { + let input_lower = input.to_lowercase(); + let input_trim = input_lower.trim(); + + // Handle prefix text like "(technically ungrammatical)" + let (prefix, ordinal_part) = extract_prefix(input_trim); + + // Check passthrough + if prefix.is_none() && PASSTHROUGH.contains(&ordinal_part) { + return Some(ordinal_part.to_string()); + } + + // Try single-word ordinal + if let Some(&(val, gender)) = ORDINALS.get(ordinal_part) { + let suffix = gender_suffix(gender); + let result = format!("{}{}", val, suffix); + return Some(with_prefix(prefix, &result)); + } + + // Try multi-word compound ordinals: "vigésimo primero", "centésimo trigésimo cuarto" + let tokens: Vec<&str> = ordinal_part.split_whitespace().collect(); + if tokens.len() >= 2 { + let mut total: i64 = 0; + let mut last_gender = 'm'; + + for &token in &tokens { + if let Some(&(val, g)) = ORDINALS.get(token) { + total += val; + last_gender = g; + } else if let Some(&(val, g)) = ORDINAL_UNITS.get(token) { + total += val; + last_gender = g; + } else { + return None; + } + } + + if total > 0 { + let suffix = gender_suffix(last_gender); + let result = format!("{}{}", total, suffix); + return Some(with_prefix(prefix, &result)); + } + } + + None +} + +/// Extract prefix like "(technically ungrammatical)" from ordinal input +fn extract_prefix(input: &str) -> (Option, &str) { + // Check for parenthesized prefix + if input.starts_with('(') { + if let Some(close) = input.find(')') { + let prefix = &input[..close + 1]; + let rest = input[close + 1..].trim(); + return (Some(prefix.to_string()), rest); + } + } + (None, input) +} + +fn with_prefix(prefix: Option, result: &str) -> String { + if let Some(p) = prefix { + format!("{} {}", p, result) + } else { + result.to_string() + } +} + +fn gender_suffix(gender: char) -> &'static str { + match gender { + 'f' => ".ª", + 'r' => ".ᵉʳ", + _ => ".º", + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_passthrough() { + 
assert_eq!(parse("primero"), Some("primero".to_string())); + assert_eq!(parse("tercera"), Some("tercera".to_string())); + assert_eq!(parse("noveno"), Some("noveno".to_string())); + } + + #[test] + fn test_simple() { + assert_eq!(parse("décimo"), Some("10.º".to_string())); + assert_eq!(parse("undécima"), Some("11.ª".to_string())); + } + + #[test] + fn test_compound() { + assert_eq!(parse("vigésimo primero"), Some("21.º".to_string())); + assert_eq!( + parse("centésimo trigésimo cuarto"), + Some("134.º".to_string()) + ); + } +} diff --git a/src/asr/es/punctuation.rs b/src/asr/es/punctuation.rs new file mode 100644 index 0000000..4991eb3 --- /dev/null +++ b/src/asr/es/punctuation.rs @@ -0,0 +1,34 @@ +//! Punctuation tagger for Spanish. +//! +//! Converts spoken Spanish punctuation words to symbols: +//! - "punto" → "." +//! - "coma" → "," +//! - "signo de interrogación" → "?" + +use lazy_static::lazy_static; + +lazy_static! { + static ref PUNCTUATION: Vec<(&'static str, &'static str)> = vec![ + ("signo de interrogación", "?"), + ("signo de exclamación", "!"), + ("dos puntos", ":"), + ("punto y coma", ";"), + ("punto", "."), + ("coma", ","), + ("guión", "-"), + ]; +} + +/// Parse spoken Spanish punctuation to symbol. +pub fn parse(input: &str) -> Option { + let input_lower = input.to_lowercase(); + let input_trim = input_lower.trim(); + + for &(spoken, symbol) in PUNCTUATION.iter() { + if input_trim == spoken { + return Some(symbol.to_string()); + } + } + + None +} diff --git a/src/asr/es/telephone.rs b/src/asr/es/telephone.rs new file mode 100644 index 0000000..e523a68 --- /dev/null +++ b/src/asr/es/telephone.rs @@ -0,0 +1,225 @@ +//! Telephone tagger for Spanish. +//! +//! Converts spoken Spanish phone number to written form: +//! - "uno dos tres uno dos tres cinco seis siete ocho" → "123-123-5678" +//! - "más uno uno dos tres ..." → "+1-123-123-5678" +//! - "triple tres ..." → "333-..." + +use super::cardinal; + +/// Parse spoken Spanish phone number to written form. 
+pub fn parse(input: &str) -> Option { + let input_lower = input.to_lowercase(); + let input_trim = input_lower.trim(); + + // Must have spaces (multiple words) + if !input_trim.contains(' ') { + return None; + } + + let tokens: Vec<&str> = input_trim.split_whitespace().collect(); + + // Extract extension if present + let (main_tokens, extension) = extract_extension(&tokens); + + // Extract international prefix + let (prefix, digit_tokens) = extract_prefix(main_tokens); + + // Convert tokens to digit groups + let digits = tokens_to_digits(digit_tokens)?; + + if digits.is_empty() { + return None; + } + + // Format the number + let formatted = format_phone_number(&digits)?; + + let mut result = String::new(); + if let Some(p) = prefix { + result.push_str(&format!("+{}-", p)); + } + result.push_str(&formatted); + + if let Some(ext) = extension { + result.push_str(&format!(" ext. {}", ext)); + } + + Some(result) +} + +/// Extract extension: "extensión doce" → (tokens, Some("12")) +fn extract_extension<'a>(tokens: &'a [&'a str]) -> (&'a [&'a str], Option) { + for (i, &t) in tokens.iter().enumerate() { + if t == "extensión" { + let ext_words = &tokens[i + 1..]; + let ext_str = ext_words.join(" "); + if let Some(num) = cardinal::words_to_number(&ext_str) { + return (&tokens[..i], Some(num.to_string())); + } + } + } + (tokens, None) +} + +/// Extract international prefix: "más uno" → (Some("1"), rest) +/// Also handles multi-digit codes: "más cincuenta y cuatro" → (Some("54"), rest) +fn extract_prefix<'a>(tokens: &'a [&'a str]) -> (Option, &'a [&'a str]) { + if tokens.is_empty() { + return (None, tokens); + } + + if tokens[0] == "más" && tokens.len() > 1 { + // Try single digit first: "más uno" → 1 + if let Some(d) = single_digit(tokens[1]) { + return (Some(d.to_string()), &tokens[2..]); + } + + // Try multi-word country code: "más cincuenta y cuatro" → 54 + // Try longest match first (up to 3 tokens), require the rest to start + // with a parseable digit token to avoid 
consuming phone digits + let remaining = &tokens[1..]; + let max_cc = 3.min(remaining.len()); + for end in (1..=max_cc).rev() { + let candidate = remaining[..end].join(" "); + if let Some(num) = cardinal::words_to_number(&candidate) { + let num = num as i64; + if num >= 10 && num <= 999 { + // Verify the next token after the country code is a digit + let after = &remaining[end..]; + if !after.is_empty() + && (single_digit(after[0]).is_some() + || cardinal::words_to_number(after[0]).is_some() + || after[0] == "triple") + { + return (Some(num.to_string()), after); + } + } + } + } + } + + (None, tokens) +} + +/// Convert word tokens to digit groups +fn tokens_to_digits(tokens: &[&str]) -> Option> { + let mut digits = Vec::new(); + let mut i = 0; + + while i < tokens.len() { + let t = tokens[i]; + + // Handle "triple X" → XXX + if t == "triple" && i + 1 < tokens.len() { + let next = tokens[i + 1]; + if let Some(d) = single_digit(next) { + digits.push(d); + digits.push(d); + digits.push(d); + i += 2; + continue; + } + } + + // Try compound number (veintitrés → 23, cincuenta y seis → 56) + // First try multi-word: "cincuenta y seis" (3 tokens) + if i + 2 < tokens.len() && tokens[i + 1] == "y" { + let compound = format!("{} y {}", t, tokens[i + 2]); + if let Some(num) = cardinal::words_to_number(&compound) { + let num = num as u64; + if num >= 10 && num <= 99 { + digits.push((num / 10) as u8); + digits.push((num % 10) as u8); + i += 3; + continue; + } + } + } + + // Single compound word (veintitrés → 23) + if let Some(num) = cardinal::words_to_number(t) { + let num = num as u64; + if num >= 10 && num <= 99 { + digits.push((num / 10) as u8); + digits.push((num % 10) as u8); + } else if num <= 9 { + digits.push(num as u8); + } else { + return None; + } + i += 1; + continue; + } + + // Single digit word + if let Some(d) = single_digit(t) { + digits.push(d); + i += 1; + continue; + } + + return None; + } + + Some(digits) +} + +/// Parse single digit word +fn 
single_digit(word: &str) -> Option { + match word { + "cero" => Some(0), + "uno" | "un" | "una" => Some(1), + "dos" => Some(2), + "tres" => Some(3), + "cuatro" => Some(4), + "cinco" => Some(5), + "seis" => Some(6), + "siete" => Some(7), + "ocho" => Some(8), + "nueve" => Some(9), + _ => None, + } +} + +/// Format phone digits into standard format +fn format_phone_number(digits: &[u8]) -> Option { + let s: String = digits.iter().map(|d| d.to_string()).collect(); + + match digits.len() { + 10 => Some(format!("{}-{}-{}", &s[..3], &s[3..6], &s[6..10])), + 9 => Some(format!("{}-{}-{}", &s[..3], &s[3..6], &s[6..9])), + 8 => Some(format!("{}-{}", &s[..4], &s[4..8])), + 7 => Some(format!("{}-{}", &s[..3], &s[3..7])), + _ => Some(s), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + assert_eq!( + parse("uno dos tres uno dos tres cinco seis siete ocho"), + Some("123-123-5678".to_string()) + ); + } + + #[test] + fn test_international() { + assert_eq!( + parse("más uno uno dos tres uno dos tres cinco seis siete ocho"), + Some("+1-123-123-5678".to_string()) + ); + } + + #[test] + fn test_triple() { + assert_eq!( + parse("triple tres uno dos tres cinco seis siete ocho"), + Some("333-123-5678".to_string()) + ); + } +} diff --git a/src/asr/es/time.rs b/src/asr/es/time.rs new file mode 100644 index 0000000..26ef52c --- /dev/null +++ b/src/asr/es/time.rs @@ -0,0 +1,339 @@ +//! Time tagger for Spanish. +//! +//! Converts spoken Spanish time expressions to written form: +//! - "las dieciséis cincuenta" → "las 16:50" +//! - "la una y cuarto" → "la 1:15" +//! - "las dos menos cuarto" → "la 1:45" +//! - "cuarto para las dos" → "la 1:45" + +use super::cardinal; + +/// Parse spoken Spanish time expression to written form. 
+pub fn parse(input: &str) -> Option { + let input_lower = input.to_lowercase(); + let input_trim = input_lower.trim(); + + // Try "X para las Y" pattern (including "cuarto para las X", "un cuarto para las X") + if let Some(result) = parse_para(input_trim) { + return Some(result); + } + + // Try "X y media de la tarde" (no article output) + if let Some(result) = parse_media_de_la_tarde(input_trim) { + return Some(result); + } + + // Try "la/las X" patterns + if input_trim.starts_with("la ") || input_trim.starts_with("las ") { + return parse_article_time(input_trim); + } + + None +} + +/// Parse "X para las Y" → "las (Y-1):(60-X) Uhr" +fn parse_para(input: &str) -> Option { + // "cuarto para las dos" → "la 1:45" + // "un cuarto para las dos" → "la 1:45" + // "diez para las doce" → "las 11:50" + + let para_pos = input.find(" para las ")?; + let before = &input[..para_pos]; + let after = &input[para_pos + 10..]; // " para las " is 10 chars + + let hour = parse_hour_word(after)?; + let minutes = parse_minutes_before(before)?; + + let (actual_hour, actual_min) = subtract_time(hour, minutes); + + let article = if actual_hour == 1 { "la" } else { "las" }; + Some(format!("{} {}:{:02}", article, actual_hour, actual_min)) +} + +/// Parse "X y media de la tarde" → "X:30 p.m." +fn parse_media_de_la_tarde(input: &str) -> Option { + if !input.ends_with(" de la tarde") { + return None; + } + let before = input[..input.len() - 12].trim(); + + // "dos y media" → hour=2, min=30 + if before.ends_with(" y media") { + let hour_part = before[..before.len() - 8].trim(); + let hour = parse_hour_word(hour_part)?; + return Some(format!("{}:{:02} p.m.", hour, 30)); + } + + None +} + +/// Parse "la/las X ..." 
time patterns +fn parse_article_time(input: &str) -> Option { + let (article, rest) = if input.starts_with("la ") { + ("la", &input[3..]) + } else if input.starts_with("las ") { + ("las", &input[4..]) + } else { + return None; + }; + + // Extract timezone suffix "u t c más X" + let (time_part, tz) = extract_timezone(rest); + + // Extract AM/PM modifier + let (time_part, ampm) = extract_ampm(time_part); + let time_part = time_part.trim(); + + // Extract "de la tarde" → p.m. + let (time_part, de_la) = extract_de_la(time_part); + let time_part = time_part.trim(); + let ampm = ampm.or(de_la); + + // Try "X menos Y" pattern + if let Some(result) = parse_menos(time_part, ampm.as_deref(), tz.as_deref()) { + return Some(result); + } + + // Try "X y cuarto" → X:15 + if time_part.ends_with(" y cuarto") { + let hour_part = &time_part[..time_part.len() - 9]; + let hour = parse_hour_word(hour_part)?; + let out_article = if hour == 1 { "la" } else { article }; + return Some(format_time( + out_article, + hour, + 15, + ampm.as_deref(), + tz.as_deref(), + )); + } + + // Try "X y media" → X:30 + if time_part.ends_with(" y media") { + let hour_part = &time_part[..time_part.len() - 8]; + let hour = parse_hour_word(hour_part)?; + let out_article = if hour == 1 { "la" } else { article }; + return Some(format_time( + out_article, + hour, + 30, + ampm.as_deref(), + tz.as_deref(), + )); + } + + // Try "X y/con MINUTES" → X:MM + for connector in &[" y ", " con "] { + if let Some(c_pos) = time_part.find(connector) { + let hour_part = &time_part[..c_pos]; + let min_part = &time_part[c_pos + connector.len()..]; + + if let Some(hour) = parse_hour_word(hour_part) { + if let Some(minutes) = cardinal::words_to_number(min_part) { + let minutes = minutes as i64; + if minutes <= 59 { + let out_article = if hour == 1 { "la" } else { article }; + return Some(format_time( + out_article, + hour, + minutes, + ampm.as_deref(), + tz.as_deref(), + )); + } + } + } + } + } + + // Try "X MINUTES" (no connector) 
→ X:MM + let tokens: Vec<&str> = time_part.split_whitespace().collect(); + if tokens.len() >= 2 { + // Try to find where hour ends and minutes begin + // First token(s) = hour, remaining = minutes + let hour = parse_hour_word(tokens[0])?; + let min_str = tokens[1..].join(" "); + if let Some(minutes) = cardinal::words_to_number(&min_str) { + let minutes = minutes as i64; + if minutes <= 59 && minutes > 0 { + let out_article = if hour == 1 { "la" } else { article }; + return Some(format_time( + out_article, + hour, + minutes, + ampm.as_deref(), + tz.as_deref(), + )); + } + } + } + + // Try bare hour: "la una" / "las dos" + if tokens.len() == 1 { + // Check if it's actually a time (not "las tres personas") + if parse_hour_word(tokens[0]).is_some() { + // Bare hours with AM/PM should be formatted + if ampm.is_some() { + let hour = parse_hour_word(tokens[0])?; + let out_article = if hour == 1 { "la" } else { article }; + return Some(format_time( + out_article, + hour, + 0, + ampm.as_deref(), + tz.as_deref(), + )); + } + // Bare hours without AM/PM pass through + return None; + } + return None; + } + + None +} + +/// Parse "X menos Y" → subtract Y from X +fn parse_menos(input: &str, ampm: Option<&str>, tz: Option<&str>) -> Option { + let menos_pos = input.find(" menos ")?; + let hour_part = &input[..menos_pos]; + let min_part = &input[menos_pos + 7..]; + + let hour = parse_hour_word(hour_part)?; + let minutes = parse_minutes_before(min_part)?; + + let (actual_hour, actual_min) = subtract_time(hour, minutes); + + let article = if actual_hour == 1 { "la" } else { "las" }; + Some(format_time(article, actual_hour, actual_min, ampm, tz)) +} + +/// Parse minutes for "before" patterns +fn parse_minutes_before(input: &str) -> Option { + let trimmed = input.trim(); + match trimmed { + "cuarto" | "un cuarto" => Some(15), + "media" => Some(30), + _ => cardinal::words_to_number(trimmed).map(|n| n as i64), + } +} + +/// Subtract minutes from hour +fn subtract_time(hour: i64, minutes: 
i64) -> (i64, i64) { + let total_minutes = hour * 60 - minutes; + let actual_hour = total_minutes.div_euclid(60).rem_euclid(24); + let actual_min = total_minutes.rem_euclid(60); + (actual_hour, actual_min) +} + +/// Parse hour word to number +fn parse_hour_word(input: &str) -> Option { + let trimmed = input.trim(); + match trimmed { + "cero" => Some(0), + "una" | "uno" | "un" => Some(1), + _ => cardinal::words_to_number(trimmed).map(|n| n as i64), + } +} + +/// Extract AM/PM: "a eme" → "a.m.", "pe eme" → "p.m." +fn extract_ampm(input: &str) -> (&str, Option) { + let trimmed = input.trim(); + if trimmed.ends_with(" a eme") { + return (&trimmed[..trimmed.len() - 6], Some("a.m.".to_string())); + } + if trimmed.ends_with(" pe eme") { + return (&trimmed[..trimmed.len() - 7], Some("p.m.".to_string())); + } + (trimmed, None) +} + +/// Extract "de la tarde" → "p.m.", "de la mañana" → "a.m." +fn extract_de_la(input: &str) -> (&str, Option) { + let trimmed = input.trim(); + if trimmed.ends_with(" de la tarde") { + return (&trimmed[..trimmed.len() - 12], Some("p.m.".to_string())); + } + if trimmed.ends_with(" de la mañana") { + return (&trimmed[..trimmed.len() - 13], Some("a.m.".to_string())); + } + (trimmed, None) +} + +/// Extract timezone: "u t c más cuatro" → "UTC+4" +fn extract_timezone(input: &str) -> (&str, Option) { + let trimmed = input.trim(); + // "u t c más X" + if let Some(pos) = trimmed.find(" u t c más ") { + let before = &trimmed[..pos]; + let tz_num = &trimmed[pos + 11..]; + if let Some(num) = cardinal::words_to_number(tz_num) { + return (before, Some(format!("UTC+{}", num))); + } + } + if let Some(pos) = trimmed.find(" u t c menos ") { + let before = &trimmed[..pos]; + let tz_num = &trimmed[pos + 13..]; + if let Some(num) = cardinal::words_to_number(tz_num) { + return (before, Some(format!("UTC-{}", num))); + } + } + (trimmed, None) +} + +/// Format time output +fn format_time( + article: &str, + hour: i64, + minutes: i64, + ampm: Option<&str>, + tz: 
Option<&str>, +) -> String { + let time = if minutes == 0 && ampm.is_some() { + format!("{} {}:{:02}", article, hour, minutes) + } else if minutes > 0 { + format!("{} {}:{:02}", article, hour, minutes) + } else { + format!("{} {}", article, hour) + }; + + let time = if let Some(ap) = ampm { + format!("{} {}", time, ap) + } else { + time + }; + + if let Some(tz_str) = tz { + format!("{} {}", time, tz_str) + } else { + time + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_digital() { + assert_eq!( + parse("las dieciséis cincuenta"), + Some("las 16:50".to_string()) + ); + } + + #[test] + fn test_y_cuarto() { + assert_eq!(parse("la una y cuarto"), Some("la 1:15".to_string())); + } + + #[test] + fn test_menos() { + assert_eq!(parse("las dos menos veinte"), Some("la 1:40".to_string())); + } + + #[test] + fn test_para() { + assert_eq!(parse("cuarto para las dos"), Some("la 1:45".to_string())); + } +} diff --git a/src/asr/es/whitelist.rs b/src/asr/es/whitelist.rs new file mode 100644 index 0000000..499110a --- /dev/null +++ b/src/asr/es/whitelist.rs @@ -0,0 +1,83 @@ +//! Whitelist tagger for Spanish. +//! +//! Maps spoken Spanish titles and phrases to abbreviations: +//! - "doctor" → "Dr." +//! - "señor" → "Sr." +//! - "por ejemplo" → "p.ej." +//! - "ustedes" → "Uds." +//! - "estados unidos" → "EE. UU." + +use lazy_static::lazy_static; + +lazy_static! { + /// Whitelist entries ordered longest-first to prevent prefix conflicts + /// (e.g., "ustedes" must match before "usted"). + static ref WHITELIST: Vec<(&'static str, &'static str)> = vec![ + ("estados unidos", "EE. UU."), + ("por ejemplo", "p.ej."), + ("etcétera", "etc."), + ("doctora", "Dra."), + ("doctor", "Dr."), + ("señorita", "Srta."), + ("señora", "Sra."), + ("señor", "Sr."), + ("ustedes", "Uds."), + ("usted", "Ud."), + ]; +} + +/// Parse spoken Spanish whitelist expression. 
+pub fn parse(input: &str) -> Option { + let input_lower = input.to_lowercase(); + let input_trim = input_lower.trim(); + + // Exact match + for &(spoken, abbrev) in WHITELIST.iter() { + if input_trim == spoken { + return Some(abbrev.to_string()); + } + } + + // Word-boundary match within sentences + for &(spoken, abbrev) in WHITELIST.iter() { + if let Some(result) = replace_word_boundary(input_trim, spoken, abbrev) { + return Some(result); + } + } + + None +} + +/// Replace `spoken` with `abbrev` in `input`, respecting word boundaries. +fn replace_word_boundary(input: &str, spoken: &str, abbrev: &str) -> Option { + let mut search_from = 0; + while let Some(pos) = input[search_from..].find(spoken) { + let abs_pos = search_from + pos; + let end_pos = abs_pos + spoken.len(); + + // Check word boundary before + let start_ok = abs_pos == 0 + || input + .as_bytes() + .get(abs_pos - 1) + .map_or(true, |&b| b == b' ' || b == b',' || b == b'.'); + + // Check word boundary after + let end_ok = end_pos == input.len() + || input + .as_bytes() + .get(end_pos) + .map_or(true, |&b| b == b' ' || b == b',' || b == b'.'); + + if start_ok && end_ok { + let mut result = String::new(); + result.push_str(&input[..abs_pos]); + result.push_str(abbrev); + result.push_str(&input[end_pos..]); + return Some(result); + } + + search_from = abs_pos + 1; + } + None +} diff --git a/src/asr/es/word.rs b/src/asr/es/word.rs new file mode 100644 index 0000000..1c467df --- /dev/null +++ b/src/asr/es/word.rs @@ -0,0 +1,8 @@ +//! Word tagger for Spanish. +//! +//! Pass-through module for symmetry with other languages. + +/// Parse is not used directly for Spanish. +pub fn parse(_input: &str) -> Option { + None +} diff --git a/src/asr/fr/cardinal.rs b/src/asr/fr/cardinal.rs new file mode 100644 index 0000000..a4f0305 --- /dev/null +++ b/src/asr/fr/cardinal.rs @@ -0,0 +1,269 @@ +//! Cardinal number tagger for French. +//! +//! Converts spoken French number words to digits: +//! - "un" → "1" +//! 
- "vingt et un" → "21" +//! - "cent vingt-trois" → "123" +//! - "mille deux cent trente-quatre" → "1234" +//! - "moins soixante" → "-60" + +use lazy_static::lazy_static; +use std::collections::HashMap; + +lazy_static! { + /// Single digit and teen numbers + static ref ONES: HashMap<&'static str, i64> = { + let mut m = HashMap::new(); + m.insert("zero", 0); + m.insert("un", 1); + m.insert("une", 1); + m.insert("deux", 2); + m.insert("trois", 3); + m.insert("quatre", 4); + m.insert("cinq", 5); + m.insert("six", 6); + m.insert("sept", 7); + m.insert("huit", 8); + m.insert("neuf", 9); + m.insert("dix", 10); + m.insert("onze", 11); + m.insert("douze", 12); + m.insert("treize", 13); + m.insert("quatorze", 14); + m.insert("quinze", 15); + m.insert("seize", 16); + m + }; + + /// Tens (30, 40, 50, 60) - Note: vingt (20) is handled specially for quatre-vingts + static ref TENS: HashMap<&'static str, i64> = { + let mut m = HashMap::new(); + m.insert("trente", 30); + m.insert("quarante", 40); + m.insert("cinquante", 50); + m.insert("soixante", 60); + // Belgian/Swiss French + m.insert("septante", 70); + m.insert("huitante", 80); + m.insert("octante", 80); + m.insert("nonante", 90); + m + }; + + /// Scale words + static ref SCALES: HashMap<&'static str, i128> = { + let mut m = HashMap::new(); + m.insert("cent", 100); + m.insert("cents", 100); + m.insert("mille", 1_000); + m.insert("million", 1_000_000); + m.insert("millions", 1_000_000); + m.insert("milliard", 1_000_000_000); + m.insert("milliards", 1_000_000_000); + m.insert("billion", 1_000_000_000_000); + m.insert("billions", 1_000_000_000_000); + m.insert("billiard", 1_000_000_000_000_000); + m.insert("billiards", 1_000_000_000_000_000); + m.insert("trillion", 1_000_000_000_000_000_000); + m.insert("trillions", 1_000_000_000_000_000_000); + m + }; +} + +/// Parse spoken French cardinal number to string representation. 
+pub fn parse(input: &str) -> Option { + let input_lower = input.to_lowercase(); + let input_trim = input_lower.trim(); + + if input_trim == "zero" { + return Some("zero".to_string()); + } + + // Don't parse single digit words (0-9) + let single_digits = [ + "un", "une", "deux", "trois", "quatre", "cinq", "six", "sept", "huit", "neuf", + ]; + if single_digits.contains(&input_trim) { + return None; + } + + // Don't parse space-separated simple compounds without scale words or "et" + // E.g. "quarante trois" should not parse, but "vingt et un" and "cent vingt" should + if input_trim.contains(' ') && !contains_scale_word(input_trim) && !input_trim.contains(" et ") + { + // Special case: "moins" + single word (like "moins soixante") + if !input_trim.starts_with("moins ") || input_trim.matches(' ').count() > 1 { + return None; + } + } + + // Check for negative + let (is_negative, rest) = if input_trim.starts_with("moins ") { + (true, input_trim.strip_prefix("moins ")?) + } else { + (false, input_trim) + }; + + let num = words_to_number(rest)?; + + if is_negative { + Some(format!("-{}", num)) + } else { + Some(num.to_string()) + } +} + +/// Check if input contains scale words (cent, mille, million, etc.) 
+fn contains_scale_word(input: &str) -> bool { + let scale_words = [ + "cent", + "cents", + "mille", + "mil", + "million", + "millions", + "milliard", + "milliards", + "billion", + "billions", + "billiard", + "billiards", + "trillion", + "trillions", + ]; + scale_words.iter().any(|&word| input.contains(word)) +} + +pub fn words_to_number(input: &str) -> Option { + // Normalize: remove hyphens, "et" connectors + let normalized = input + .replace("-", " ") + .replace(" et ", " ") + .replace(" ", " "); + + let tokens: Vec<&str> = normalized.split_whitespace().collect(); + if tokens.is_empty() { + return None; + } + + let mut result: i128 = 0; + let mut current: i128 = 0; + let mut last_val: i128 = 0; // Track last value added for "quatre-vingt" handling + + for token in tokens { + // Check if it's a scale word + if let Some(&scale) = SCALES.get(token) { + if scale == 100 { + // "cent" multiplies current or assumes 1 + if current == 0 { + current = 100; + } else { + current *= 100; + } + last_val = 0; + } else { + // "mille", "million", etc. 
+ if current == 0 { + current = 1; // "mille" = 1000, not 0 + } + result += current * scale; + current = 0; + last_val = 0; + } + } else if let Some(&val) = ONES.get(token) { + current += val as i128; + last_val = val as i128; + } else if let Some(&val) = TENS.get(token) { + current += val as i128; + last_val = val as i128; + } else if token == "dix" { + // Special handling for "soixante-dix" (70), "quatre-vingt-dix" (90) + current += 10; + last_val = 10; + } else if token == "vingts" || token == "vingt" { + // "quatre-vingts" = 4 * 20, check LAST value added, not total current + if last_val >= 2 && last_val <= 4 { + // Remove the last value and multiply by 20 + current = current - last_val + (last_val * 20); + last_val = last_val * 20; + } else { + current += 20; + last_val = 20; + } + } else { + return None; // Unknown word + } + } + + result += current; + + if result == 0 { + None + } else { + Some(result) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + assert_eq!(parse("zero"), Some("zero".to_string())); + // Single-digit words ("un", "deux", etc.) 
intentionally return None + // to avoid over-matching in sentence context + assert_eq!(parse("un"), None); + assert_eq!(parse("deux"), None); + assert_eq!(parse("dix"), Some("10".to_string())); + assert_eq!(parse("seize"), Some("16".to_string())); + } + + #[test] + fn test_tens() { + assert_eq!(parse("vingt"), Some("20".to_string())); + assert_eq!(parse("vingt et un"), Some("21".to_string())); + assert_eq!(parse("vingt-deux"), Some("22".to_string())); + assert_eq!(parse("trente"), Some("30".to_string())); + } + + #[test] + fn test_special() { + assert_eq!(parse("soixante-dix"), Some("70".to_string())); + assert_eq!(parse("quatre-vingts"), Some("80".to_string())); + assert_eq!(parse("quatre-vingt-dix"), Some("90".to_string())); + assert_eq!(parse("quatre-vingt-dix-neuf"), Some("99".to_string())); + } + + #[test] + fn test_hundreds() { + assert_eq!(parse("cent"), Some("100".to_string())); + assert_eq!(parse("deux cents"), Some("200".to_string())); + assert_eq!(parse("deux cent vingt"), Some("220".to_string())); + } + + #[test] + fn test_thousands() { + assert_eq!(parse("mille"), Some("1000".to_string())); + assert_eq!(parse("deux mille"), Some("2000".to_string())); + assert_eq!(parse("deux mille vingt-cinq"), Some("2025".to_string())); + } + + #[test] + fn test_large() { + assert_eq!(parse("un million"), Some("1000000".to_string())); + assert_eq!(parse("deux millions trois"), Some("2000003".to_string())); + } + + #[test] + fn test_negative() { + assert_eq!(parse("moins quarante-deux"), Some("-42".to_string())); + assert_eq!(parse("moins mille"), Some("-1000".to_string())); + } + + #[test] + fn test_invalid() { + assert_eq!(parse("hello"), None); + assert_eq!(parse(""), None); + } +} diff --git a/src/asr/fr/date.rs b/src/asr/fr/date.rs new file mode 100644 index 0000000..6a37e5d --- /dev/null +++ b/src/asr/fr/date.rs @@ -0,0 +1,138 @@ +//! Date tagger for French. +//! +//! Converts spoken French date expressions to written form: +//! 
- "cinq janvier deux mille vingt-cinq" → "5 janvier 2025" +//! - "premier janvier" → "1er janvier" +//! - "quatorze juillet" → "14 juillet" + +use super::cardinal::words_to_number; + +/// French month names +const MONTHS: [&str; 12] = [ + "janvier", + "février", + "mars", + "avril", + "mai", + "juin", + "juillet", + "août", + "septembre", + "octobre", + "novembre", + "décembre", +]; + +/// Parse spoken French date expression to written form. +pub fn parse(input: &str) -> Option { + let input_lower = input.trim().to_lowercase(); + + // Try day + month + year pattern + if let Some(result) = parse_day_month_year(&input_lower) { + return Some(result); + } + + // Try day + month pattern (no year) + if let Some(result) = parse_day_month(&input_lower) { + return Some(result); + } + + None +} + +/// Parse "X month year" pattern +fn parse_day_month_year(input: &str) -> Option { + // Find month in the input + for month in &MONTHS { + if let Some(month_pos) = input.find(month) { + let day_part = &input[..month_pos].trim(); + let after_month = &input[month_pos + month.len()..].trim(); + + // Parse day + let day_str = if day_part == &"premier" || day_part == &"première" { + "1ᵉʳ".to_string() + } else if let Some(day_num) = words_to_number(day_part) { + (day_num as i64).to_string() + } else { + return None; + }; + + // Parse year if present + if !after_month.is_empty() { + let year = words_to_number(after_month)? 
as i64; + return Some(format!("{} {} {}", day_str, month, year)); + } else { + return Some(format!("{} {}", day_str, month)); + } + } + } + + None +} + +/// Parse "X month" pattern (no year) +fn parse_day_month(input: &str) -> Option { + // Find month in the input + for month in &MONTHS { + if input.contains(month) { + let parts: Vec<&str> = input.split(month).collect(); + if parts.len() == 2 && parts[1].trim().is_empty() { + let day_part = parts[0].trim(); + + // Parse day + let day_str = if day_part == "premier" || day_part == "première" { + "1ᵉʳ".to_string() + } else if let Some(day_num) = words_to_number(day_part) { + (day_num as i64).to_string() + } else { + return None; + }; + + return Some(format!("{} {}", day_str, month)); + } + } + } + + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_day_month_year() { + assert_eq!( + parse("cinq janvier deux mille vingt-cinq"), + Some("5 janvier 2025".to_string()) + ); + assert_eq!( + parse("quatorze juillet deux mille"), + Some("14 juillet 2000".to_string()) + ); + } + + #[test] + fn test_day_month() { + assert_eq!(parse("quatorze juillet"), Some("14 juillet".to_string())); + assert_eq!( + parse("vingt-cinq décembre"), + Some("25 décembre".to_string()) + ); + } + + #[test] + fn test_premier() { + assert_eq!(parse("premier janvier"), Some("1ᵉʳ janvier".to_string())); + assert_eq!( + parse("premier mai deux mille vingt"), + Some("1ᵉʳ mai 2020".to_string()) + ); + } + + #[test] + fn test_invalid() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("vingt"), None); + } +} diff --git a/src/asr/fr/decimal.rs b/src/asr/fr/decimal.rs new file mode 100644 index 0000000..d7b40e5 --- /dev/null +++ b/src/asr/fr/decimal.rs @@ -0,0 +1,285 @@ +//! Decimal number tagger for French. +//! +//! Converts spoken French decimal numbers to written form: +//! - "trois virgule un quatre" → "3,14" +//! - "zero virgule cinq" → "0,5" +//! - "huit cent dix-huit virgule trois zéro trois" → "818,303" +//! 
- "mille-huit-cent-dix-huit virgule trois zéro trois trois quatre" → "1 818,303 34" + +use super::cardinal::words_to_number; + +/// Parse spoken French decimal expression to written form. +pub fn parse(input: &str) -> Option { + let original = input.trim(); + let input_lower = original.to_lowercase(); + + // Check for scale suffix (million, milliard, etc.) + if let Some(result) = parse_with_scale(original, &input_lower) { + return Some(result); + } + + // Check for "virgule" decimal + if let Some(result) = parse_virgule_decimal(&input_lower) { + return Some(result); + } + + None +} + +/// Parse numbers with scale words (million, milliard, billion, etc.) +fn parse_with_scale(original: &str, input_lower: &str) -> Option { + let scales = [ + "trillions", + "trillion", + "billiards", + "billiard", + "billions", + "billion", + "milliards", + "milliard", + "millions", + "million", + "mille", + ]; + + for scale in &scales { + if input_lower.ends_with(scale) { + let num_part = input_lower[..input_lower.len() - scale.len()].trim(); + + // Extract original scale word to preserve casing + let orig_scale = &original[original.len() - scale.len()..]; + + // Check if it has a decimal point + if num_part.contains(" virgule ") || num_part.contains("virgule ") { + let decimal = parse_virgule_decimal(num_part)?; + return Some(format!("{} {}", decimal, orig_scale)); + } + + // Plain number with scale + let num = parse_integer_part(num_part)?; + return Some(format!("{} {}", num, orig_scale)); + } + } + + None +} + +/// Parse "X virgule Y" decimal pattern +fn parse_virgule_decimal(input: &str) -> Option { + // Handle negative + let (is_negative, rest) = if input.starts_with("moins ") { + (true, input.strip_prefix("moins ")?) + } else { + (false, input) + }; + + // Handle "virgule X" (no integer part, e.g., "virgule cinq" → ",5") + let (integer_str, decimal_str) = if rest.starts_with("virgule ") { + ("", rest.strip_prefix("virgule ")?) 
+ } else if rest.contains(" virgule ") { + let parts: Vec<&str> = rest.splitn(2, " virgule ").collect(); + if parts.len() != 2 { + return None; + } + (parts[0], parts[1]) + } else { + return None; + }; + + // Integer part + let integer_part = if integer_str.is_empty() { + String::new() + } else { + let n = parse_integer_part(integer_str)?; + format_with_spaces(n) + }; + + // Decimal part - parse as individual digits, with compound number support + let decimal_raw = parse_decimal_digits(decimal_str)?; + + // Format decimal part with space separators (groups of 3 from left) + let decimal_part = format_decimal_with_spaces(&decimal_raw); + + let sign = if is_negative { "-" } else { "" }; + + if integer_part.is_empty() { + Some(format!("{},{}", sign, decimal_part)) + } else { + Some(format!("{}{},{}", sign, integer_part, decimal_part)) + } +} + +/// Parse integer part from words, handling both space-separated and hyphenated forms +fn parse_integer_part(input: &str) -> Option { + let normalized = input.trim(); + if normalized.is_empty() { + return None; + } + + // Handle "zéro"/"zero" + let lower = normalized.to_lowercase(); + if lower == "zéro" || lower == "zero" { + return Some(0); + } + + words_to_number(&lower).map(|n| n as i64) +} + +/// Format number with French space separators for thousands +fn format_with_spaces(n: i64) -> String { + let abs_n = n.unsigned_abs(); + let s = abs_n.to_string(); + + if s.len() <= 3 { + return if n < 0 { format!("-{}", s) } else { s }; + } + + let mut result = String::new(); + let chars: Vec = s.chars().collect(); + let len = chars.len(); + + for (i, &c) in chars.iter().enumerate() { + if i > 0 && (len - i) % 3 == 0 { + result.push(' '); + } + result.push(c); + } + + if n < 0 { + format!("-{}", result) + } else { + result + } +} + +/// Format decimal digits with space separators (groups of 3 from left) +/// "2400" → "240 0", "303" → "303", "30334" → "303 34" +fn format_decimal_with_spaces(digits: &str) -> String { + if digits.len() 
<= 3 { + return digits.to_string(); + } + + let mut result = String::new(); + for (i, c) in digits.chars().enumerate() { + if i > 0 && i % 3 == 0 { + result.push(' '); + } + result.push(c); + } + result +} + +/// Parse decimal digits: "un quatre" → "14", "zéro cinq" → "05" +/// Each word is independently converted to its digit value: +/// - "trente" → "30", "trois" → "3", so "trente trois" → "303" +/// - "vingt-huit" → "28" (hyphenated compound = single token) +fn parse_decimal_digits(input: &str) -> Option { + let tokens: Vec<&str> = input.split_whitespace().collect(); + let mut result = String::new(); + + for token in tokens { + // Try single digit word first + if let Some(digit) = digit_word_to_char(token) { + result.push(digit); + continue; + } + + // Try as a compound number (single token, possibly hyphenated) + if let Some(num) = words_to_number(token) { + let num = num as i64; + if num >= 0 { + for c in num.to_string().chars() { + result.push(c); + } + continue; + } + } + + return None; + } + + if result.is_empty() { + None + } else { + Some(result) + } +} + +/// Convert single digit word to char +fn digit_word_to_char(word: &str) -> Option { + match word { + "zéro" | "zero" => Some('0'), + "un" | "une" => Some('1'), + "deux" => Some('2'), + "trois" => Some('3'), + "quatre" => Some('4'), + "cinq" => Some('5'), + "six" => Some('6'), + "sept" => Some('7'), + "huit" => Some('8'), + "neuf" => Some('9'), + _ => None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_simple_decimal() { + assert_eq!(parse("trois virgule un quatre"), Some("3,14".to_string())); + assert_eq!(parse("zero virgule cinq"), Some("0,5".to_string())); + assert_eq!(parse("zero virgule deux six"), Some("0,26".to_string())); + } + + #[test] + fn test_virgule_only() { + assert_eq!(parse("virgule cinq"), Some(",5".to_string())); + assert_eq!(parse("virgule zero deux"), Some(",02".to_string())); + } + + #[test] + fn test_negative() { + assert_eq!( + parse("moins soixante 
virgule deux quatre zéro zéro"), + Some("-60,240 0".to_string()) + ); + } + + #[test] + fn test_compound_integer() { + assert_eq!( + parse("huit cent dix-huit virgule trois zéro trois"), + Some("818,303".to_string()) + ); + assert_eq!( + parse("huit-cent-dix-huit virgule trois zéro trois"), + Some("818,303".to_string()) + ); + } + + #[test] + fn test_large_with_spaces() { + assert_eq!( + parse("mille-huit-cent-dix-huit virgule trois zéro trois trois quatre"), + Some("1 818,303 34".to_string()) + ); + } + + #[test] + fn test_with_scale() { + assert_eq!( + parse("cinq virgule deux millions"), + Some("5,2 millions".to_string()) + ); + assert_eq!( + parse("cinquante milliards"), + Some("50 milliards".to_string()) + ); + assert_eq!( + parse("zéro virgule deux million"), + Some("0,2 million".to_string()) + ); + } +} diff --git a/src/asr/fr/electronic.rs b/src/asr/fr/electronic.rs new file mode 100644 index 0000000..af461b3 --- /dev/null +++ b/src/asr/fr/electronic.rs @@ -0,0 +1,148 @@ +//! Electronic tagger for French. +//! +//! Converts spoken French electronic addresses to written form: +//! - "test arobase gmail point com" → "test@gmail.com" +//! - "a b c at g mail point com" → "abc@gmail.com" +//! - Handles both "arobase" (French) and "at" (English) for @ +//! - Converts digit words to digits: "un" → "1", "trois" → "3" + +/// Parse spoken French electronic address to written form. 
+pub fn parse(input: &str) -> Option { + let input_lower = input.trim().to_lowercase(); + + parse_email(&input_lower) +} + +/// Parse email address pattern +fn parse_email(input: &str) -> Option { + // Look for "arobase" or "at" as the @ indicator + let (local_raw, domain_raw) = if input.contains(" arobase ") { + let parts: Vec<&str> = input.splitn(2, " arobase ").collect(); + if parts.len() != 2 { + return None; + } + (parts[0].trim(), parts[1].trim()) + } else if input.contains(" at ") { + let parts: Vec<&str> = input.splitn(2, " at ").collect(); + if parts.len() != 2 { + return None; + } + (parts[0].trim(), parts[1].trim()) + } else { + return None; + }; + + let local_part = convert_email_part(local_raw); + let domain_part = convert_email_part(domain_raw); + + if local_part.is_empty() || domain_part.is_empty() { + return None; + } + + Some(format!("{}@{}", local_part, domain_part)) +} + +/// Convert email part: +/// - "point" → "." +/// - "tiret" → "-" +/// - single letter words are concatenated: "a b c" → "abc" +/// - digit words are converted: "un" → "1", "deux" → "2" +/// - multi-letter words are kept as-is and concatenated +fn convert_email_part(input: &str) -> String { + let tokens: Vec<&str> = input.split_whitespace().collect(); + let mut result = String::new(); + let mut need_concat = true; // letters/words are concatenated + + for token in tokens { + if token == "point" { + result.push('.'); + need_concat = true; + } else if token == "tiret" { + result.push('-'); + need_concat = true; + } else if token == "tiret du bas" || token == "sous-tiret" || token == "underscore" { + result.push('_'); + need_concat = true; + } else if let Some(d) = word_to_digit(token) { + result.push(char::from(b'0' + d)); + } else { + // Regular word or letter — concatenate directly + if need_concat { + result.push_str(token); + need_concat = false; + } else { + result.push_str(token); + } + } + } + + result +} + +/// Convert digit word to digit +fn word_to_digit(word: &str) -> 
Option { + match word { + "zéro" | "zero" => Some(0), + "un" | "une" => Some(1), + "deux" => Some(2), + "trois" => Some(3), + "quatre" => Some(4), + "cinq" => Some(5), + "six" => Some(6), + "sept" => Some(7), + "huit" => Some(8), + "neuf" => Some(9), + _ => None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_simple_email_arobase() { + assert_eq!( + parse("test arobase gmail point com"), + Some("test@gmail.com".to_string()) + ); + } + + #[test] + fn test_email_with_at() { + assert_eq!( + parse("a b c at g mail point com"), + Some("abc@gmail.com".to_string()) + ); + } + + #[test] + fn test_email_with_digits() { + assert_eq!( + parse("a un b deux arobase a b c point com"), + Some("a1b2@abc.com".to_string()) + ); + } + + #[test] + fn test_email_with_dots() { + assert_eq!( + parse("a b trois point s d d point trois arobase g mail point com"), + Some("ab3.sdd.3@gmail.com".to_string()) + ); + } + + #[test] + fn test_email_with_dash() { + assert_eq!( + parse("jean tiret luc arobase example point com"), + Some("jean-luc@example.com".to_string()) + ); + } + + #[test] + fn test_invalid() { + assert_eq!(parse("test gmail dot com"), None); // No arobase or at + assert_eq!(parse("arobase"), None); // Missing parts + } +} diff --git a/src/asr/fr/measure.rs b/src/asr/fr/measure.rs new file mode 100644 index 0000000..af7cac9 --- /dev/null +++ b/src/asr/fr/measure.rs @@ -0,0 +1,313 @@ +//! Measure tagger for French. +//! +//! Converts spoken French measurements to written form: +//! - "deux cents mètres" → "200 m" +//! - "dix-huit virgule cinq kilomètres" → "18,5 km" +//! - "cent kilomètres par heure" → "100 km/h" +//! - "soixante-cinq kilomètres carrés" → "65 km²" +//! - "deux mètres cubes" → "2 m³" + +use super::cardinal::words_to_number; +use super::decimal; + +/// Parse spoken French measurement expression to written form. 
+pub fn parse(input: &str) -> Option { + let input_lower = input.to_lowercase(); + let input_trimmed = input_lower.trim(); + + // Try rate units first (X par Y): "par kilomètre carré", "par mètre carré" + if let Some(result) = parse_rate_unit(input_trimmed) { + return Some(result); + } + + // Try compound units: "kilomètres par heure", "mètres par seconde", "kilomètres heure" + if let Some(result) = parse_compound_unit(input_trimmed) { + return Some(result); + } + + // Try simple unit with modifiers (carrés, cubes) + if let Some(result) = parse_simple_unit(input_trimmed) { + return Some(result); + } + + None +} + +/// Parse rate expressions: "X par kilomètre carré" → "X /km²" +fn parse_rate_unit(input: &str) -> Option { + let rate_units = [ + (" par kilomètre carré", "/km²"), + (" par mètre carré", "/m²"), + (" par mètre cube", "/m³"), + (" par kilomètre", "/km"), + (" par mètre", "/m"), + (" par seconde", "/s"), + (" par heure", "/h"), + (" par minute", "/min"), + (" par litre", "/l"), + ]; + + for (spoken, symbol) in &rate_units { + if input.ends_with(spoken) { + let num_part = input.strip_suffix(spoken)?.trim(); + let num_value = parse_number_value(num_part)?; + return Some(format!("{} {}", num_value, symbol)); + } + } + + None +} + +/// Parse compound units like "kilomètres par heure" → "km/h" +fn parse_compound_unit(input: &str) -> Option { + let compound_units = [ + (" kilomètres par heure", "km/h"), + (" kilomètre par heure", "km/h"), + (" kilomètres heure", "km/h"), + (" kilomètre heure", "km/h"), + (" mètres par seconde", "m/s"), + (" mètre par seconde", "m/s"), + ]; + + for (spoken, symbol) in &compound_units { + if input.ends_with(spoken) { + let num_part = input.strip_suffix(spoken)?.trim(); + let num_value = parse_number_value(num_part)?; + return Some(format!("{} {}", num_value, symbol)); + } + } + + None +} + +/// Parse simple measurement: number + unit (with optional modifier carré/cube) +fn parse_simple_unit(input: &str) -> Option { + let (value, 
unit) = parse_number_and_unit(input)?; + Some(format!("{} {}", value, unit)) +} + +/// Parse number and unit from input +fn parse_number_and_unit(input: &str) -> Option<(String, String)> { + // Handle negative + let (is_negative, rest) = if input.starts_with("moins ") { + (true, input.strip_prefix("moins ")?) + } else { + (false, input) + }; + + // Try to find unit at the end + let (num_part, unit_symbol) = extract_unit(rest)?; + + // Parse the number part + let num_value = parse_number_value(num_part.trim())?; + + let sign = if is_negative { "-" } else { "" }; + Some((format!("{}{}", sign, num_value), unit_symbol)) +} + +/// Extract unit from end of string (includes modifier handling) +fn extract_unit(input: &str) -> Option<(&str, String)> { + // Try units with modifiers first (most specific) + for (spoken, symbol) in get_modifier_unit_mappings() { + if input.ends_with(spoken) { + let num_part = input.strip_suffix(spoken)?.trim(); + return Some((num_part, symbol.to_string())); + } + } + + // Then simple units + for (spoken, symbol) in get_unit_mappings() { + if input.ends_with(spoken) { + let num_part = input.strip_suffix(spoken)?.trim(); + return Some((num_part, symbol.to_string())); + } + } + + None +} + +/// Parse number value (handles both cardinal and decimal) +fn parse_number_value(input: &str) -> Option { + if input.is_empty() { + return None; + } + + // Handle "zéro"/"zero" + if input == "zéro" || input == "zero" { + return Some("0".to_string()); + } + + // Try decimal first (has "virgule") + if input.contains("virgule") { + return decimal::parse(input); + } + + // Cardinal number + let num = words_to_number(input)?; + let n = num as i64; + + // Format large numbers with spaces + Some(format_with_spaces(n)) +} + +/// Format number with French space separators for thousands +fn format_with_spaces(n: i64) -> String { + let abs_n = n.unsigned_abs(); + let s = abs_n.to_string(); + + if s.len() <= 3 { + return if n < 0 { format!("-{}", s) } else { s }; + } + + 
let mut result = String::new(); + let chars: Vec = s.chars().collect(); + let len = chars.len(); + + for (i, &c) in chars.iter().enumerate() { + if i > 0 && (len - i) % 3 == 0 { + result.push(' '); + } + result.push(c); + } + + if n < 0 { + format!("-{}", result) + } else { + result + } +} + +/// Unit mappings with modifiers (squared, cubed) +fn get_modifier_unit_mappings() -> Vec<(&'static str, &'static str)> { + vec![ + // Squared/Cubed variants (must be before simple) + (" kilomètres carrés", "km²"), + (" kilomètre carré", "km²"), + (" mètres carrés", "m²"), + (" mètre carré", "m²"), + (" centimètres carrés", "cm²"), + (" centimètre carré", "cm²"), + (" mètres cubes", "m³"), + (" mètre cube", "m³"), + (" centimètres cubes", "cm³"), + (" centimètre cube", "cm³"), + ] +} + +/// Get French unit mappings (spoken -> symbol) +fn get_unit_mappings() -> Vec<(&'static str, &'static str)> { + vec![ + // Distance/Length (plural and singular) + (" kilomètres", "km"), + (" kilomètre", "km"), + (" mètres", "m"), + (" mètre", "m"), + (" centimètres", "cm"), + (" centimètre", "cm"), + (" millimètres", "mm"), + (" millimètre", "mm"), + (" micromètres", "µm"), + (" micromètre", "µm"), + // Mass/Weight + (" kilogrammes", "kg"), + (" kilogramme", "kg"), + (" grammes", "g"), + (" gramme", "g"), + (" tonnes", "t"), + (" tonne", "t"), + // Volume + (" litres", "l"), + (" litre", "l"), + (" millilitres", "ml"), + (" millilitre", "ml"), + // Time + (" heures", "h"), + (" heure", "h"), + (" minutes", "min"), + (" minute", "min"), + (" secondes", "s"), + (" seconde", "s"), + // Temperature + (" degrés celsius", "°C"), + (" degré celsius", "°C"), + (" degrés", "°"), + (" degré", "°"), + // Data + (" gigaoctets", "Go"), + (" gigaoctet", "Go"), + (" mégaoctets", "Mo"), + (" mégaoctet", "Mo"), + (" kilooctets", "Ko"), + (" kilooctet", "Ko"), + // Power + (" kilowatts", "kW"), + (" kilowatt", "kW"), + (" watts", "W"), + (" watt", "W"), + // Percentage + (" pourcent", "%"), + ] +} + 
+#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_distance() { + assert_eq!(parse("cent mètres"), Some("100 m".to_string())); + assert_eq!(parse("cinq kilomètres"), Some("5 km".to_string())); + assert_eq!(parse("trois cents micromètres"), Some("300 µm".to_string())); + } + + #[test] + fn test_speed() { + assert_eq!( + parse("cent kilomètres par heure"), + Some("100 km/h".to_string()) + ); + assert_eq!( + parse("deux-cents kilomètres heure"), + Some("200 km/h".to_string()) + ); + } + + #[test] + fn test_squared_cubed() { + assert_eq!( + parse("soixante-cinq kilomètres carrés"), + Some("65 km²".to_string()) + ); + assert_eq!(parse("deux mètres cubes"), Some("2 m³".to_string())); + } + + #[test] + fn test_rate() { + assert_eq!( + parse("cinquante-six virgule trois par kilomètre carré"), + Some("56,3 /km²".to_string()) + ); + } + + #[test] + fn test_weight() { + assert_eq!(parse("deux kilogrammes"), Some("2 kg".to_string())); + assert_eq!(parse("cinquante grammes"), Some("50 g".to_string())); + } + + #[test] + fn test_negative() { + assert_eq!( + parse("moins soixante-six kilogrammes"), + Some("-66 kg".to_string()) + ); + } + + #[test] + fn test_decimal_measure() { + assert_eq!( + parse("dix-huit virgule cinq kilomètres"), + Some("18,5 km".to_string()) + ); + } +} diff --git a/src/asr/fr/mod.rs b/src/asr/fr/mod.rs new file mode 100644 index 0000000..a937e22 --- /dev/null +++ b/src/asr/fr/mod.rs @@ -0,0 +1,19 @@ +//! Inverse Text Normalization taggers for French. +//! +//! Converts spoken-form French to written form: +//! - "deux cents" → "200" +//! - "cinq euros et cinquante centimes" → "5,50 €" +//! 
- "cinq janvier deux mille vingt-cinq" → "5 janvier 2025" + +pub mod cardinal; +pub mod date; +pub mod decimal; +pub mod electronic; +pub mod measure; +pub mod money; +pub mod ordinal; +pub mod punctuation; +pub mod telephone; +pub mod time; +pub mod whitelist; +pub mod word; diff --git a/src/asr/fr/money.rs b/src/asr/fr/money.rs new file mode 100644 index 0000000..6ce3976 --- /dev/null +++ b/src/asr/fr/money.rs @@ -0,0 +1,320 @@ +//! Money tagger for French. +//! +//! Converts spoken French currency expressions to written form: +//! - "cinq euros" → "5 €" +//! - "cinq euros et cinquante centimes" → "5,50 €" +//! - "cinquante centimes" → "0,50 €" +//! - "un euro" → "1 €" +//! - "deux dollars vingt" → "2,20 $" +//! - "quatre-vingt mille won" → "80 000 ₩" +//! - "deux-millions de dollars" → "2 millions de dollars" + +use super::cardinal::words_to_number; + +/// Currency definition +struct Currency { + /// Main unit words (plural, singular) + main_words: &'static [&'static str], + /// Symbol + symbol: &'static str, + /// Cent/subunit words + cent_words: &'static [&'static str], + /// Whether cents are represented as fraction of main unit + cent_is_fraction: bool, +} + +const CURRENCIES: &[Currency] = &[ + Currency { + main_words: &["euros", "euro"], + symbol: "€", + cent_words: &["centimes", "centime"], + cent_is_fraction: true, + }, + Currency { + main_words: &["dollars", "dollar"], + symbol: "$", + cent_words: &[], // "cent(s)" conflicts with French number word for 100 + cent_is_fraction: false, + }, + Currency { + main_words: &["livres", "livre"], + symbol: "£", + cent_words: &["pence"], + cent_is_fraction: true, + }, + Currency { + main_words: &["francs suisses", "franc suisse"], + symbol: "CHF", + cent_words: &["centimes", "centime"], + cent_is_fraction: true, + }, + Currency { + main_words: &["wons", "won"], + symbol: "₩", + cent_words: &[], + cent_is_fraction: false, + }, + Currency { + main_words: &["yens", "yen"], + symbol: "¥", + cent_words: &[], + 
cent_is_fraction: false, + }, +]; + +/// Parse spoken French money expression to written form. +pub fn parse(input: &str) -> Option { + let input_lower = input.trim().to_lowercase(); + + // Check for scale expressions first: "X-millions de dollars" → "X millions de dollars" + if let Some(result) = parse_scale_currency(&input_lower) { + return Some(result); + } + + // Try each currency + for currency in CURRENCIES { + if let Some(result) = try_currency(&input_lower, currency) { + return Some(result); + } + } + + None +} + +/// Parse scale expressions: "deux-millions de dollars" → "2 millions de dollars" +/// "quatre virgule quatre-vingt milliards d'euros" → "4,80 milliards d'euros" +fn parse_scale_currency(input: &str) -> Option { + let scale_words = [ + "trillions", + "trillion", + "billiards", + "billiard", + "billions", + "billion", + "milliards", + "milliard", + "millions", + "million", + ]; + + // Normalize hyphens around scale words to spaces for matching + let mut normalized = input.to_string(); + for &scale in &scale_words { + let hyphen_pattern = format!("-{}", scale); + let space_pattern = format!(" {}", scale); + normalized = normalized.replace(&hyphen_pattern, &space_pattern); + } + + for &scale in &scale_words { + // Pattern: "X scale de CURRENCY" or "X scale d'CURRENCY" + let de_pattern = format!(" {} de ", scale); + let d_pattern = format!(" {} d'", scale); + let d_pattern_curly = format!(" {} d\u{2019}", scale); // right single quote + + for pattern in &[&de_pattern, &d_pattern, &d_pattern_curly] { + if let Some(scale_pos) = normalized.find(pattern.as_str()) { + let num_part = &normalized[..scale_pos]; + // Parse the number + let num_str = parse_money_number(num_part)?; + // Return with scale and currency name preserved + let suffix = &normalized[scale_pos + 1..]; // "millions de dollars" + return Some(format!("{} {}", num_str, suffix)); + } + } + } + + None +} + +/// Try to parse with a specific currency +fn try_currency(input: &str, currency: 
&Currency) -> Option { + // Try "X MAIN et Y CENT" pattern + for &main_word in currency.main_words { + let et_pattern = format!(" {} et ", main_word); + if let Some(main_pos) = input.find(&et_pattern) { + let num_part = &input[..main_pos]; + let cent_part = &input[main_pos + et_pattern.len()..]; + + // Check if cent_part ends with a cent word + for ¢_word in currency.cent_words { + if cent_part.ends_with(cent_word) { + let cent_num_part = cent_part.strip_suffix(cent_word)?.trim(); + let main_num = parse_money_number(num_part)?; + let cent_num = parse_money_number(cent_num_part)?; + return Some(format!("{},{:0>2} {}", main_num, cent_num, currency.symbol)); + } + } + + // "cinq euro et soixante" → "5,60 €" (cent amount without cent word) + if let Some(cent_num) = parse_money_number(cent_part) { + return Some(format!( + "{},{:0>2} {}", + parse_money_number(num_part)?, + cent_num, + currency.symbol + )); + } + } + + // Try "X MAIN Y" pattern (no "et", cents implied by second number) + // "vingt euro cinq" → "20,05 €", "deux dollars vingt" → "2,20 $" + let main_pattern = format!(" {} ", main_word); + if let Some(main_pos) = input.find(&main_pattern) { + let num_part = &input[..main_pos]; + let after_main = &input[main_pos + main_pattern.len()..]; + + // The part after the main word should be a cent value + if !after_main.is_empty() { + if let Some(main_num) = parse_money_number(num_part) { + if let Some(cent_num) = parse_money_number(after_main) { + return Some(format!("{},{:0>2} {}", main_num, cent_num, currency.symbol)); + } + } + } + } + + // Try "X MAIN" pattern (main unit only, at end of string) + let end_pattern = format!(" {}", main_word); + if input.ends_with(&end_pattern) { + let num_part = input.strip_suffix(&end_pattern)?.trim(); + let main_num = parse_money_number(num_part)?; + return Some(format!("{} {}", main_num, currency.symbol)); + } + } + + // Try cent-only pattern: "X CENT_WORD" → "0,XX SYMBOL" + // Only match if cent value is ≤99 (avoids "mille cent" 
= 1100 being parsed as $10.00) + if currency.cent_is_fraction { + for ¢_word in currency.cent_words { + let end_pattern = format!(" {}", cent_word); + if input.ends_with(&end_pattern) { + let num_part = input.strip_suffix(&end_pattern)?.trim(); + // Validate the number before "cent(s)" is a small cents amount + if let Some(num) = words_to_number(&num_part.to_lowercase()) { + let n = num as i64; + if n >= 0 && n <= 99 { + return Some(format!("0,{:0>2} {}", n, currency.symbol)); + } + } + // If > 99 or not parseable, skip (probably "mille cent" = 1100) + } + } + } + + None +} + +/// Parse number from money context (handles "zéro" and compound numbers) +fn parse_money_number(input: &str) -> Option { + let trimmed = input.trim(); + if trimmed.is_empty() { + return None; + } + + let lower = trimmed.to_lowercase(); + + // Handle "zéro" + if lower == "zéro" || lower == "zero" { + return Some("0".to_string()); + } + + // Handle decimal: "quatre virgule quatre-vingt" → "4,80" + if lower.contains(" virgule ") || lower.contains("virgule ") { + return super::decimal::parse(&lower); + } + + let num = words_to_number(&lower)?; + let n = num as i64; + + // Format with French space separators for large numbers + Some(format_with_spaces(n)) +} + +/// Format number with French space separators +fn format_with_spaces(n: i64) -> String { + let abs_n = n.unsigned_abs(); + let s = abs_n.to_string(); + + if s.len() <= 3 { + return if n < 0 { format!("-{}", s) } else { s }; + } + + let mut result = String::new(); + let chars: Vec = s.chars().collect(); + let len = chars.len(); + + for (i, &c) in chars.iter().enumerate() { + if i > 0 && (len - i) % 3 == 0 { + result.push(' '); + } + result.push(c); + } + + if n < 0 { + format!("-{}", result) + } else { + result + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_euros() { + assert_eq!(parse("cinq euros"), Some("5 €".to_string())); + assert_eq!(parse("un euro"), Some("1 €".to_string())); + assert_eq!(parse("vingt 
euros"), Some("20 €".to_string())); + assert_eq!(parse("zéro euro"), Some("0 €".to_string())); + } + + #[test] + fn test_euros_and_centimes() { + assert_eq!( + parse("deux euros et vingt centimes"), + Some("2,20 €".to_string()) + ); + assert_eq!(parse("cinq euro et soixante"), Some("5,60 €".to_string())); + assert_eq!(parse("vingt euro cinq"), Some("20,05 €".to_string())); + assert_eq!(parse("zéro euro quatre-vingt"), Some("0,80 €".to_string())); + } + + #[test] + fn test_centimes_only() { + assert_eq!(parse("cinquante centimes"), Some("0,50 €".to_string())); + assert_eq!(parse("un centime"), Some("0,01 €".to_string())); + assert_eq!(parse("vingt centimes"), Some("0,20 €".to_string())); + } + + #[test] + fn test_dollars() { + assert_eq!(parse("deux dollars"), Some("2 $".to_string())); + assert_eq!(parse("deux dollars vingt"), Some("2,20 $".to_string())); + } + + #[test] + fn test_other_currencies() { + assert_eq!(parse("un franc suisse"), Some("1 CHF".to_string())); + assert_eq!(parse("trois livre"), Some("3 £".to_string())); + assert_eq!(parse("trois pence"), Some("0,03 £".to_string())); + } + + #[test] + fn test_large_amounts() { + assert_eq!( + parse("quatre-vingt mille won"), + Some("80 000 ₩".to_string()) + ); + assert_eq!( + parse("quatre-vingt-mille won"), + Some("80 000 ₩".to_string()) + ); + } + + #[test] + fn test_invalid() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("cinq"), None); + } +} diff --git a/src/asr/fr/ordinal.rs b/src/asr/fr/ordinal.rs new file mode 100644 index 0000000..331f712 --- /dev/null +++ b/src/asr/fr/ordinal.rs @@ -0,0 +1,308 @@ +//! Ordinal number tagger for French. +//! +//! Converts spoken French ordinal words to written form with Unicode superscripts: +//! - "premier" → "1ᵉʳ" +//! - "première" → "1ʳᵉ" +//! - "deuxième" → "2ᵉ" +//! - "troisièmes" → "3ᵉˢ" +//! - "second" → "2ᵈ" + +use super::cardinal::words_to_number; + +/// Parse spoken French ordinal number to written form. 
+pub fn parse(input: &str) -> Option { + let input_lower = input.to_lowercase(); + let input_trim = input_lower.trim(); + + // Special case: "Xième siècle" → Roman numerals + if input_trim.ends_with(" siècle") { + return parse_century(input_trim); + } + + // Try to extract ordinal suffix and detect plural + if let Some((number_str, suffix)) = extract_ordinal_parts(input_trim) { + // Parse the number part + let number = if number_str.is_empty() || number_str == "premier" || number_str == "première" + { + 1 + } else if number_str == "second" || number_str == "seconde" { + 2 + } else { + words_to_number(&number_str)? as i64 + }; + + // Format with appropriate Unicode superscripts + return Some(format_ordinal(number, &suffix)); + } + + None +} + +/// Parse century pattern "Xième siècle" +fn parse_century(input: &str) -> Option { + let without_siecle = input.strip_suffix(" siècle")?; + + // Extract the ordinal number before "ième" + if let Some(stem) = without_siecle.strip_suffix("ième") { + let stem = stem.trim_end_matches('-').trim(); + if stem.is_empty() { + return None; + } + + // Reconstruct cardinal form (e.g., "dix-neuv" → "dix neuf") + let cardinal = reconstruct_cardinal(stem)?; + + // Parse to number + let number = words_to_number(&cardinal)? 
as i64; + + // Convert to Roman numerals + return Some(format!("{}ᵉ siècle", int_to_roman(number))); + } + + None +} + +/// Convert integer to Roman numerals (for centuries) +fn int_to_roman(mut num: i64) -> String { + let values = [ + (1000, "M"), + (900, "CM"), + (500, "D"), + (400, "CD"), + (100, "C"), + (90, "XC"), + (50, "L"), + (40, "XL"), + (10, "X"), + (9, "IX"), + (5, "V"), + (4, "IV"), + (1, "I"), + ]; + + let mut result = String::new(); + for (value, numeral) in &values { + while num >= *value { + result.push_str(numeral); + num -= value; + } + } + result +} + +/// Reconstruct cardinal form from ordinal stem +/// E.g., "quatr" → "quatre", "onz" → "onze", "mill" → "mille" +/// For compounds: "cent-onz" → "cent onze", "dix-neuv" → "dix neuf" +fn reconstruct_cardinal(stem: &str) -> Option { + // Direct mapping for common ordinal stems that need reconstruction + let mappings = [ + ("quatr", "quatre"), + ("cinqu", "cinq"), + ("neuv", "neuf"), + ("dix", "dix"), // stays same + ("onz", "onze"), + ("douz", "douze"), + ("treiz", "treize"), + ("quatorz", "quatorze"), + ("quinz", "quinze"), + ("seiz", "seize"), + ("vingt", "vingt"), // stays same + ("trent", "trente"), + ("quarant", "quarante"), + ("cinquant", "cinquante"), + ("soixant", "soixante"), + ("sept", "sept"), // stays same + ("huit", "huit"), // stays same + ("cent", "cent"), // stays same + ("mill", "mille"), + ("million", "million"), // stays same + ("milliard", "milliard"), // stays same + ]; + + // Handle compound numbers with hyphens or spaces + if stem.contains('-') || stem.contains(' ') { + // Split and reconstruct each part + let parts: Vec<&str> = if stem.contains('-') { + stem.split('-').collect() + } else { + stem.split_whitespace().collect() + }; + + let reconstructed: Vec = parts + .iter() + .filter_map(|part| { + // Try to map each part + for (ord_stem, cardinal) in &mappings { + if part == ord_stem || part.starts_with(ord_stem) { + return Some(cardinal.to_string()); + } + } + // If no 
mapping, keep as is + if !part.is_empty() { + Some(part.to_string()) + } else { + None + } + }) + .collect(); + + if !reconstructed.is_empty() { + return Some(reconstructed.join(" ")); + } + } + + // Simple (non-compound) ordinal stem + for (ord_stem, cardinal) in &mappings { + if stem == *ord_stem { + return Some(cardinal.to_string()); + } + } + + // If no mapping found, return as-is if non-empty + if !stem.is_empty() { + Some(stem.to_string()) + } else { + None + } +} + +/// Extract number and ordinal suffix from input +fn extract_ordinal_parts(input: &str) -> Option<(String, OrdinalSuffix)> { + // Check if the whole word is "premier", "première", "second", "seconde" FIRST + // before checking ends_with, otherwise they'll match themselves + if input == "premier" { + return Some(("premier".to_string(), OrdinalSuffix::PremierM)); + } + if input == "première" { + return Some(("première".to_string(), OrdinalSuffix::PremiereF)); + } + if input == "premiers" { + return Some(("premier".to_string(), OrdinalSuffix::PremiersM)); + } + if input == "premières" { + return Some(("première".to_string(), OrdinalSuffix::PremieresF)); + } + if input == "second" { + return Some(("second".to_string(), OrdinalSuffix::SecondM)); + } + if input == "seconde" { + return Some(("seconde".to_string(), OrdinalSuffix::SecondeF)); + } + if input == "seconds" { + return Some(("second".to_string(), OrdinalSuffix::SecondsM)); + } + if input == "secondes" { + return Some(("seconde".to_string(), OrdinalSuffix::SecondesF)); + } + + // Check for specific ordinal endings + if input.ends_with("premiers") { + let num_part = input.strip_suffix("premiers")?.trim_end_matches('-').trim(); + return Some((num_part.to_string(), OrdinalSuffix::PremiersM)); + } + if input.ends_with("premier") { + let num_part = input.strip_suffix("premier")?.trim_end_matches('-').trim(); + return Some((num_part.to_string(), OrdinalSuffix::PremierM)); + } + if input.ends_with("premières") { + let num_part = input + 
.strip_suffix("premières")? + .trim_end_matches('-') + .trim(); + return Some((num_part.to_string(), OrdinalSuffix::PremieresF)); + } + if input.ends_with("première") { + let num_part = input.strip_suffix("première")?.trim_end_matches('-').trim(); + return Some((num_part.to_string(), OrdinalSuffix::PremiereF)); + } + if input.ends_with("seconds") { + let num_part = input.strip_suffix("seconds")?.trim_end_matches('-').trim(); + return Some((num_part.to_string(), OrdinalSuffix::SecondsM)); + } + if input.ends_with("second") { + let num_part = input.strip_suffix("second")?.trim_end_matches('-').trim(); + return Some((num_part.to_string(), OrdinalSuffix::SecondM)); + } + if input.ends_with("secondes") { + let num_part = input.strip_suffix("secondes")?.trim_end_matches('-').trim(); + return Some((num_part.to_string(), OrdinalSuffix::SecondesF)); + } + if input.ends_with("seconde") { + let num_part = input.strip_suffix("seconde")?.trim_end_matches('-').trim(); + return Some((num_part.to_string(), OrdinalSuffix::SecondeF)); + } + + // Regular ordinals: ième/ièmes + if input.ends_with("ièmes") { + let stem = input.strip_suffix("ièmes")?.trim_end_matches('-').trim(); + let num_part = reconstruct_cardinal(stem)?; + return Some((num_part, OrdinalSuffix::IemesPlural)); + } + if input.ends_with("ième") { + let stem = input.strip_suffix("ième")?.trim_end_matches('-').trim(); + let num_part = reconstruct_cardinal(stem)?; + return Some((num_part, OrdinalSuffix::Ieme)); + } + + None +} + +#[derive(Debug)] +enum OrdinalSuffix { + PremierM, // premier → Nᵉʳ + PremiersM, // premiers → Nᵉʳˢ + PremiereF, // première → Nʳᵉ + PremieresF, // premières → Nʳᵉˢ + SecondM, // second → Nᵈ + SecondsM, // seconds → Nᵈˢ + SecondeF, // seconde → Nᵈᵉ + SecondesF, // secondes → Nᵈᵉˢ + Ieme, // deuxième → Nᵉ + IemesPlural, // deuxièmes → Nᵉˢ +} + +/// Format number with appropriate Unicode superscript suffix +fn format_ordinal(number: i64, suffix: &OrdinalSuffix) -> String { + match suffix { + 
OrdinalSuffix::PremierM => format!("{}ᵉʳ", number), + OrdinalSuffix::PremiersM => format!("{}ᵉʳˢ", number), + OrdinalSuffix::PremiereF => format!("{}ʳᵉ", number), + OrdinalSuffix::PremieresF => format!("{}ʳᵉˢ", number), + OrdinalSuffix::SecondM => format!("{}ᵈ", number), + OrdinalSuffix::SecondsM => format!("{}ᵈˢ", number), + OrdinalSuffix::SecondeF => format!("{}ᵈᵉ", number), + OrdinalSuffix::SecondesF => format!("{}ᵈᵉˢ", number), + OrdinalSuffix::Ieme => format!("{}ᵉ", number), + OrdinalSuffix::IemesPlural => format!("{}ᵉˢ", number), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic_ordinals() { + assert_eq!(parse("premier"), Some("1ᵉʳ".to_string())); + assert_eq!(parse("première"), Some("1ʳᵉ".to_string())); + assert_eq!(parse("deuxième"), Some("2ᵉ".to_string())); + assert_eq!(parse("troisième"), Some("3ᵉ".to_string())); + } + + #[test] + fn test_compound_ordinals() { + assert_eq!(parse("vingt et unième"), Some("21ᵉ".to_string())); + assert_eq!(parse("cent onzième"), Some("111ᵉ".to_string())); + } + + #[test] + fn test_large_ordinals() { + assert_eq!(parse("millième"), Some("1000ᵉ".to_string())); + } + + #[test] + fn test_invalid() { + assert_eq!(parse("hello"), None); + assert_eq!(parse("vingt"), None); + } +} diff --git a/src/asr/fr/punctuation.rs b/src/asr/fr/punctuation.rs new file mode 100644 index 0000000..f09eb79 --- /dev/null +++ b/src/asr/fr/punctuation.rs @@ -0,0 +1,85 @@ +//! Punctuation tagger for French. +//! +//! Converts spoken French punctuation words to their written symbols: +//! - "point" → "." +//! - "virgule" → "," +//! - "point d'interrogation" → "?" + +use lazy_static::lazy_static; + +lazy_static! { + /// Spoken French punctuation → written symbol mappings. 
+ static ref PUNCTUATION: Vec<(&'static str, &'static str)> = vec![ + // Multi-word patterns first + ("point d'interrogation", "?"), + ("point dinterrogation", "?"), + ("point d'exclamation", "!"), + ("point dexclamation", "!"), + ("guillemet ouvrant", "«"), + ("guillemet fermant", "»"), + ("parenthèse ouvrante", "("), + ("parenthèse fermante", ")"), + ("crochet ouvrant", "["), + ("crochet fermant", "]"), + ("accolade ouvrante", "{"), + ("accolade fermante", "}"), + ("deux points", ":"), + ("point virgule", ";"), + ("trait d'union", "-"), + ("barre oblique", "/"), + + // Single-word patterns + ("point", "."), + ("virgule", ","), + ("tiret", "-"), + ("arobase", "@"), + ("dièse", "#"), + ("pourcent", "%"), + ("plus", "+"), + ("égal", "="), + ("astérisque", "*"), + ("slash", "/"), + ]; +} + +/// Try to parse spoken French punctuation into its written symbol. +pub fn parse(input: &str) -> Option { + let input_lower = input.to_lowercase(); + let input_trimmed = input_lower.trim(); + + for (pattern, symbol) in PUNCTUATION.iter() { + if input_trimmed == *pattern { + return Some(symbol.to_string()); + } + } + + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic_punctuation() { + assert_eq!(parse("point"), Some(".".to_string())); + assert_eq!(parse("virgule"), Some(",".to_string())); + assert_eq!(parse("deux points"), Some(":".to_string())); + assert_eq!(parse("point virgule"), Some(";".to_string())); + } + + #[test] + fn test_multi_word() { + assert_eq!(parse("point d'interrogation"), Some("?".to_string())); + assert_eq!(parse("point d'exclamation"), Some("!".to_string())); + assert_eq!(parse("parenthèse ouvrante"), Some("(".to_string())); + } + + #[test] + fn test_symbols() { + assert_eq!(parse("tiret"), Some("-".to_string())); + assert_eq!(parse("arobase"), Some("@".to_string())); + assert_eq!(parse("dièse"), Some("#".to_string())); + assert_eq!(parse("pourcent"), Some("%".to_string())); + } +} diff --git a/src/asr/fr/telephone.rs 
b/src/asr/fr/telephone.rs new file mode 100644 index 0000000..ac7e94d --- /dev/null +++ b/src/asr/fr/telephone.rs @@ -0,0 +1,176 @@ +//! Telephone tagger for French. +//! +//! Converts spoken French phone numbers to written form: +//! - "zéro six douze trente-quatre" → "06 12 34" +//! - "double neuf douze trente-deux" → "99 12 32" +//! - Handles digit-by-digit or grouped number words +//! +//! French phone numbers are formatted as 2-digit groups: "02 12 32 30 30" +//! Standard French numbers are 10 digits; if 9 digits are provided, +//! a leading zero is prepended (implied area code). + +use super::cardinal::words_to_number; + +/// Parse spoken French telephone number to written form. +pub fn parse(input: &str) -> Option { + let input_lower = input.trim().to_lowercase(); + + parse_number_sequence(&input_lower) +} + +/// Parse sequence of number words into phone number format. +fn parse_number_sequence(input: &str) -> Option { + let input = input.trim(); + + let tokens: Vec<&str> = input.split_whitespace().collect(); + if tokens.is_empty() { + return None; + } + + let mut digits = Vec::new(); + let mut i = 0; + + while i < tokens.len() { + // Handle "double X" → XX + if tokens[i] == "double" && i + 1 < tokens.len() { + if let Some(d) = parse_single_digit(tokens[i + 1]) { + digits.push(d); + digits.push(d); + i += 2; + continue; + } + } + + // Handle "triple X" → XXX + if tokens[i] == "triple" && i + 1 < tokens.len() { + if let Some(d) = parse_single_digit(tokens[i + 1]) { + digits.push(d); + digits.push(d); + digits.push(d); + i += 2; + continue; + } + } + + // Try to parse single digit word (zéro-neuf) + if let Some(d) = parse_single_digit(tokens[i]) { + digits.push(d); + i += 1; + continue; + } + + // Try single-token compound number: "douze" → 12, "trente-deux" → 32 + // Only parse single tokens to avoid greedily combining separate groups + if let Some(num) = words_to_number(tokens[i]) { + let num = num as u32; + if num >= 10 && num <= 99 { + digits.push((num / 
10) as u8); + digits.push((num % 10) as u8); + } else if num < 10 { + digits.push(num as u8); + } else { + return None; + } + i += 1; + } else { + return None; + } + } + + // Need at least 6 digits for a phone number + if digits.len() < 6 { + return None; + } + + // French phone numbers are 10 digits; if 9 provided, prepend 0 + if digits.len() == 9 { + digits.insert(0, 0); + } + + // Format as 2-digit groups: "02 12 32 30 30" + let mut result = String::new(); + for (idx, &d) in digits.iter().enumerate() { + if idx > 0 && idx % 2 == 0 { + result.push(' '); + } + result.push(char::from(b'0' + d)); + } + + Some(result) +} + +/// Parse single digit word (0-9), including "une" +fn parse_single_digit(token: &str) -> Option { + match token { + "zéro" | "zero" => Some(0), + "un" | "une" => Some(1), + "deux" => Some(2), + "trois" => Some(3), + "quatre" => Some(4), + "cinq" => Some(5), + "six" => Some(6), + "sept" => Some(7), + "huit" => Some(8), + "neuf" => Some(9), + _ => None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_digit_by_digit() { + assert_eq!( + parse("zéro six un deux trois quatre"), + Some("06 12 34".to_string()) + ); + } + + #[test] + fn test_grouped_numbers() { + assert_eq!( + parse("zéro six douze trente-quatre"), + Some("06 12 34".to_string()) + ); + } + + #[test] + fn test_full_phone() { + assert_eq!( + parse("zéro six douze trente-quatre cinquante-six soixante-dix-huit"), + Some("06 12 34 56 78".to_string()) + ); + } + + #[test] + fn test_without_leading_zero() { + assert_eq!( + parse("deux douze trente-deux trente trente"), + Some("02 12 32 30 30".to_string()) + ); + } + + #[test] + fn test_digit_by_digit_with_une() { + assert_eq!( + parse("deux une deux trois deux trois zéro trois zéro"), + Some("02 12 32 30 30".to_string()) + ); + } + + #[test] + fn test_double() { + assert_eq!( + parse("double neuf douze trente-deux trente trente"), + Some("99 12 32 30 30".to_string()) + ); + } + + #[test] + fn test_invalid() { + 
assert_eq!(parse("un deux trois"), None); // Too short + assert_eq!(parse("hello world"), None); + } +} diff --git a/src/asr/fr/time.rs b/src/asr/fr/time.rs new file mode 100644 index 0000000..9a8d3e2 --- /dev/null +++ b/src/asr/fr/time.rs @@ -0,0 +1,215 @@ +//! Time tagger for French. +//! +//! Converts spoken French time expressions to written form: +//! - "quatorze heures trente" → "14 h 30" +//! - "midi" → "12 h" +//! - "minuit" → "0 h" +//! - "huit heures du soir" → "20 h" +//! - "midi moins le quart" → "11 h 45" + +use super::cardinal::words_to_number; + +/// Parse spoken French time expression to written form. +pub fn parse(input: &str) -> Option { + let input_lower = input.trim().to_lowercase(); + + // Try "moins" patterns first (subtractive) + if let Some(result) = parse_moins_pattern(&input_lower) { + return Some(result); + } + + // Special base times + if input_lower.starts_with("midi") { + return parse_midi_pattern(&input_lower); + } + if input_lower.starts_with("minuit") { + return parse_minuit_pattern(&input_lower); + } + + // Standard pattern: "X heures Y" with modifiers + if let Some(result) = parse_heures_pattern(&input_lower) { + return Some(result); + } + + None +} + +/// Parse "midi" patterns +fn parse_midi_pattern(input: &str) -> Option { + if input == "midi" { + return Some("12 h".to_string()); + } + // "midi moins le quart" → 11:45 + if input == "midi moins le quart" { + return Some("11 h 45".to_string()); + } + // "midi moins X" → 12 - X + if let Some(rest) = input.strip_prefix("midi moins ") { + let subtract = words_to_number(rest)? as i64; + let minutes = 60 - subtract; + return Some(format!("11 h {:02}", minutes)); + } + None +} + +/// Parse "minuit" patterns +fn parse_minuit_pattern(input: &str) -> Option { + if input == "minuit" { + return Some("0 h".to_string()); + } + // "minuit X" → 0:X + if let Some(rest) = input.strip_prefix("minuit ") { + let minutes = words_to_number(rest)? 
as i64; + if minutes > 59 { + return None; + } + return Some(format!("0 h {:02}", minutes)); + } + None +} + +/// Parse "X heures moins Y" patterns +fn parse_moins_pattern(input: &str) -> Option { + // "X heures moins le quart" → X-1:45 + if let Some(hour_part) = input.strip_suffix(" heures moins le quart") { + let hour = words_to_number(hour_part)? as i64; + let actual_hour = if hour > 1 { hour - 1 } else { 23 }; + return Some(format!("{} h 45", actual_hour)); + } + + // "X heures moins Y" + if let Some((before, after)) = input.split_once(" heures moins ") { + let hour = words_to_number(before)? as i64; + let subtract = words_to_number(after)? as i64; + let actual_hour = if hour > 1 { hour - 1 } else { 23 }; + let minutes = 60 - subtract; + return Some(format!("{} h {:02}", actual_hour, minutes)); + } + + None +} + +/// Parse "X heures Y" pattern +fn parse_heures_pattern(input: &str) -> Option { + // Remove time-of-day modifiers + let cleaned = input + .replace(" du matin", "") + .replace(" du soir", "") + .replace(" de l'après-midi", ""); + + let add_12 = input.contains(" du soir") || input.contains(" de l'après-midi"); + + // Pattern: "X heures et demie" → X:30 + if let Some(hour_part) = cleaned.strip_suffix(" heures et demie") { + let mut hour = words_to_number(hour_part)? as i64; + if add_12 && hour < 12 { + hour += 12; + } + if hour > 23 { + return None; + } + return Some(format!("{} h 30", hour)); + } + + // Pattern: "X heures et trois quarts" → X:45 + if let Some(hour_part) = cleaned.strip_suffix(" heures et trois quarts") { + let mut hour = words_to_number(hour_part)? as i64; + if add_12 && hour < 12 { + hour += 12; + } + if hour > 23 { + return None; + } + return Some(format!("{} h 45", hour)); + } + + // Pattern: "X heures Y" + if let Some((hour_part, minute_part)) = cleaned.split_once(" heures ") { + let mut hour = words_to_number(hour_part)? 
as i64; + if add_12 && hour < 12 { + hour += 12; + } + if hour > 23 { + return None; + } + + let minute = words_to_number(minute_part)? as i64; + if minute > 59 { + return None; + } + + return Some(format!("{} h {:02}", hour, minute)); + } + + // Pattern: just "X heures" (no minutes) + if let Some(hour_part) = cleaned.strip_suffix(" heures") { + let mut hour = words_to_number(hour_part)? as i64; + if add_12 && hour < 12 { + hour += 12; + } + if hour > 23 { + return None; + } + return Some(format!("{} h", hour)); + } + + // Singular: "une heure" + if cleaned.ends_with(" heure") { + let hour_part = cleaned.strip_suffix(" heure")?; + let mut hour = if hour_part == "une" { + 1 + } else { + words_to_number(hour_part)? as i64 + }; + if add_12 && hour < 12 { + hour += 12; + } + if hour > 23 { + return None; + } + return Some(format!("{} h", hour)); + } + + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_special_times() { + assert_eq!(parse("midi"), Some("12 h".to_string())); + assert_eq!(parse("minuit"), Some("0 h".to_string())); + } + + #[test] + fn test_heures_pattern() { + assert_eq!(parse("quatorze heures trente"), Some("14 h 30".to_string())); + assert_eq!(parse("quinze heures"), Some("15 h".to_string())); + assert_eq!(parse("neuf heures dix"), Some("9 h 10".to_string())); + } + + #[test] + fn test_time_of_day() { + assert_eq!(parse("huit heures du matin"), Some("8 h".to_string())); + assert_eq!(parse("huit heures du soir"), Some("20 h".to_string())); + } + + #[test] + fn test_special_minutes() { + assert_eq!(parse("onze heures et demie"), Some("11 h 30".to_string())); + assert_eq!(parse("midi moins le quart"), Some("11 h 45".to_string())); + } + + #[test] + fn test_singular() { + assert_eq!(parse("une heure"), Some("1 h".to_string())); + } + + #[test] + fn test_invalid() { + assert_eq!(parse("vingt-cinq heures"), None); // > 23 + assert_eq!(parse("hello"), None); + } +} diff --git a/src/asr/fr/whitelist.rs b/src/asr/fr/whitelist.rs new 
file mode 100644 index 0000000..7df061b --- /dev/null +++ b/src/asr/fr/whitelist.rs @@ -0,0 +1,49 @@ +//! Whitelist tagger for French. +//! +//! Converts specific French titles and words to their abbreviated forms with Unicode superscripts. + +use lazy_static::lazy_static; +use std::collections::HashMap; + +lazy_static! { + /// Mapping of French words to their abbreviated forms + static ref WHITELIST: HashMap<&'static str, &'static str> = { + let mut m = HashMap::new(); + // Titles with Unicode superscripts + m.insert("docteur", "Dʳ"); + m.insert("docteures", "Dʳᵉˢ"); + m.insert("monsieur", "M."); + m.insert("messieurs", "MM."); + m.insert("madame", "Mᵐᵉ"); + m.insert("mesdames", "Mᵐᵉˢ"); + m.insert("mademoiselle", "Mˡˡᵉ"); + m.insert("mademoiselles", "Mˡˡᵉˢ"); + m + }; +} + +/// Convert whitelisted French words to their abbreviated forms. +pub fn parse(input: &str) -> Option { + let input_lower = input.to_lowercase(); + let input_trimmed = input_lower.trim(); + + WHITELIST.get(input_trimmed).map(|&s| s.to_string()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_whitelist() { + assert_eq!(parse("docteur"), Some("Dʳ".to_string())); + assert_eq!(parse("madame"), Some("Mᵐᵉ".to_string())); + assert_eq!(parse("monsieur"), Some("M.".to_string())); + } + + #[test] + fn test_not_whitelisted() { + assert_eq!(parse("bonjour"), None); + assert_eq!(parse("un"), None); + } +} diff --git a/src/asr/fr/word.rs b/src/asr/fr/word.rs new file mode 100644 index 0000000..e8bf514 --- /dev/null +++ b/src/asr/fr/word.rs @@ -0,0 +1,102 @@ +//! Word tagger for French. +//! +//! Converts spoken French letter sequences to written form: +//! - "a b c" → "ABC" +//! - Handles spelled-out words and acronyms + +/// Parse spoken French letter sequence to written form. 
+pub fn parse(input: &str) -> Option { + let input_lower = input.trim().to_lowercase(); + + // Try parsing as a sequence of letters + if let Some(result) = parse_letter_sequence(&input_lower) { + return Some(result); + } + + None +} + +/// Parse sequence of letter words into uppercase letters +fn parse_letter_sequence(input: &str) -> Option { + let tokens: Vec<&str> = input.split_whitespace().collect(); + + // Need at least 2 letters to be considered a sequence + if tokens.len() < 2 { + return None; + } + + let mut letters = Vec::new(); + + for token in tokens { + if let Some(letter) = parse_letter(token) { + letters.push(letter); + } else { + // If any token is not a letter, this isn't a letter sequence + return None; + } + } + + Some(letters.join("")) +} + +/// Parse single letter word to uppercase letter +fn parse_letter(word: &str) -> Option { + // French letter names + let letter_map = [ + ("a", "A"), + ("bé", "B"), + ("cé", "C"), + ("dé", "D"), + ("e", "E"), + ("effe", "F"), + ("gé", "G"), + ("hache", "H"), + ("i", "I"), + ("ji", "J"), + ("ka", "K"), + ("elle", "L"), + ("emme", "M"), + ("enne", "N"), + ("o", "O"), + ("pé", "P"), + ("ku", "Q"), + ("erre", "R"), + ("esse", "S"), + ("té", "T"), + ("u", "U"), + ("vé", "V"), + ("double vé", "W"), + ("ixe", "X"), + ("i grec", "Y"), + ("zède", "Z"), + ]; + + for (spoken, letter) in &letter_map { + if word == *spoken { + return Some(letter.to_string()); + } + } + + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_simple_sequence() { + assert_eq!(parse("a bé cé"), Some("ABC".to_string())); + } + + #[test] + fn test_longer_sequence() { + assert_eq!(parse("u esse a"), Some("USA".to_string())); + } + + #[test] + fn test_invalid() { + assert_eq!(parse("a"), None); // Single letter + assert_eq!(parse("hello world"), None); // Not letters + } +} diff --git a/src/asr/hi/address.rs b/src/asr/hi/address.rs new file mode 100644 index 0000000..c6b0102 --- /dev/null +++ b/src/asr/hi/address.rs @@ -0,0 
/// Report whether `s` consists solely of Devanagari digits ('०'..='९'),
/// with at least one digit present.
fn is_devanagari_number(s: &str) -> bool {
    // An empty string is not a number.
    if s.is_empty() {
        return false;
    }
    s.chars().all(|ch| matches!(ch, '०'..='९'))
}
/// Convert an Arabic digit value (0..=9) to the corresponding Devanagari digit.
///
/// # Panics
///
/// Panics if `d` is greater than 9.
pub fn to_devanagari_digit(d: u8) -> char {
    const DIGITS: [char; 10] = ['०', '१', '२', '३', '४', '५', '६', '७', '८', '९'];

    match DIGITS.get(usize::from(d)) {
        Some(&digit) => digit,
        // Any caller can pass a value above 9 to this public function, so
        // report it as a contract violation rather than as "unreachable" code.
        None => panic!("to_devanagari_digit: expected a value in 0..=9, got {}", d),
    }
}
+pub fn to_devanagari(n: i64) -> String { + if n < 0 { + let s = to_devanagari(-n); + return format!("-{}", s); + } + if n == 0 { + return "०".to_string(); + } + let s = n.to_string(); + s.chars() + .map(|c| to_devanagari_digit(c as u8 - b'0')) + .collect() +} + +/// Convert a decimal string like "206.29" to Devanagari "२०६.२९". +pub fn to_devanagari_str(s: &str) -> String { + s.chars() + .map(|c| { + if c.is_ascii_digit() { + to_devanagari_digit(c as u8 - b'0') + } else { + c + } + }) + .collect() +} + +/// Map a single Hindi word to its numeric value. +/// Returns None if the word is not a Hindi number word. +pub fn word_to_value(word: &str) -> Option { + match word { + "शून्य" | "शुन्य" => Some(0), + "एक" => Some(1), + "दो" => Some(2), + "तीन" => Some(3), + "चार" => Some(4), + "पाँच" | "पांच" | "पांचो" => Some(5), + "छह" | "छः" | "छे" => Some(6), + "सात" => Some(7), + "आठ" => Some(8), + "नौ" => Some(9), + "दस" => Some(10), + "ग्यारह" => Some(11), + "बारह" => Some(12), + "तेरह" => Some(13), + "चौदह" => Some(14), + "पन्द्रह" | "पंद्रह" | "पंदरह" | "पंडरह" => { + Some(15) + } + "सोलह" => Some(16), + "सत्रह" => Some(17), + "अठारह" | "अठाहर" | "अठाहरवीं" => Some(18), + "उन्नीस" => Some(19), + "बीस" => Some(20), + "इक्कीस" => Some(21), + "बाईस" => Some(22), + "तेईस" => Some(23), + "चौबीस" => Some(24), + "पच्चीस" => Some(25), + "छब्बीस" => Some(26), + "सत्ताईस" => Some(27), + "अट्ठाईस" => Some(28), + "उनतीस" => Some(29), + "तीस" => Some(30), + "इकतीस" | "इकत्तीस" => Some(31), + "बत्तीस" => Some(32), + "तैंतीस" => Some(33), + "चौंतीस" => Some(34), + "पैंतीस" | "पैंतिस" => Some(35), + "छत्तीस" | "छतीस" => Some(36), + "सैंतीस" => Some(37), + "अड़तीस" => Some(38), + "उनतालीस" => Some(39), + "चालीस" => Some(40), + "इकतालीस" => Some(41), + "बयालीस" => Some(42), + "तैंतालीस" => Some(43), + "चौवालीस" => Some(44), + "पैंतालीस" | "पैंतालिस" => Some(45), + "छियालीस" => Some(46), + "सैंतालीस" => Some(47), + "अड़तालीस" => Some(48), + "उनचास" => Some(49), + "पचास" => Some(50), + 
"इक्यावन" => Some(51), + "बावन" => Some(52), + "तिरपन" | "तिरेपन" => Some(53), + "चौवन" | "चौंवन" => Some(54), + "पचपन" => Some(55), + "छप्पन" => Some(56), + "सत्तावन" => Some(57), + "अट्ठावन" => Some(58), + "उनसठ" => Some(59), + "साठ" => Some(60), + "इकसठ" => Some(61), + "बासठ" => Some(62), + "तिरसठ" => Some(63), + "चौंसठ" => Some(64), + "पैंसठ" => Some(65), + "छियासठ" => Some(66), + "सड़सठ" | "सरसठ" => Some(67), + "अड़सठ" => Some(68), + "उनहत्तर" => Some(69), + "सत्तर" => Some(70), + "इकहत्तर" => Some(71), + "बहत्तर" => Some(72), + "तिहत्तर" => Some(73), + "चौहत्तर" => Some(74), + "पिछत्तर" | "पचहत्तर" => Some(75), + "छिहत्तर" => Some(76), + "सतत्तर" => Some(77), + "अठत्तर" | "अठहत्तर" => Some(78), + "उनासी" | "उन्नासी" => Some(79), + "अस्सी" => Some(80), + "इक्यासी" => Some(81), + "बयासी" => Some(82), + "तिरासी" => Some(83), + "चौरासी" => Some(84), + "पचासी" | "पच्चासी" => Some(85), + "छियासी" => Some(86), + "सत्तासी" => Some(87), + "अठासी" => Some(88), + "नवासी" => Some(89), + "नब्बे" => Some(90), + "इक्यानबे" | "इक्यानवे" => Some(91), + "बानवे" => Some(92), + "तिरानवे" => Some(93), + "चौरानवे" => Some(94), + "पिचानवे" | "पंचानवे" => Some(95), + "छियानवे" => Some(96), + "सत्तानवे" => Some(97), + "अट्ठानवे" => Some(98), + "निन्यानवे" | "निन्यानवें" => Some(99), + _ => None, + } +} + +/// Check if a word is a scale word (सौ, हज़ार, लाख, करोड़, अरब). +pub fn scale_value(word: &str) -> Option { + match word { + "सौ" => Some(100), + "हज़ार" | "हजार" => Some(1_000), + "लाख" => Some(1_00_000), + "करोड़" => Some(1_00_00_000), + "अरब" => Some(1_00_00_00_000), + _ => None, + } +} + +/// Check if a word is a Hindi number word (value or scale). +pub fn is_hi_number_word(word: &str) -> bool { + word_to_value(word).is_some() || scale_value(word).is_some() +} + +/// Check if a word is a special modifier. 
+pub fn is_modifier(word: &str) -> bool { + matches!(word, "सवा" | "साढ़े" | "डेढ़" | "ढाई" | "पौने" | "पौन" | "पौना") +} + +/// Parse a sequence of Hindi number words into a number. +/// Uses Indian numbering: अरब > करोड़ > लाख > हज़ार > सौ +/// +/// Modifier semantics: +/// - सवा N*scale → N*scale + scale/4 (add quarter of the lowest scale) +/// - साढ़े N*scale → N*scale + scale/2 (add half of the lowest scale) +/// - डेढ़ scale → 1.5 * scale +/// - ढाई scale → 2.5 * scale +/// - पौने N*scale → N*scale - scale/4 (subtract quarter of the lowest scale) +pub fn words_to_number(words: &[&str]) -> Option { + if words.is_empty() { + return None; + } + + // Handle special modifiers at the start + match words[0] { + "डेढ़" => { + if words.len() == 1 { + return None; + } + let rest = &words[1..]; + let base = parse_compound_number(rest)?; + let lowest = find_lowest_scale(rest); + return Some(base + lowest / 2); + } + "ढाई" => { + if words.len() == 1 { + return None; + } + let rest = &words[1..]; + let base = parse_compound_number(rest)?; + let lowest = find_lowest_scale(rest); + return Some(base + lowest + lowest / 2); + } + "सवा" => { + if words.len() == 1 { + return None; + } + let rest = &words[1..]; + let base = parse_compound_number(rest)?; + let lowest = find_lowest_scale(rest); + return Some(base + lowest / 4); + } + "साढ़े" => { + if words.len() == 1 { + return None; + } + let rest = &words[1..]; + let base = parse_compound_number(rest)?; + let lowest = find_lowest_scale(rest); + return Some(base + lowest / 2); + } + "पौने" | "पौन" | "पौना" => { + if words.len() == 1 { + return None; + } + let rest = &words[1..]; + let base = parse_compound_number(rest)?; + let lowest = find_lowest_scale(rest); + return Some(base - lowest / 4); + } + _ => {} + } + + parse_compound_number(words) +} + +/// Find the lowest scale value used in a word sequence. 
+pub fn find_lowest_scale(words: &[&str]) -> i64 { + let mut lowest: Option = None; + for &w in words { + if let Some(sv) = scale_value(w) { + match lowest { + None => lowest = Some(sv), + Some(current) => { + if sv < current { + lowest = Some(sv); + } + } + } + } + } + lowest.unwrap_or(1) +} + +/// Parse a compound Hindi number from words. +/// Handles the Indian number scale: अरब > करोड़ > लाख > हज़ार > सौ +fn parse_compound_number(words: &[&str]) -> Option { + if words.is_empty() { + return None; + } + + // Single word + if words.len() == 1 { + if let Some(v) = word_to_value(words[0]) { + return Some(v); + } + if let Some(s) = scale_value(words[0]) { + return Some(s); + } + return None; + } + + // Multi-word: accumulate using Indian number system + let scales: &[(&[&str], i64)] = &[ + (&["अरब"], 1_00_00_00_000), + (&["करोड़"], 1_00_00_000), + (&["लाख"], 1_00_000), + (&["हज़ार", "हजार"], 1_000), + (&["सौ"], 100), + ]; + + for &(scale_words, scale_val) in scales { + for (i, &w) in words.iter().enumerate() { + if scale_words.contains(&w) { + let before = &words[..i]; + let after = &words[i + 1..]; + + let multiplier = if before.is_empty() { + 1 + } else { + parse_compound_number(before)? + }; + + let remainder = if after.is_empty() { + 0 + } else { + parse_compound_number(after)? + }; + + return Some(multiplier * scale_val + remainder); + } + } + } + + // No scale found — try as a single value word + if words.len() == 1 { + return word_to_value(words[0]); + } + + None +} + +/// Strip trailing punctuation from a word, returning (core_word, suffix). +fn strip_trailing_punct(word: &str) -> (&str, &str) { + for punct in &[",", ".", ";", ":", "!", "?"] { + if word.ends_with(punct) { + let core = &word[..word.len() - punct.len()]; + return (core, punct); + } + } + (word, "") +} + +/// Process Hindi text, replacing Hindi number word sequences with Devanagari numerals. 
+/// This is a sentence-scanning approach: it finds number word spans within the input +/// and replaces them with their numeric equivalents. +pub fn process(input: &str) -> String { + let words: Vec<&str> = input.split_whitespace().collect(); + if words.is_empty() { + return input.to_string(); + } + + // Pre-process: strip trailing punctuation for matching purposes + let stripped: Vec<(&str, &str)> = words.iter().map(|w| strip_trailing_punct(w)).collect(); + + let mut result = Vec::new(); + let mut i = 0; + + while i < words.len() { + // Try to find the longest number word span starting at i + let mut best_end = i; + let mut best_val: Option = None; + let mut best_suffix = ""; + + // Check for modifier-led sequences first + let has_modifier = is_modifier(stripped[i].0); + + let max_end = words.len().min(i + 15); // reasonable limit + for end in (i + 1..=max_end).rev() { + // Build span from stripped words (no trailing punct) + let span: Vec<&str> = stripped[i..end].iter().map(|(core, _)| *core).collect(); + + // At least one word must be a number word or modifier + let has_number = span.iter().any(|w| is_hi_number_word(w) || is_modifier(w)); + if !has_number { + continue; + } + + if let Some(val) = words_to_number(&span) { + if has_modifier && end > i + 1 { + best_end = end; + best_val = Some(val); + best_suffix = stripped[end - 1].1; + break; + } + if !has_modifier { + best_end = end; + best_val = Some(val); + best_suffix = stripped[end - 1].1; + break; + } + } + } + + if let Some(val) = best_val { + let num_str = to_devanagari(val); + if best_suffix.is_empty() { + result.push(num_str); + } else { + result.push(format!("{}{}", num_str, best_suffix)); + } + i = best_end; + } else { + // Try single word (with stripped punctuation) + let (core, suffix) = stripped[i]; + if let Some(val) = word_to_value(core) { + let num_str = to_devanagari(val); + if suffix.is_empty() { + result.push(num_str); + } else { + result.push(format!("{}{}", num_str, suffix)); + } + i += 1; 
+ } else if let Some(val) = scale_value(core) { + let num_str = to_devanagari(val); + if suffix.is_empty() { + result.push(num_str); + } else { + result.push(format!("{}{}", num_str, suffix)); + } + i += 1; + } else { + result.push(words[i].to_string()); + i += 1; + } + } + } + + result.join(" ") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + assert_eq!(to_devanagari(1), "१"); + assert_eq!(to_devanagari(100), "१००"); + assert_eq!(to_devanagari(12345), "१२३४५"); + } + + #[test] + fn test_words_to_number() { + assert_eq!(words_to_number(&["एक"]), Some(1)); + assert_eq!(words_to_number(&["एक", "सौ"]), Some(100)); + assert_eq!( + words_to_number(&["दो", "हज़ार", "दो", "सौ", "बाईस"]), + Some(2222) + ); + assert_eq!(words_to_number(&["एक", "लाख", "एक"]), Some(100001)); + } + + #[test] + fn test_modifiers() { + assert_eq!(words_to_number(&["सवा", "सात", "सौ"]), Some(725)); + assert_eq!(words_to_number(&["साढ़े", "सात", "सौ"]), Some(750)); + assert_eq!(words_to_number(&["डेढ़", "सौ"]), Some(150)); + assert_eq!(words_to_number(&["ढाई", "सौ"]), Some(250)); + assert_eq!(words_to_number(&["पौने", "तीन", "सौ"]), Some(275)); + assert_eq!(words_to_number(&["सवा", "सोलह", "सौ"]), Some(1625)); + assert_eq!(words_to_number(&["साढ़े", "सोलह", "सौ"]), Some(1650)); + } +} diff --git a/src/asr/hi/date.rs b/src/asr/hi/date.rs new file mode 100644 index 0000000..62a9c54 --- /dev/null +++ b/src/asr/hi/date.rs @@ -0,0 +1,293 @@ +//! Date tagger for Hindi. +//! +//! Converts Hindi date expressions to Devanagari form: +//! - "छः मई" → "६ मई" +//! - "पच्चीस मार्च दो हज़ार दस" → "२५ मार्च, २०१०" +//! - "मार्च तीस उन्नीस सौ नब्बे" → "मार्च ३०, १९९०" +//! - "उन्नीस सौ नब्बे से उन्नीस सौ इक्यानबे" → "१९९०-१९९१" +//! - "चौंतीस सौ ईसा पूर्व" → "३४०० ई.पू." +//! - "दसवें शताब्दी" → "१०वें शताब्दी" + +use super::cardinal; + +/// Hindi month names. 
+const MONTHS: &[&str] = &[ + "जनवरी", + "फ़रवरी", + "फरवरी", + "मार्च", + "अप्रैल", + "मई", + "जून", + "जुलाई", + "अगस्त", + "सितंबर", + "अक्टूबर", + "नवंबर", + "दिसंबर", +]; + +fn is_month(word: &str) -> bool { + MONTHS.contains(&word) +} + +/// Process date patterns in a string. +pub fn process(input: &str) -> String { + let words: Vec<&str> = input.split_whitespace().collect(); + if words.is_empty() { + return input.to_string(); + } + + // First handle special patterns, then fall through to ordinal+cardinal processing + let mut result = Vec::new(); + let mut i = 0; + + while i < words.len() { + // Check for "शताब्दी" pattern — this is handled by ordinal processor + // Check for "ईसा पूर्व" / "ईस्वी" / "ईसवी" suffixes + // Check for "की" + number pattern (मार्च की दो → मार्च २) + // Check for "से" range pattern (X से Y → X-Y) + // Check for "वर्ष" / "सन" prefix + + // "वर्ष" or "सन" followed by number → "वर्ष/सन" + Devanagari + if (words[i] == "वर्ष" || words[i] == "सन") && i + 1 < words.len() { + let (year_end, year_val) = find_number_span(&words, i + 1); + if let Some(yv) = year_val { + result.push(words[i].to_string()); + result.push(cardinal::to_devanagari(yv)); + i = year_end; + continue; + } + } + + // Month + "की" + number → month + number + if is_month(words[i]) && i + 2 < words.len() && words[i + 1] == "की" { + let (num_end, num_val) = find_number_span(&words, i + 2); + if let Some(nv) = num_val { + result.push(words[i].to_string()); + result.push(cardinal::to_devanagari(nv)); + i = num_end; + continue; + } + } + + // Check for date range: "X से Y" where both are numbers + // or "X से Y तक" + if i > 0 && words[i] == "से" && i + 1 < words.len() { + // Check if previous words form a number and next words form a number + // This is complex; handle it after basic patterns + } + + // Number + Month + Year pattern (with optional ईसवी/ईसा पूर्व) + // Month + Number + Year pattern + if is_month(words[i]) { + // Month-first: "मार्च तीस उन्नीस सौ नब्बे" + // Try 
to find day (1-31) then year + if i + 1 < words.len() { + // First try: day as a greedy number span, then year + let (day_end, day_val) = find_number_span(&words, i + 1); + if let Some(dv) = day_val { + // Check for year after day + let (year_end, year_val) = find_number_span(&words, day_end); + if let Some(yv) = year_val { + let (era_end, era_str) = find_era_suffix(&words, year_end); + result.push(format!("{} {},", words[i], cardinal::to_devanagari(dv))); + if let Some(era) = era_str { + result.push(format!("{} {}", cardinal::to_devanagari(yv), era)); + } else { + result.push(cardinal::to_devanagari(yv)); + } + i = era_end; + continue; + } + // Just month + day + result.push(format!("{} {}", words[i], cardinal::to_devanagari(dv))); + i = day_end; + continue; + } + + // Second try: if greedy failed, try day as single word (1-31), rest as year + if let Some(dv) = cardinal::word_to_value(words[i + 1]) { + if dv >= 1 && dv <= 31 && i + 2 < words.len() { + let (year_end, year_val) = find_number_span(&words, i + 2); + if let Some(yv) = year_val { + let (era_end, era_str) = find_era_suffix(&words, year_end); + result.push(format!("{} {},", words[i], cardinal::to_devanagari(dv))); + if let Some(era) = era_str { + result.push(format!("{} {}", cardinal::to_devanagari(yv), era)); + } else { + result.push(cardinal::to_devanagari(yv)); + } + i = era_end; + continue; + } + } + } + } + + result.push(words[i].to_string()); + i += 1; + continue; + } + + // Number + Month pattern (day first) + if cardinal::is_hi_number_word(words[i]) || cardinal::is_modifier(words[i]) { + let (num_end, num_val) = find_number_span(&words, i); + if let Some(nv) = num_val { + // Check if followed by month + if num_end < words.len() && is_month(words[num_end]) { + let month = words[num_end]; + // Check for year after month + let (year_end, year_val) = find_number_span(&words, num_end + 1); + if let Some(yv) = year_val { + // Check for era suffix + let (era_end, era_str) = find_era_suffix(&words, 
year_end); + if let Some(era) = era_str { + result.push(format!("{} {},", cardinal::to_devanagari(nv), month)); + result.push(format!("{} {}", cardinal::to_devanagari(yv), era)); + } else { + result.push(format!("{} {},", cardinal::to_devanagari(nv), month)); + result.push(cardinal::to_devanagari(yv)); + } + i = era_end; + continue; + } + // Just day + month + result.push(format!("{} {}", cardinal::to_devanagari(nv), month)); + i = num_end + 1; + continue; + } + + // Check for "से" range pattern + if num_end < words.len() && words[num_end] == "से" { + let (end2, val2) = find_number_span(&words, num_end + 1); + if let Some(v2) = val2 { + // Check for era suffix after range + let (era_end, era_str) = find_era_suffix(&words, end2); + // Check for "तक" after range + let (tack_end, has_tack) = + if era_end < words.len() && words[era_end] == "तक" { + (era_end + 1, true) + } else { + (era_end, false) + }; + + if let Some(era) = era_str { + if has_tack { + result.push(format!( + "{}-{} {} तक", + cardinal::to_devanagari(nv), + cardinal::to_devanagari(v2), + era + )); + } else { + result.push(format!( + "{}-{} {}", + cardinal::to_devanagari(nv), + cardinal::to_devanagari(v2), + era + )); + } + } else if has_tack { + result.push(format!( + "{}-{} तक", + cardinal::to_devanagari(nv), + cardinal::to_devanagari(v2), + )); + } else { + result.push(format!( + "{}-{}", + cardinal::to_devanagari(nv), + cardinal::to_devanagari(v2), + )); + } + i = tack_end; + continue; + } + } + + // Check for era suffix directly after number + if num_end < words.len() { + let (era_end, era_str) = find_era_suffix(&words, num_end); + if let Some(era) = era_str { + result.push(format!("{} {}", cardinal::to_devanagari(nv), era)); + i = era_end; + continue; + } + } + } + } + + // Default: pass through + result.push(words[i].to_string()); + i += 1; + } + + result.join(" ") +} + +/// Find a number span starting at position `start`. +/// Returns (end_position, value). 
+fn find_number_span(words: &[&str], start: usize) -> (usize, Option) { + if start >= words.len() { + return (start, None); + } + + let mut end = start; + while end < words.len() { + if cardinal::is_hi_number_word(words[end]) || cardinal::is_modifier(words[end]) { + end += 1; + } else { + break; + } + } + + if end == start { + return (start, None); + } + + let span: Vec<&str> = words[start..end].to_vec(); + let val = cardinal::words_to_number(&span); + if val.is_some() { + (end, val) + } else { + (start, None) + } +} + +/// Find an era suffix (ईसा पूर्व, ईस्वी, ईसवी) starting at `start`. +/// Returns (end_position, era_string). +fn find_era_suffix(words: &[&str], start: usize) -> (usize, Option<&'static str>) { + if start >= words.len() { + return (start, None); + } + + // "ईसा पूर्व" → "ई.पू." + if start + 1 < words.len() && words[start] == "ईसा" && words[start + 1] == "पूर्व" + { + return (start + 2, Some("ई.पू.")); + } + + // "ईस्वी" or "ईसवी" → "ई." + if words[start] == "ईस्वी" || words[start] == "ईसवी" { + return (start + 1, Some("ई.")); + } + + (start, None) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_day_month() { + assert_eq!(process("छः मई"), "६ मई"); + assert_eq!(process("तीस जून"), "३० जून"); + } + + #[test] + fn test_day_month_year() { + assert_eq!(process("पच्चीस मार्च दो हज़ार दस"), "२५ मार्च, २०१०"); + } +} diff --git a/src/asr/hi/decimal.rs b/src/asr/hi/decimal.rs new file mode 100644 index 0000000..7352ace --- /dev/null +++ b/src/asr/hi/decimal.rs @@ -0,0 +1,124 @@ +//! Decimal number tagger for Hindi. +//! +//! Converts Hindi decimal expressions to Devanagari form: +//! - "दो सौ छह दशमलव दो नौ" → "२०६.२९" +//! - "साढ़े तीन सौ दशमलव दो दो" → "३५०.२२" +//! +//! Uses "दशमलव" as the decimal point marker. +//! Fractional digits are parsed individually. + +use super::cardinal; + +/// Process decimal patterns in a string. 
+pub fn process(input: &str) -> String { + let words: Vec<&str> = input.split_whitespace().collect(); + if words.is_empty() { + return input.to_string(); + } + + // Find "दशमलव" and split into integer part + fractional part + let mut result = Vec::new(); + let mut i = 0; + + while i < words.len() { + if words[i] == "दशमलव" { + // Find the integer part before "दशमलव" + let (int_start, int_val) = find_number_before(&words, &result, i); + + // Find the fractional digits after "दशमलव" + let (frac_end, frac_digits) = find_frac_digits_after(&words, i + 1); + + if let (Some(int_val), Some(frac_digits)) = (int_val, frac_digits) { + // Remove integer words from result + let to_remove = result.len() - int_start; + for _ in 0..to_remove { + result.pop(); + } + + let int_str = cardinal::to_devanagari(int_val); + let frac_str = frac_digits + .iter() + .map(|&d| cardinal::to_devanagari_digit(d as u8)) + .collect::(); + result.push(format!("{}.{}", int_str, frac_str)); + i = frac_end; + continue; + } + } + + result.push(words[i].to_string()); + i += 1; + } + + result.join(" ") +} + +/// Find the number words before position `pos` in the word list. +/// Returns (start_index_in_result, value). +fn find_number_before(words: &[&str], result: &[String], pos: usize) -> (usize, Option) { + if pos == 0 { + return (result.len(), None); + } + + // Scan backwards to find number words + let mut start = pos; + while start > 0 { + let w = words[start - 1]; + if cardinal::is_hi_number_word(w) || cardinal::is_modifier(w) { + start -= 1; + } else { + break; + } + } + + if start == pos { + return (result.len(), None); + } + + let num_words: Vec<&str> = words[start..pos].to_vec(); + let val = cardinal::words_to_number(&num_words); + let result_start = result.len() - (pos - start); + + (result_start, val) +} + +/// Find fractional digit words after position `pos`. +/// Returns (end_index, digits). 
+fn find_frac_digits_after(words: &[&str], start: usize) -> (usize, Option>) { + let mut digits = Vec::new(); + let mut end = start; + + while end < words.len() { + if let Some(v) = cardinal::word_to_value(words[end]) { + if v <= 9 { + digits.push(v); + end += 1; + } else { + break; + } + } else { + break; + } + } + + if digits.is_empty() { + (start, None) + } else { + (end, Some(digits)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + assert_eq!(process("दो सौ छह दशमलव दो नौ"), "२०६.२९"); + } + + #[test] + fn test_modifier() { + assert_eq!(process("साढ़े तीन सौ दशमलव दो दो"), "३५०.२२"); + } +} diff --git a/src/asr/hi/fraction.rs b/src/asr/hi/fraction.rs new file mode 100644 index 0000000..e132c21 --- /dev/null +++ b/src/asr/hi/fraction.rs @@ -0,0 +1,362 @@ +//! Fraction tagger for Hindi. +//! +//! Converts Hindi fraction expressions to numeric form: +//! - "एक सौ नौ बटा एक सौ चौबीस" → "१०९/१२४" +//! - "एक सौ तैंतीस सही एक बटा दो" → "१३३ १/२" +//! - "डेढ़" → "१ १/२" +//! - "ढाई" → "२ १/२" +//! - "आधा" → "१/२" +//! - "सवा पैंतीस" → "३५ १/४" +//! - "तीन चौथाई" → "३/४" +//! - "साढ़े चार सौ बटा दस" → "४५०/१०" + +use super::cardinal; + +/// Check if the words starting at `start` contain a scale word. +fn has_scale_word(words: &[&str], start: usize) -> bool { + for j in start..words.len() { + if cardinal::scale_value(words[j]).is_some() { + return true; + } + if !cardinal::is_hi_number_word(words[j]) && !cardinal::is_modifier(words[j]) { + break; + } + } + false +} + +/// Check if word is a unit/currency/time marker that means this modifier is NOT a fraction context. +fn is_non_fraction_context(word: &str) -> bool { + // Time markers + if matches!(word, "बजे" | "बजकर" | "बजके" | "घंटा" | "घंटे") + { + return true; + } + // Measure/money context will be handled by those modules + false +} + +/// Process fraction patterns in a string. 
+pub fn process(input: &str) -> String { + let words: Vec<&str> = input.split_whitespace().collect(); + if words.is_empty() { + return input.to_string(); + } + + let mut result = Vec::new(); + let mut i = 0; + + while i < words.len() { + // Check for standalone special fractions + match words[i] { + "आधा" => { + result.push("१/२".to_string()); + i += 1; + continue; + } + "पाव" => { + result.push("१/४".to_string()); + i += 1; + continue; + } + _ => {} + } + + // Check for "X चौथाई" or "X तिहाई" patterns + if i + 1 < words.len() { + if let Some(n) = cardinal::word_to_value(words[i]) { + if words[i + 1] == "चौथाई" { + result.push(format!("{}/४", cardinal::to_devanagari(n))); + i += 2; + continue; + } + if words[i + 1] == "तिहाई" { + result.push(format!("{}/३", cardinal::to_devanagari(n))); + i += 2; + continue; + } + } + } + + // Check for "X सही Y बटा Z" pattern (mixed fraction) — BEFORE बटा + if let Some((frac_str, consumed)) = try_parse_sahi_fraction(&words, i) { + result.push(frac_str); + i += consumed; + continue; + } + + // Check for "X बटा Y" pattern (simple fraction) + // This handles modifier-led numerators too: "साढ़े चार सौ बटा दस" → "४५०/१०" + if let Some((frac_str, consumed)) = try_parse_bata_fraction(&words, i) { + result.push(frac_str); + i += consumed; + continue; + } + + // Check for standalone modifier-based fractions + // ONLY when the modifier is truly standalone (not followed by scale words or non-fraction context) + if cardinal::is_modifier(words[i]) { + if let Some((frac_str, consumed)) = try_parse_modifier_fraction(&words, i) { + result.push(frac_str); + i += consumed; + continue; + } + } + + result.push(words[i].to_string()); + i += 1; + } + + result.join(" ") +} + +/// Try to parse a "X बटा Y" fraction. 
+fn try_parse_bata_fraction(words: &[&str], start: usize) -> Option<(String, usize)> { + // Find "बटा" in the upcoming words + let mut bata_pos = None; + let max_look = (start + 12).min(words.len()); + + for j in start..max_look { + if words[j] == "बटा" { + bata_pos = Some(j); + break; + } + // Stop looking if we hit a non-number, non-modifier word + if !cardinal::is_hi_number_word(words[j]) && !cardinal::is_modifier(words[j]) { + break; + } + } + + let bata_pos = bata_pos?; + + // Parse numerator (before बटा) + if bata_pos == start { + return None; // No numerator + } + + let num_words: Vec<&str> = words[start..bata_pos].to_vec(); + + // Check if numerator words are valid (number words or modifiers) + if !num_words + .iter() + .all(|w| cardinal::is_hi_number_word(w) || cardinal::is_modifier(w)) + { + return None; + } + + let numerator = cardinal::words_to_number(&num_words)?; + + // Parse denominator (after बटा) + let denom_start = bata_pos + 1; + let mut denom_end = denom_start; + while denom_end < words.len() + && (cardinal::is_hi_number_word(words[denom_end]) + || cardinal::is_modifier(words[denom_end])) + { + denom_end += 1; + } + + if denom_end == denom_start { + return None; + } + + let denom_words: Vec<&str> = words[denom_start..denom_end].to_vec(); + let denominator = cardinal::words_to_number(&denom_words)?; + + let frac_str = format!( + "{}/{}", + cardinal::to_devanagari(numerator), + cardinal::to_devanagari(denominator) + ); + Some((frac_str, denom_end - start)) +} + +/// Try to parse a "X सही Y बटा Z" mixed fraction. 
+fn try_parse_sahi_fraction(words: &[&str], start: usize) -> Option<(String, usize)> { + // Find "सही" in the upcoming words + let mut sahi_pos = None; + let max_look = (start + 12).min(words.len()); + + for j in start..max_look { + if words[j] == "सही" { + sahi_pos = Some(j); + break; + } + if !cardinal::is_hi_number_word(words[j]) && !cardinal::is_modifier(words[j]) { + break; + } + } + + let sahi_pos = sahi_pos?; + + if sahi_pos == start { + return None; + } + + // Parse whole number (before सही) + let whole_words: Vec<&str> = words[start..sahi_pos].to_vec(); + if !whole_words + .iter() + .all(|w| cardinal::is_hi_number_word(w) || cardinal::is_modifier(w)) + { + return None; + } + let whole = cardinal::words_to_number(&whole_words)?; + + // After सही, expect "Y बटा Z" + let frac_start = sahi_pos + 1; + if let Some((frac_str, consumed)) = try_parse_bata_fraction(words, frac_start) { + let result = format!("{} {}", cardinal::to_devanagari(whole), frac_str); + return Some((result, sahi_pos - start + 1 + consumed)); + } + + None +} + +/// Try to parse modifier-based fractions. +/// Only handles truly standalone modifiers (not followed by scale words or non-fraction context). 
+/// - "डेढ़" (alone or followed by non-number) → "१ १/२" +/// - "ढाई" (alone or followed by non-number) → "२ १/२" +/// - "सवा X" (X has no scale word) → "X १/४" +/// - "साढ़े X" (X has no scale word) → "X १/२" +/// - "पौने X" (X has no scale word) → "(X-1) ३/४" +fn try_parse_modifier_fraction(words: &[&str], start: usize) -> Option<(String, usize)> { + let modifier = words[start]; + + match modifier { + "डेढ़" => { + // Only standalone — NOT followed by scale word or number+scale + if start + 1 < words.len() { + let next = words[start + 1]; + // If followed by a number word or scale word, let cardinal/money/measure handle it + if cardinal::is_hi_number_word(next) + || cardinal::is_modifier(next) + || is_non_fraction_context(next) + { + return None; + } + } + Some(("१ १/२".to_string(), 1)) + } + "ढाई" => { + if start + 1 < words.len() { + let next = words[start + 1]; + if cardinal::is_hi_number_word(next) + || cardinal::is_modifier(next) + || is_non_fraction_context(next) + { + return None; + } + } + Some(("२ १/२".to_string(), 1)) + } + "सवा" => { + // सवा + number (no scale) → "N 1/4" + if start + 1 < words.len() { + // If the following number words contain a scale word, let cardinal handle it + if has_scale_word(words, start + 1) { + return None; + } + // If followed by time/money context, skip + if is_non_fraction_context(words[start + 1]) { + return None; + } + // Collect number words + let mut end = start + 1; + while end < words.len() && cardinal::is_hi_number_word(words[end]) { + end += 1; + } + if end > start + 1 { + let num_words: Vec<&str> = words[start + 1..end].to_vec(); + if let Some(val) = cardinal::words_to_number(&num_words) { + return Some(( + format!("{} १/४", cardinal::to_devanagari(val)), + end - start, + )); + } + } + } + // सवा alone at end of input + Some(("१/४".to_string(), 1)) + } + "साढ़े" => { + if start + 1 < words.len() { + // If the following number words contain a scale word, let cardinal handle it + if has_scale_word(words, start + 
1) { + return None; + } + if is_non_fraction_context(words[start + 1]) { + return None; + } + // Collect number words + let mut end = start + 1; + while end < words.len() && cardinal::is_hi_number_word(words[end]) { + end += 1; + } + if end > start + 1 { + let num_words: Vec<&str> = words[start + 1..end].to_vec(); + if let Some(val) = cardinal::words_to_number(&num_words) { + return Some(( + format!("{} १/२", cardinal::to_devanagari(val)), + end - start, + )); + } + } + } + // साढ़े alone + Some(("१/२".to_string(), 1)) + } + "पौन" | "पौना" | "पौने" => { + if start + 1 < words.len() { + // If the following number words contain a scale word, let cardinal handle it + if has_scale_word(words, start + 1) { + return None; + } + if is_non_fraction_context(words[start + 1]) { + return None; + } + // Collect number words + let mut end = start + 1; + while end < words.len() && cardinal::is_hi_number_word(words[end]) { + end += 1; + } + if end > start + 1 { + let num_words: Vec<&str> = words[start + 1..end].to_vec(); + if let Some(val) = cardinal::words_to_number(&num_words) { + let whole = val - 1; + return Some(( + format!("{} ३/४", cardinal::to_devanagari(whole)), + end - start, + )); + } + } + } + // पौन/पौना alone + Some(("३/४".to_string(), 1)) + } + _ => None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_bata() { + assert_eq!(process("एक सौ नौ बटा एक सौ चौबीस"), "१०९/१२४"); + assert_eq!(process("एक सौ एक बटा दो"), "१०१/२"); + } + + #[test] + fn test_sahi() { + assert_eq!(process("एक सौ तैंतीस सही एक बटा दो"), "१३३ १/२"); + } + + #[test] + fn test_standalone() { + assert_eq!(process("डेढ़"), "१ १/२"); + assert_eq!(process("ढाई"), "२ १/२"); + assert_eq!(process("आधा"), "१/२"); + } +} diff --git a/src/asr/hi/measure.rs b/src/asr/hi/measure.rs new file mode 100644 index 0000000..a5b51c1 --- /dev/null +++ b/src/asr/hi/measure.rs @@ -0,0 +1,346 @@ +//! Measure tagger for Hindi. +//! +//! Converts Hindi measurement expressions to numeric form: +//! 
- "दो सौ छह ग्राम" → "२०६ g" +//! - "दो सौ छह दशमलव दो नौ ग्राम" → "२०६.२९ g" +//! - "दो बाई दो" → "२x२" +//! - "साढ़े सात वर्ष" → "७.५ yr" +//! - "पौने ग्यारह घंटे" → "१०.७५ h" +//! - "डेढ़ दर्जन" → "१.५ doz" + +use super::cardinal; + +/// Unit mappings: (Hindi name variants, symbol) +const UNITS: &[(&[&str], &str)] = &[ + (&["वर्गसेंटीमीटर", "वर्ग सेंटीमीटर"], "cm²"), + (&["क्यूबिकमिलीमीटर", "क्यूबिक मिलीमीटर", "घन मिलीमीटर"], "mm³"), + (&["वर्ग माइक्रोमीटर"], "µm²"), + (&["घन फीट", "घनफीट"], "ft³"), + (&["किलोमीटर प्रति घंटा"], "km/h"), + (&["मील प्रति घंटा"], "mi/h"), + (&["मीट्रिक टन"], "t"), + (&["मिलीमीटर"], "mm"), + (&["मिलिग्राम"], "mg"), + (&["माइक्रॉन"], "µm"), + (&["सेल्सियस"], "°C"), + (&["डेसिग्राम"], "dg"), + (&["कैल्विन"], "K"), + (&["किलोमीटर"], "km"), + (&["हेक्टेयर"], "ha"), + (&["ऐंपीयर"], "A"), + (&["गैलन"], "gal"), + (&["महीने", "महीना"], "mo"), + (&["दर्जन"], "doz"), + (&["लीटर"], "L"), + (&["पिंट"], "pt"), + (&["ग्राम"], "g"), + (&["इंच"], "in"), + (&["फुट"], "ft"), + (&["एकड़"], "ac"), + (&["किग्रा"], "kg"), + (&["मीटर"], "m"), + (&["वर्ष"], "yr"), + (&["घंटे", "घंटा"], "h"), +]; + +/// Process measure patterns in a string. +pub fn process(input: &str) -> String { + let words: Vec<&str> = input.split_whitespace().collect(); + if words.is_empty() { + return input.to_string(); + } + + let mut result = Vec::new(); + let mut i = 0; + + while i < words.len() { + // Check for "X बाई Y" (dimension) pattern + if let Some((dim_str, consumed)) = try_parse_dimension(&words, i) { + result.push(dim_str); + i += consumed; + continue; + } + + // Check for number + unit pattern + if let Some((measure_str, consumed)) = try_parse_measure(&words, i) { + result.push(measure_str); + i += consumed; + continue; + } + + result.push(words[i].to_string()); + i += 1; + } + + result.join(" ") +} + +/// Try to parse a measurement expression. 
+fn try_parse_measure(words: &[&str], start: usize) -> Option<(String, usize)> { + // Find a unit within reasonable range after number words + let max_look = (start + 15).min(words.len()); + + for end in start..max_look { + // Try matching unit names starting at position `end` + for &(names, symbol) in UNITS { + for &name in names { + let name_words: Vec<&str> = name.split_whitespace().collect(); + let name_len = name_words.len(); + + if end + name_len > words.len() { + continue; + } + + let matches = name_words + .iter() + .enumerate() + .all(|(j, &nw)| words[end + j] == nw); + if !matches { + continue; + } + + // Found unit at end..end+name_len + // Parse number before it + let span = &words[start..end]; + if span.is_empty() { + continue; + } + + // Check for दशमलव (decimal) + let dashm_pos = span.iter().position(|&w| w == "दशमलव"); + + if let Some(dp) = dashm_pos { + let int_words = &span[..dp]; + let frac_words = &span[dp + 1..]; + + if int_words.is_empty() + || !int_words + .iter() + .all(|w| cardinal::is_hi_number_word(w) || cardinal::is_modifier(w)) + { + continue; + } + + let int_val = cardinal::words_to_number(&int_words.to_vec())?; + + let frac_digits: Vec = frac_words + .iter() + .filter_map(|w| cardinal::word_to_value(w).filter(|&v| v <= 9)) + .collect(); + + if frac_digits.len() != frac_words.len() { + continue; + } + + let int_str = cardinal::to_devanagari(int_val); + let frac_str: String = frac_digits + .iter() + .map(|&d| cardinal::to_devanagari_digit(d as u8)) + .collect(); + + let result = format!("{}.{} {}", int_str, frac_str, symbol); + return Some((result, end + name_len - start)); + } + + // No decimal — check for modifiers that produce decimals + if !span + .iter() + .all(|w| cardinal::is_hi_number_word(w) || cardinal::is_modifier(w)) + { + continue; + } + + // Check if modifier produces a decimal result + if let Some(measure_str) = try_modifier_measure(span, symbol) { + return Some((measure_str, end + name_len - start)); + } + + // Plain 
number + let num_words: Vec<&str> = span.to_vec(); + let val = cardinal::words_to_number(&num_words)?; + let result = format!("{} {}", cardinal::to_devanagari(val), symbol); + return Some((result, end + name_len - start)); + } + } + } + + None +} + +/// Handle modifier-based measures that produce decimal output. +/// Uses find_lowest_scale to correctly apply modifiers to the scale, not the total. +/// e.g., "साढ़े सात" + yr → "७.५ yr" +/// "पौने ग्यारह" + h → "१०.७५ h" +/// "डेढ़" + doz → "१.५ doz" +/// "ढाई" + mo → "२.५ mo" +fn try_modifier_measure(span: &[&str], symbol: &str) -> Option { + if span.is_empty() { + return None; + } + + let modifier = span[0]; + if !cardinal::is_modifier(modifier) { + return None; + } + + let rest = &span[1..]; + + match modifier { + "डेढ़" => { + if rest.is_empty() { + return Some(format!("१.५ {}", symbol)); + } + let base = cardinal::words_to_number(&rest.to_vec())?; + let lowest = cardinal::find_lowest_scale(rest); + let result = base + lowest / 2; + return format_measure_result(result as f64, lowest as f64 / 2.0, symbol); + } + "ढाई" => { + if rest.is_empty() { + return Some(format!("२.५ {}", symbol)); + } + let base = cardinal::words_to_number(&rest.to_vec())?; + let lowest = cardinal::find_lowest_scale(rest); + let result = base + lowest + lowest / 2; + return format_measure_result(result as f64, (lowest + lowest / 2) as f64, symbol); + } + "साढ़े" => { + if rest.is_empty() { + return None; + } + let base = cardinal::words_to_number(&rest.to_vec())?; + let lowest = cardinal::find_lowest_scale(rest); + let half = lowest as f64 / 2.0; + let result = base as f64 + half; + return format_measure_decimal(result, symbol); + } + "सवा" => { + if rest.is_empty() { + return None; + } + let base = cardinal::words_to_number(&rest.to_vec())?; + let lowest = cardinal::find_lowest_scale(rest); + let quarter = lowest as f64 / 4.0; + let result = base as f64 + quarter; + return format_measure_decimal(result, symbol); + } + "पौने" | "पौन" | 
"पौना" => { + if rest.is_empty() { + return None; + } + let base = cardinal::words_to_number(&rest.to_vec())?; + let lowest = cardinal::find_lowest_scale(rest); + let quarter = lowest as f64 / 4.0; + let result = base as f64 - quarter; + return format_measure_decimal(result, symbol); + } + _ => None, + } +} + +/// Format a measure result as decimal or integer. +fn format_measure_decimal(result: f64, symbol: &str) -> Option { + if result == result.floor() { + Some(format!( + "{} {}", + cardinal::to_devanagari(result as i64), + symbol + )) + } else { + let formatted = format!("{:.2}", result); + let trimmed = formatted.trim_end_matches('0').trim_end_matches('.'); + Some(format!( + "{} {}", + cardinal::to_devanagari_str(trimmed), + symbol + )) + } +} + +fn format_measure_result(result: f64, _fraction: f64, symbol: &str) -> Option { + format_measure_decimal(result, symbol) +} + +/// Try to parse a "X बाई Y" dimension pattern. +fn try_parse_dimension(words: &[&str], start: usize) -> Option<(String, usize)> { + // Find "बाई" in upcoming words + let max_look = (start + 8).min(words.len()); + + for j in start..max_look { + if words[j] == "बाई" { + // Parse X before बाई + let x_words: Vec<&str> = words[start..j].to_vec(); + if x_words.is_empty() || !x_words.iter().all(|w| cardinal::is_hi_number_word(w)) { + continue; + } + let x = cardinal::words_to_number(&x_words)?; + + // Parse Y after बाई + let mut y_end = j + 1; + while y_end < words.len() && cardinal::is_hi_number_word(words[y_end]) { + y_end += 1; + } + if y_end == j + 1 { + continue; + } + let y_words: Vec<&str> = words[j + 1..y_end].to_vec(); + let y = cardinal::words_to_number(&y_words)?; + + // Check for trailing unit + let mut unit_str = String::new(); + let mut final_end = y_end; + if y_end < words.len() { + for &(names, symbol) in UNITS { + for &name in names { + let name_words: Vec<&str> = name.split_whitespace().collect(); + let name_len = name_words.len(); + if y_end + name_len <= words.len() { + let 
matches = name_words + .iter() + .enumerate() + .all(|(k, &nw)| words[y_end + k] == nw); + if matches { + unit_str = format!(" {}", symbol); + final_end = y_end + name_len; + break; + } + } + } + if !unit_str.is_empty() { + break; + } + } + } + + let dim = format!( + "{}x{}{}", + cardinal::to_devanagari(x), + cardinal::to_devanagari(y), + unit_str + ); + return Some((dim, final_end - start)); + } + } + + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + assert_eq!(process("दो सौ छह ग्राम"), "२०६ g"); + } + + #[test] + fn test_decimal_measure() { + assert_eq!(process("दो सौ छह दशमलव दो नौ ग्राम"), "२०६.२९ g"); + } + + #[test] + fn test_dimension() { + assert_eq!(process("दो बाई दो"), "२x२"); + } +} diff --git a/src/asr/hi/mod.rs b/src/asr/hi/mod.rs new file mode 100644 index 0000000..1dc807e --- /dev/null +++ b/src/asr/hi/mod.rs @@ -0,0 +1,14 @@ +//! Hindi inverse text normalization taggers. + +pub mod address; +pub mod cardinal; +pub mod date; +pub mod decimal; +pub mod fraction; +pub mod measure; +pub mod money; +pub mod ordinal; +pub mod telephone; +pub mod time; +pub mod whitelist; +pub mod word; diff --git a/src/asr/hi/money.rs b/src/asr/hi/money.rs new file mode 100644 index 0000000..290e85d --- /dev/null +++ b/src/asr/hi/money.rs @@ -0,0 +1,273 @@ +//! Money tagger for Hindi. +//! +//! Converts Hindi currency expressions to symbolic form: +//! - "बारह हज़ार तेरह डॉलर" → "$१२०१३" +//! - "दो सौ छह रुपये दो सौ छह पैसे" → "₹२०६.२०६" +//! - "साढ़े सात सौ डॉलर" → "$७५०" +//! - "ढाई करोड़ रुपए" → "₹२५००००००" + +use super::cardinal; + +/// Currency mappings: (Hindi names, symbol) +/// Multiple Hindi names can map to the same symbol. +/// Longer names listed first to avoid partial matches. 
+const CURRENCIES: &[(&[&str], &str)] = &[ + (&["अल्जीरियाई दिनार"], "دج"), + (&["बेलारूसी रूबल"], "br"), + (&["चीनी युआन"], "元"), + (&["आर्मेनियाई ड्राम"], "֏"), + (&["अरूबान फ्लोरिन"], "ƒ"), + (&["त्रिनिदाद और टोबैगो डॉलर"], "tt$"), + (&["तुर्की लिरा"], "₺"), + (&["युगांडा शिलिंग"], "ush"), + (&["यूक्रेनी ग्रिव्ना"], "₴"), + (&["वेनेजुएलन बोलिवार"], "bs."), + (&["साइप्रस पाउंड"], "cyp"), + (&["बहरीन दिरहम"], ".د.ب"), + (&["अजरबैजानी मनात"], "₼"), + (&["बुरुंडी फ्रैंक"], "fbu"), + (&["कैमन आइलैंड्स डॉलर"], "ci$"), + (&["लिलांगेनी"], "l"), + (&["बिटकॉइन"], "₿"), + (&["वॉन"], "₩"), + (&["लीरा"], "₺"), + (&["यूरो"], "€"), + (&["डॉलर"], "$"), + (&["रुपये", "रुपए", "रुपिया", "रुपेया"], "₹"), + (&["पैसे", "पैसा"], "p"), +]; + +/// Process money patterns in a string. +pub fn process(input: &str) -> String { + let words: Vec<&str> = input.split_whitespace().collect(); + if words.is_empty() { + return input.to_string(); + } + + let mut result = Vec::new(); + let mut i = 0; + + while i < words.len() { + // Try to find a currency name starting at various positions + if let Some((money_str, consumed)) = try_parse_money(&words, i) { + // Remove any number words we already added to result + result.push(money_str); + i += consumed; + continue; + } + + result.push(words[i].to_string()); + i += 1; + } + + result.join(" ") +} + +/// Try to parse a money expression starting at or before position `start`. 
+fn try_parse_money(words: &[&str], start: usize) -> Option<(String, usize)> { + // Scan forward from `start` looking for a currency name + // The pattern is: [number words] [दशमलव digit-words] currency_name + // or: [number words] currency_name [number words] [पैसे/पैसा unit] + + // First, try to find a currency name within a reasonable range + let max_look = (start + 20).min(words.len()); + + for end in start..max_look { + // Try matching currency names starting at position `end` + for &(names, symbol) in CURRENCIES { + for &name in names { + let name_words: Vec<&str> = name.split_whitespace().collect(); + let name_len = name_words.len(); + + if end + name_len > words.len() { + continue; + } + + // Check if words match the currency name + let matches = name_words + .iter() + .enumerate() + .all(|(j, &nw)| words[end + j] == nw); + if !matches { + continue; + } + + // Found a currency at position end..end+name_len + // Now parse the number before it + let (num_start, amount, has_decimal) = parse_money_amount(words, start, end); + + if num_start != start { + // Not starting at our position + continue; + } + + if let Some(amount_str) = amount { + // Special handling for रुपये + पैसे pattern + if symbol == "₹" { + let after_currency = end + name_len; + // Direct: "X रुपये Y पैसे" + if let Some((paise_str, paise_consumed)) = + try_parse_paise(words, after_currency) + { + let money = format!("₹{}.{}", amount_str, paise_str); + return Some((money, end + name_len + paise_consumed - start)); + } + // With और: "X रुपेया और Y पैसा" + if after_currency < words.len() && words[after_currency] == "और" { + if let Some((paise_str, paise_consumed)) = + try_parse_paise(words, after_currency + 1) + { + let money = format!("₹{}.{}", amount_str, paise_str); + return Some((money, end + name_len + 1 + paise_consumed - start)); + } + } + } + + // Check if this is a पैसे amount (separate from rupees) + if symbol == "p" { + let money = format!("p{}", amount_str); + return Some((money, end + 
name_len - start)); + } + + let money = if has_decimal { + format!("{}{}", symbol, amount_str) + } else { + format!("{}{}", symbol, amount_str) + }; + return Some((money, end + name_len - start)); + } + } + } + } + + None +} + +/// Parse the money amount (number + optional दशमलव digits) before a currency name. +/// Returns (actual_start, formatted_amount, has_decimal). +fn parse_money_amount( + words: &[&str], + start: usize, + currency_pos: usize, +) -> (usize, Option, bool) { + if currency_pos <= start { + return (start, None, false); + } + + // Check for "दशमलव" in the span + let span = &words[start..currency_pos]; + + // Find "दशमलव" position + let dashm_pos = span.iter().position(|&w| w == "दशमलव"); + + if let Some(dp) = dashm_pos { + // Integer part before दशमलव + let int_words = &span[..dp]; + let frac_words = &span[dp + 1..]; + + if int_words.is_empty() { + return (start, None, false); + } + + // Check all int_words are number words or modifiers + if !int_words + .iter() + .all(|w| cardinal::is_hi_number_word(w) || cardinal::is_modifier(w)) + { + return (start, None, false); + } + + let int_val = match cardinal::words_to_number(&int_words.to_vec()) { + Some(v) => v, + None => return (start, None, false), + }; + + // Parse fractional digits individually + let frac_digits: Vec = frac_words + .iter() + .filter_map(|w| cardinal::word_to_value(w).filter(|&v| v <= 9)) + .collect(); + + if frac_digits.len() != frac_words.len() { + return (start, None, false); + } + + let int_str = cardinal::to_devanagari(int_val); + let frac_str: String = frac_digits + .iter() + .map(|&d| cardinal::to_devanagari_digit(d as u8)) + .collect(); + + return (start, Some(format!("{}.{}", int_str, frac_str)), true); + } + + // No decimal — just a number + let num_words: Vec<&str> = span.to_vec(); + if !num_words + .iter() + .all(|w| cardinal::is_hi_number_word(w) || cardinal::is_modifier(w)) + { + return (start, None, false); + } + + let val = match cardinal::words_to_number(&num_words) 
{ + Some(v) => v, + None => return (start, None, false), + }; + + (start, Some(cardinal::to_devanagari(val).to_string()), false) +} + +/// Try to parse a पैसे/पैसा amount after the main currency. +/// Pattern: number_words "पैसे"/"पैसा" +fn try_parse_paise(words: &[&str], start: usize) -> Option<(String, usize)> { + if start >= words.len() { + return None; + } + + let mut end = start; + while end < words.len() + && (cardinal::is_hi_number_word(words[end]) + || cardinal::is_modifier(words[end]) + || words[end] == "दशमलव") + { + end += 1; + } + + if end == start || end >= words.len() { + return None; + } + + // Must be followed by पैसे/पैसा + if words[end] != "पैसे" && words[end] != "पैसा" { + return None; + } + + let num_words: Vec<&str> = words[start..end].to_vec(); + let val = cardinal::words_to_number(&num_words)?; + let result = cardinal::to_devanagari(val).to_string(); + + Some((result, end + 1 - start)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + assert_eq!(process("बारह हज़ार तेरह डॉलर"), "$१२०१३"); + assert_eq!(process("छियासठ तुर्की लिरा"), "₺६६"); + } + + #[test] + fn test_decimal() { + assert_eq!(process("बाईस दशमलव शून्य पाँच यूक्रेनी ग्रिव्ना"), "₴२२.०५"); + } + + #[test] + fn test_modifier() { + assert_eq!(process("डेढ़ सौ यूरो"), "€१५०"); + assert_eq!(process("डेढ़ हजार रुपए"), "₹१५००"); + } +} diff --git a/src/asr/hi/ordinal.rs b/src/asr/hi/ordinal.rs new file mode 100644 index 0000000..e202f9e --- /dev/null +++ b/src/asr/hi/ordinal.rs @@ -0,0 +1,121 @@ +//! Ordinal number tagger for Hindi. +//! +//! Converts Hindi ordinal expressions to Devanagari form: +//! - "सौवां" → "१००वां" +//! - "दसवीं" → "१०वीं" +//! - "एक सौ उन्नीसवें" → "११९वें" + +use super::cardinal; + +/// Ordinal suffixes in Hindi: वां, वीं, वें +const ORDINAL_SUFFIXES: &[&str] = &["वीं", "वां", "वें"]; + +/// Process ordinal patterns in a string. 
+pub fn process(input: &str) -> String { + let words: Vec<&str> = input.split_whitespace().collect(); + if words.is_empty() { + return input.to_string(); + } + + let mut result = Vec::new(); + let mut i = 0; + + while i < words.len() { + // Look for a word ending with an ordinal suffix + if let Some((suffix, base_end)) = find_ordinal_suffix(words[i]) { + // Try to parse the base word (the part before the suffix) + // First, try the current word alone + let base_word = &words[i][..base_end]; + + // Try building a multi-word number ending with this ordinal word + let mut best_start = i; + let mut best_val: Option = None; + + // Try spans ending at i + let min_start = if i >= 10 { i - 10 } else { 0 }; + for start in min_start..=i { + // All words from start to i-1 must be number words, plus the base of word[i] + let mut num_words: Vec<&str> = Vec::new(); + let mut valid = true; + + for j in start..i { + if cardinal::is_hi_number_word(words[j]) || cardinal::is_modifier(words[j]) { + num_words.push(words[j]); + } else { + valid = false; + break; + } + } + + if !valid { + continue; + } + + // Add the base part of the ordinal word + if !base_word.is_empty() { + num_words.push(base_word); + } + + if num_words.is_empty() { + continue; + } + + // Try to parse as a number + // For ordinals, the last word might have the suffix stripped + // We need to handle cases like "सौवां" where base="सौ" + if let Some(val) = cardinal::words_to_number(&num_words) { + best_start = start; + best_val = Some(val); + break; // Take the longest span + } + } + + if let Some(val) = best_val { + // Remove previously added words that are part of this number + let to_remove = i - best_start; + for _ in 0..to_remove { + result.pop(); + } + result.push(format!("{}{}", cardinal::to_devanagari(val), suffix)); + i += 1; + continue; + } + } + + result.push(words[i].to_string()); + i += 1; + } + + result.join(" ") +} + +/// Find an ordinal suffix at the end of a word. 
+/// Returns (suffix, byte_position_where_suffix_starts) if found. +fn find_ordinal_suffix(word: &str) -> Option<(&'static str, usize)> { + for &suffix in ORDINAL_SUFFIXES { + if word.ends_with(suffix) { + let base_end = word.len() - suffix.len(); + if base_end > 0 { + return Some((suffix, base_end)); + } + } + } + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + assert_eq!(process("सौवां"), "१००वां"); + assert_eq!(process("दसवीं"), "१०वीं"); + assert_eq!(process("दसवें"), "१०वें"); + } + + #[test] + fn test_compound() { + assert_eq!(process("एक सौ उन्नीसवां"), "११९वां"); + } +} diff --git a/src/asr/hi/telephone.rs b/src/asr/hi/telephone.rs new file mode 100644 index 0000000..d7dc561 --- /dev/null +++ b/src/asr/hi/telephone.rs @@ -0,0 +1,169 @@ +//! Telephone number tagger for Hindi. +//! +//! After cardinal processing, digit words have been converted to Devanagari digits. +//! This module concatenates sequences of single Devanagari digits into phone numbers: +//! - "१ १ १ १ १ १" → "१११११" +//! - "+९१ ९ ८ ७ ६ ..." → "+९१ ९८७६..." +//! - "०२ ०२ ..." → "०२०२..." +//! +//! Also handles प्लस prefix for international numbers and +//! digit words that cardinal may have left as single-character Devanagari digits. + +/// Map English digit word to Devanagari digit. +fn english_digit_to_devanagari(word: &str) -> Option { + match word { + "zero" => Some('०'), + "one" => Some('१'), + "two" => Some('२'), + "three" => Some('३'), + "four" => Some('४'), + "five" => Some('५'), + "six" => Some('६'), + "seven" => Some('७'), + "eight" => Some('८'), + "nine" => Some('९'), + _ => None, + } +} + +/// Check if a string is a single Devanagari digit. +fn is_devanagari_digit(s: &str) -> bool { + let mut chars = s.chars(); + if let Some(c) = chars.next() { + if chars.next().is_none() { + return ('०'..='९').contains(&c); + } + } + false +} + +/// Check if a string is a multi-digit Devanagari number (already converted by cardinal). 
+fn is_devanagari_number(s: &str) -> bool { + !s.is_empty() && s.chars().all(|c| ('०'..='९').contains(&c)) +} + +/// Process telephone patterns in a string. +/// At this point, cardinal has already converted number words to Devanagari digits. +/// We concatenate sequences of single Devanagari digits (and small multi-digit groups). +pub fn process(input: &str) -> String { + let words: Vec<&str> = input.split_whitespace().collect(); + if words.is_empty() { + return input.to_string(); + } + + let mut result = Vec::new(); + let mut i = 0; + + while i < words.len() { + // Check for "प्लस" prefix (international format) + if words[i] == "प्लस" || words[i] == "+" || words[i] == "plus" { + if let Some((phone_str, consumed)) = try_concat_devanagari_digits(&words, i + 1, 4) { + // First two digits form country code + let chars: Vec = phone_str.chars().collect(); + if chars.len() >= 2 { + let country_code: String = chars[..2].iter().collect(); + let rest: String = chars[2..].iter().collect(); + result.push(format!("+{} {}", country_code, rest)); + } else { + result.push(format!("+{}", phone_str)); + } + i += 1 + consumed; + continue; + } + } + + // Check for sequence of Devanagari digit tokens (single digits or small numbers) + if is_devanagari_digit(words[i]) || is_devanagari_number(words[i]) { + if let Some((phone_str, consumed)) = try_concat_devanagari_digits(&words, i, 4) { + result.push(phone_str); + i += consumed; + continue; + } + } + + // Check for English digit word sequences + if english_digit_to_devanagari(words[i]).is_some() { + if let Some((phone_str, consumed)) = try_concat_english_digits(&words, i, 4) { + result.push(phone_str); + i += consumed; + continue; + } + } + + result.push(words[i].to_string()); + i += 1; + } + + result.join(" ") +} + +/// Try to concatenate a sequence of English digit words into Devanagari digits. 
+fn try_concat_english_digits( + words: &[&str], + start: usize, + min_digits: usize, +) -> Option<(String, usize)> { + let mut digits = String::new(); + let mut i = start; + + while i < words.len() { + if let Some(d) = english_digit_to_devanagari(words[i]) { + digits.push(d); + i += 1; + } else { + break; + } + } + + let digit_count = digits.chars().count(); + if digit_count >= min_digits { + Some((digits, i - start)) + } else { + None + } +} + +/// Try to concatenate a sequence of Devanagari digit tokens. +/// Each token should be a single Devanagari digit or small Devanagari number. +/// Requires at least `min_digits` total digits to form a phone number. +fn try_concat_devanagari_digits( + words: &[&str], + start: usize, + min_digits: usize, +) -> Option<(String, usize)> { + let mut digits = String::new(); + let mut i = start; + + while i < words.len() { + if is_devanagari_number(words[i]) { + digits.push_str(words[i]); + i += 1; + } else { + break; + } + } + + let digit_count = digits.chars().count(); + if digit_count >= min_digits { + Some((digits, i - start)) + } else { + None + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + // After cardinal, "एक एक एक एक एक एक" → "१ १ १ १ १ १" + assert_eq!(process("१ १ १ १ १ १"), "११११११"); + assert_eq!(process("१ २ ३ ४ ५ ६"), "१२३४५६"); + } + + #[test] + fn test_international() { + assert_eq!(process("प्लस ९ १ ९ ८ ७ ६ ५ ४ ३ २ १ ०"), "+९१ ९८७६५४३२१०"); + } +} diff --git a/src/asr/hi/time.rs b/src/asr/hi/time.rs new file mode 100644 index 0000000..6c65372 --- /dev/null +++ b/src/asr/hi/time.rs @@ -0,0 +1,479 @@ +//! Time tagger for Hindi. +//! +//! Converts Hindi time expressions to formatted form: +//! - "एक बजे सात मिनट" → "१:०७" +//! - "ग्यारह बजे" → "११:००" +//! - "बारह पन्द्रह" → "१२:१५" +//! - "चार बजे पाँच सेकंड" → "४:००:०५" +//! - "सोलह घंटा एक मिनट सत्ताईस सेकंड" → "१६:०१:२७" +//! - "ढाई बजे" → "२:३०" +//! - "सवा चार बजे" → "४:१५" +//! - "साढ़े ग्यारह" → "११:३०" +//! 
- "पौने पाँच" → "४:४५" +//! - "तीन मिनट उन्नीस सेकंड" → "००:०३:१९" + +use super::cardinal; + +fn is_baje(w: &str) -> bool { + matches!(w, "बजे" | "बजकर" | "बजके") +} + +fn is_minute_word(w: &str) -> bool { + w == "मिनट" +} + +fn is_second_word(w: &str) -> bool { + matches!(w, "सेकंड" | "सेकण्ड") +} + +fn is_hour_word(w: &str) -> bool { + // Only match singular "घंटा" for time; "घंटे" (plural/oblique) is for measure/duration + w == "घंटा" +} + +/// Check if a word is a measurement unit that means this is NOT a time context. +fn is_measure_unit(w: &str) -> bool { + matches!( + w, + "ग्राम" + | "किग्रा" + | "मीटर" + | "किलोमीटर" + | "मिलीमीटर" + | "लीटर" + | "पिंट" + | "गैलन" + | "इंच" + | "फुट" + | "एकड़" + | "हेक्टेयर" + | "वर्ष" + | "महीने" + | "महीना" + | "दर्जन" + | "सेल्सियस" + | "कैल्विन" + | "ऐंपीयर" + | "माइक्रॉन" + | "मिलिग्राम" + | "डेसिग्राम" + | "मीट्रिक" + | "वर्ग" + | "वर्गसेंटीमीटर" + | "क्यूबिकमिलीमीटर" + | "घन" + | "दशमलव" + | "घंटे" + ) +} + +/// Process time patterns in a string. +pub fn process(input: &str) -> String { + let words: Vec<&str> = input.split_whitespace().collect(); + if words.is_empty() { + return input.to_string(); + } + + let mut result = Vec::new(); + let mut i = 0; + + while i < words.len() { + // 1. Modifier-led time: डेढ़/ढाई बजे/घंटा, सवा/साढ़े/पौने + number + बजे/घंटा + // Also: साढ़े X (standalone time) and पौने X (standalone time) + // But NOT when followed by a unit word (measure context) + if cardinal::is_modifier(words[i]) { + if let Some((time_str, consumed)) = try_parse_modifier_time(&words, i) { + result.push(time_str); + i += consumed; + continue; + } + } + + // 2. Duration: X मिनट Y सेकंड (no hour) + if cardinal::is_hi_number_word(words[i]) || cardinal::is_modifier(words[i]) { + if let Some((time_str, consumed)) = try_parse_duration(&words, i) { + result.push(time_str); + i += consumed; + continue; + } + } + + // 3. 
Standard time: X बजे/बजकर/बजके [Y मिनट] [Z सेकंड] + if cardinal::is_hi_number_word(words[i]) { + if let Some((time_str, consumed)) = try_parse_standard_time(&words, i) { + result.push(time_str); + i += consumed; + continue; + } + } + + // 4. X घंटा Y मिनट/सेकंड (only with following मिनट/सेकंड) + if cardinal::is_hi_number_word(words[i]) { + if let Some((time_str, consumed)) = try_parse_ghanta_time(&words, i) { + result.push(time_str); + i += consumed; + continue; + } + } + + // 5. Two-number time: "बारह पन्द्रह" → "१२:१५" + // Only at END of input or followed by non-number, non-time-marker word + // and NOT preceded by another digit word + if cardinal::is_hi_number_word(words[i]) { + if let Some((time_str, consumed)) = try_parse_two_number_time(&words, i, &result) { + result.push(time_str); + i += consumed; + continue; + } + } + + result.push(words[i].to_string()); + i += 1; + } + + result.join(" ") +} + +/// Try to parse modifier-led time. +fn try_parse_modifier_time(words: &[&str], start: usize) -> Option<(String, usize)> { + let modifier = words[start]; + + match modifier { + "डेढ़" => { + // डेढ़ बजे → 1:30, डेढ़ घंटा → 1:30 + if start + 1 < words.len() + && (is_baje(words[start + 1]) || is_hour_word(words[start + 1])) + { + return Some(("१:३०".to_string(), 2)); + } + } + "ढाई" => { + if start + 1 < words.len() + && (is_baje(words[start + 1]) || is_hour_word(words[start + 1])) + { + return Some(("२:३०".to_string(), 2)); + } + } + "सवा" => { + // सवा X बजे → X:15 + if start + 2 < words.len() { + if let Some(hour) = cardinal::word_to_value(words[start + 1]) { + if hour >= 1 && hour <= 24 && is_baje(words[start + 2]) { + return Some((format!("{}:{}", cardinal::to_devanagari(hour), "१५"), 3)); + } + } + } + } + "साढ़े" => { + if start + 1 < words.len() { + if let Some(hour) = cardinal::word_to_value(words[start + 1]) { + if hour >= 1 && hour <= 24 { + // साढ़े X बजे → X:30 + if start + 2 < words.len() && is_baje(words[start + 2]) { + return Some(( + format!("{}:{}", 
cardinal::to_devanagari(hour), "३०"), + 3, + )); + } + // साढ़े X alone — ONLY if NOT followed by unit word or number + if start + 2 < words.len() { + let next = words[start + 2]; + if cardinal::is_hi_number_word(next) + || cardinal::is_modifier(next) + || is_measure_unit(next) + { + return None; + } + } + return Some((format!("{}:{}", cardinal::to_devanagari(hour), "३०"), 2)); + } + } + } + } + "पौने" | "पौन" | "पौना" => { + if start + 1 < words.len() { + if let Some(hour) = cardinal::word_to_value(words[start + 1]) { + if hour >= 2 && hour <= 24 { + let actual_hour = hour - 1; + // पौने X बजे → (X-1):45 + if start + 2 < words.len() && is_baje(words[start + 2]) { + return Some(( + format!("{}:{}", cardinal::to_devanagari(actual_hour), "४५"), + 3, + )); + } + // पौने X घंटा → (X-1):45 + if start + 2 < words.len() && is_hour_word(words[start + 2]) { + return Some(( + format!("{}:{}", cardinal::to_devanagari(actual_hour), "४५"), + 3, + )); + } + // पौने X alone — ONLY if NOT followed by unit word or number + if start + 2 < words.len() { + let next = words[start + 2]; + if cardinal::is_hi_number_word(next) + || cardinal::is_modifier(next) + || is_measure_unit(next) + { + return None; + } + } + return Some(( + format!("{}:{}", cardinal::to_devanagari(actual_hour), "४५"), + 2, + )); + } + } + } + } + _ => {} + } + + None +} + +/// Try to parse standard time: X बजे/बजकर/बजके [Y मिनट] [Z सेकंड] +fn try_parse_standard_time(words: &[&str], start: usize) -> Option<(String, usize)> { + let mut hour_end = start; + while hour_end < words.len() && cardinal::is_hi_number_word(words[hour_end]) { + hour_end += 1; + } + + if hour_end == start || hour_end >= words.len() { + return None; + } + + let time_marker = words[hour_end]; + if !is_baje(time_marker) { + return None; + } + + let hour_words: Vec<&str> = words[start..hour_end].to_vec(); + let hour = cardinal::words_to_number(&hour_words)?; + + let mut pos = hour_end + 1; + let mut minute: Option = None; + let mut second: Option = 
None; + + // Look for minutes + let (min_end, min_val) = find_number_then_keyword(words, pos, is_minute_word); + if let Some(mv) = min_val { + minute = Some(mv); + pos = min_end; + } + + // Look for seconds + let (sec_end, sec_val) = find_number_then_keyword(words, pos, is_second_word); + if let Some(sv) = sec_val { + second = Some(sv); + pos = sec_end; + } + + // If no minutes found but seconds directly follow + if minute.is_none() && second.is_none() { + let (sec_end2, sec_val2) = find_number_then_keyword(words, pos, is_second_word); + if let Some(sv) = sec_val2 { + second = Some(sv); + pos = sec_end2; + } + } + + let time_str = format_time(hour, minute.unwrap_or(0), second); + Some((time_str, pos - start)) +} + +/// Try to parse "X घंटा Y मिनट/सेकंड" (requires at least मिनट or सेकंड following). +fn try_parse_ghanta_time(words: &[&str], start: usize) -> Option<(String, usize)> { + let mut hour_end = start; + while hour_end < words.len() && cardinal::is_hi_number_word(words[hour_end]) { + hour_end += 1; + } + + if hour_end == start || hour_end >= words.len() { + return None; + } + + if !is_hour_word(words[hour_end]) { + return None; + } + + let hour_words: Vec<&str> = words[start..hour_end].to_vec(); + let hour = cardinal::words_to_number(&hour_words)?; + + let mut pos = hour_end + 1; + let mut minute: Option = None; + let mut second: Option = None; + + // Look for minutes + let (min_end, min_val) = find_number_then_keyword(words, pos, is_minute_word); + if let Some(mv) = min_val { + minute = Some(mv); + pos = min_end; + } + + // Look for seconds + let (sec_end, sec_val) = find_number_then_keyword(words, pos, is_second_word); + if let Some(sv) = sec_val { + second = Some(sv); + pos = sec_end; + } + + // If no minutes found but seconds directly follow + if minute.is_none() && second.is_none() { + let (sec_end2, sec_val2) = find_number_then_keyword(words, pos, is_second_word); + if let Some(sv) = sec_val2 { + second = Some(sv); + pos = sec_end2; + } + } + + // Must 
have found at least one of मिनट or सेकंड to be a time expression + if minute.is_none() && second.is_none() { + return None; + } + + let time_str = format_time(hour, minute.unwrap_or(0), second); + Some((time_str, pos - start)) +} + +/// Try to parse two consecutive number words as hour:minute. +/// Very restrictive: only matches when it's clearly a standalone time expression. +/// Must not be part of a longer digit word sequence (address/telephone). +fn try_parse_two_number_time( + words: &[&str], + start: usize, + result: &[String], +) -> Option<(String, usize)> { + if start + 1 >= words.len() { + return None; + } + + // Both must be single-word values + let hour = cardinal::word_to_value(words[start])?; + let minute = cardinal::word_to_value(words[start + 1])?; + + // Valid ranges — hour must be reasonable for time + if hour < 1 || hour > 24 || minute < 0 || minute > 59 { + return None; + } + + // Minute word must represent a value >= 10 (like पन्द्रह=15, अठारह=18) + // Single digits 0-9 are too ambiguous (could be address digits) + if minute < 10 { + return None; + } + + // Must NOT be followed by another digit/number word (would be address/telephone) + if start + 2 < words.len() { + let next = words[start + 2]; + if cardinal::is_hi_number_word(next) || cardinal::is_modifier(next) { + return None; + } + if next == "दशमलव" || is_measure_unit(next) { + return None; + } + } + + // Must NOT be preceded by a digit result or number word + if let Some(last) = result.last() { + if last.chars().all(|c| "०१२३४५६७८९".contains(c)) { + return None; + } + } + // Also check if the word before start is a digit word (not yet processed into result) + if start > 0 && cardinal::is_hi_number_word(words[start - 1]) { + return None; + } + + let time_str = format!( + "{}:{}", + cardinal::to_devanagari(hour), + format_two_digit_devanagari(minute) + ); + Some((time_str, 2)) +} + +/// Try to parse a duration: X मिनट Y सेकंड (no hour) +fn try_parse_duration(words: &[&str], start: usize) -> 
Option<(String, usize)> { + let (min_end, min_val) = find_number_then_keyword(words, start, is_minute_word); + if let Some(mv) = min_val { + let (sec_end, sec_val) = find_number_then_keyword(words, min_end, is_second_word); + if let Some(sv) = sec_val { + let time_str = format!( + "{}:{}:{}", + "००", + format_two_digit_devanagari(mv), + format_two_digit_devanagari(sv) + ); + return Some((time_str, sec_end - start)); + } + } + None +} + +/// Find a number span followed by a keyword. +fn find_number_then_keyword( + words: &[&str], + start: usize, + is_keyword: fn(&str) -> bool, +) -> (usize, Option) { + if start >= words.len() { + return (start, None); + } + + let mut end = start; + while end < words.len() + && (cardinal::is_hi_number_word(words[end]) || cardinal::is_modifier(words[end])) + { + end += 1; + } + + if end == start || end >= words.len() || !is_keyword(words[end]) { + return (start, None); + } + + let num_words: Vec<&str> = words[start..end].to_vec(); + let val = cardinal::words_to_number(&num_words); + if val.is_some() { + (end + 1, val) + } else { + (start, None) + } +} + +fn format_two_digit_devanagari(n: i64) -> String { + let s = format!("{:02}", n); + cardinal::to_devanagari_str(&s) +} + +fn format_time(hour: i64, minute: i64, second: Option) -> String { + let h = cardinal::to_devanagari(hour); + let m = format_two_digit_devanagari(minute); + + if let Some(s) = second { + let sec = format_two_digit_devanagari(s); + format!("{}:{}:{}", h, m, sec) + } else { + format!("{}:{}", h, m) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + assert_eq!(process("एक बजे सात मिनट"), "१:०७"); + assert_eq!(process("ग्यारह बजे"), "११:००"); + } + + #[test] + fn test_modifier() { + assert_eq!(process("ढाई बजे"), "२:३०"); + assert_eq!(process("सवा चार बजे"), "४:१५"); + assert_eq!(process("साढ़े ग्यारह"), "११:३०"); + assert_eq!(process("पौने पाँच"), "४:४५"); + } +} diff --git a/src/asr/hi/whitelist.rs b/src/asr/hi/whitelist.rs new file 
mode 100644 index 0000000..d3835b3 --- /dev/null +++ b/src/asr/hi/whitelist.rs @@ -0,0 +1,76 @@ +//! Whitelist tagger for Hindi. +//! +//! Maps specific Hindi phrases to their abbreviated forms: +//! - "मास्टर निखिल तनिष" → "मा. निखिल तनिष" +//! - "श्रीमती ज्योत्सना" → "स्मि. ज्योत्सना" +//! - "डॉक्टर" → "डॉ." +//! - "पाव" → "१/४" +//! - "आधा कप चाय" → "१/२ कप चाय" + +/// Whitelist entries: (input phrase, output) +/// Sorted longest first to avoid partial matches. +const WHITELIST: &[(&str, &str)] = &[ + ("श्रीमान", "श्री."), + ("श्रीमती", "स्मि."), + ("मास्टर", "मा."), + ("डॉक्टर", "डॉ."), + ("कुमारी", "कु."), + ("पाव", "१/४"), + ("आधा", "१/२"), +]; + +/// Process whitelist patterns in a string. +pub fn process(input: &str) -> String { + let words: Vec<&str> = input.split_whitespace().collect(); + if words.is_empty() { + return input.to_string(); + } + + let mut result = Vec::new(); + let mut i = 0; + + while i < words.len() { + let mut matched = false; + + for &(term, replacement) in WHITELIST { + let term_words: Vec<&str> = term.split_whitespace().collect(); + let term_len = term_words.len(); + + if i + term_len <= words.len() { + let matches = term_words + .iter() + .enumerate() + .all(|(j, &tw)| words[i + j] == tw); + if matches { + result.push(replacement.to_string()); + i += term_len; + matched = true; + break; + } + } + } + + if !matched { + result.push(words[i].to_string()); + i += 1; + } + } + + result.join(" ") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + assert_eq!(process("डॉक्टर"), "डॉ."); + assert_eq!(process("कुमारी"), "कु."); + } + + #[test] + fn test_with_name() { + assert_eq!(process("डॉक्टर प्रशांत"), "डॉ. प्रशांत"); + } +} diff --git a/src/asr/hi/word.rs b/src/asr/hi/word.rs new file mode 100644 index 0000000..477f51b --- /dev/null +++ b/src/asr/hi/word.rs @@ -0,0 +1,9 @@ +//! Word tagger for Hindi. +//! +//! Pass-through: returns input unchanged. +//! Handles words that should not be normalized. 
+ +/// Process word patterns (pass-through). +pub fn process(input: &str) -> String { + input.to_string() +} diff --git a/src/asr/ja/cardinal.rs b/src/asr/ja/cardinal.rs new file mode 100644 index 0000000..d160c75 --- /dev/null +++ b/src/asr/ja/cardinal.rs @@ -0,0 +1,279 @@ +//! Cardinal number tagger for Japanese. +//! +//! Converts kanji numerals to Arabic numerals: +//! - "一" → "1" +//! - "五千億" → "500,000,000,000" +//! - "十一兆一" → "11,000,000,000,001" + +/// Map a single kanji digit to its value. +pub fn kanji_digit(c: char) -> Option { + match c { + '零' | '〇' => Some(0), + '一' => Some(1), + '二' => Some(2), + '三' => Some(3), + '四' => Some(4), + '五' => Some(5), + '六' => Some(6), + '七' => Some(7), + '八' => Some(8), + '九' => Some(9), + _ => None, + } +} + +/// Check if a character is a kanji numeral (digit or scale). +pub fn is_kanji_numeral(c: char) -> bool { + kanji_digit(c).is_some() || matches!(c, '十' | '百' | '千' | '万' | '億' | '兆') +} + +/// Parse a kanji number string to an integer. +/// +/// Handles the full Japanese number system: +/// - Scale: 兆(10^12), 億(10^8), 万(10^4) +/// - Within each group: 千(1000), 百(100), 十(10) + digits +/// +/// Examples: +/// - "一" → 1 +/// - "二十" → 20 +/// - "百" → 100 +/// - "千九百九十九" → 1999 +/// - "五千億" → 500_000_000_000 +/// - "一兆百万" → 1_000_001_000_000 +pub fn kanji_to_number(input: &str) -> Option { + let chars: Vec = input.chars().collect(); + if chars.is_empty() { + return None; + } + + // All characters must be kanji numerals + if !chars.iter().all(|&c| is_kanji_numeral(c)) { + return None; + } + + let mut result: i64 = 0; + let mut i = 0; + + // Process 兆 group + if let Some(pos) = chars.iter().position(|&c| c == '兆') { + let group = if pos == 0 { + 1 + } else { + parse_sub_man(&chars[..pos])? 
+ }; + result += group * 1_000_000_000_000; + i = pos + 1; + } + + // Process 億 group + let remaining = &chars[i..]; + if let Some(pos) = remaining.iter().position(|&c| c == '億') { + let group = if pos == 0 { + 1 + } else { + parse_sub_man(&remaining[..pos])? + }; + result += group * 100_000_000; + i += pos + 1; + } + + // Process 万 group + let remaining = &chars[i..]; + if let Some(pos) = remaining.iter().position(|&c| c == '万') { + let group = if pos == 0 { + 1 + } else { + parse_sub_man(&remaining[..pos])? + }; + result += group * 10_000; + i += pos + 1; + } + + // Process remaining (0-9999) + let remaining = &chars[i..]; + if !remaining.is_empty() { + result += parse_sub_man(remaining)?; + } + + if result == 0 && !chars.iter().any(|&c| c == '零' || c == '〇') { + // Didn't parse anything meaningful + if chars.is_empty() { + return None; + } + } + + Some(result) +} + +/// Parse a sub-万 number (0-9999): 千百十 scale. +fn parse_sub_man(chars: &[char]) -> Option { + if chars.is_empty() { + return None; + } + + let mut result: i64 = 0; + let mut i = 0; + + // Process 千 + if let Some(pos) = chars[i..].iter().position(|&c| c == '千') { + let pos = pos + i; + let multiplier = if pos == i { + 1 // bare 千 + } else if pos == i + 1 { + kanji_digit(chars[i])? + } else { + return None; + }; + result += multiplier * 1000; + i = pos + 1; + } + + // Process 百 + if i < chars.len() { + if let Some(pos) = chars[i..].iter().position(|&c| c == '百') { + let pos = pos + i; + let multiplier = if pos == i { + 1 // bare 百 + } else if pos == i + 1 { + kanji_digit(chars[i])? + } else { + return None; + }; + result += multiplier * 100; + i = pos + 1; + } + } + + // Process 十 + if i < chars.len() { + if let Some(pos) = chars[i..].iter().position(|&c| c == '十') { + let pos = pos + i; + let multiplier = if pos == i { + 1 // bare 十 + } else if pos == i + 1 { + kanji_digit(chars[i])? 
+ } else { + return None; + }; + result += multiplier * 10; + i = pos + 1; + } + } + + // Process remaining digit + if i < chars.len() { + if chars.len() - i == 1 { + result += kanji_digit(chars[i])?; + } else { + return None; // unexpected extra characters + } + } + + Some(result) +} + +/// Format a number with comma separators. +pub fn format_with_commas(n: i64) -> String { + if n == 0 { + return "0".to_string(); + } + + let negative = n < 0; + let mut num = if negative { + (n as i128).abs() as u64 + } else { + n as u64 + }; + let mut groups: Vec = Vec::new(); + + while num > 0 { + let group = num % 1000; + groups.push(group.to_string()); + num /= 1000; + } + + groups.reverse(); + + if groups.is_empty() { + return "0".to_string(); + } + + // First group has no leading zeros + let mut result = groups[0].clone(); + for g in &groups[1..] { + result.push(','); + result.push_str(&format!("{:03}", g.parse::().unwrap())); + } + + if negative { + format!("-{}", result) + } else { + result + } +} + +/// Find and replace kanji number spans in a string. +/// Returns the string with all kanji number sequences replaced by Arabic numerals. 
+pub fn replace_kanji_numbers(input: &str) -> String { + let chars: Vec = input.chars().collect(); + let mut result = String::new(); + let mut i = 0; + + while i < chars.len() { + if is_kanji_numeral(chars[i]) { + // Find the end of the kanji numeral span + let start = i; + while i < chars.len() && is_kanji_numeral(chars[i]) { + i += 1; + } + let kanji_span: String = chars[start..i].iter().collect(); + if let Some(num) = kanji_to_number(&kanji_span) { + result.push_str(&format_with_commas(num)); + } else { + result.push_str(&kanji_span); + } + } else { + result.push(chars[i]); + i += 1; + } + } + + result +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + assert_eq!(kanji_to_number("一"), Some(1)); + assert_eq!(kanji_to_number("百"), Some(100)); + assert_eq!(kanji_to_number("十"), Some(10)); + assert_eq!(kanji_to_number("二十"), Some(20)); + } + + #[test] + fn test_large() { + assert_eq!(kanji_to_number("五千億"), Some(500_000_000_000)); + assert_eq!(kanji_to_number("五兆"), Some(5_000_000_000_000)); + assert_eq!(kanji_to_number("一兆百万"), Some(1_000_001_000_000)); + } + + #[test] + fn test_commas() { + assert_eq!(format_with_commas(1), "1"); + assert_eq!(format_with_commas(100), "100"); + assert_eq!(format_with_commas(1000), "1,000"); + assert_eq!(format_with_commas(50000), "50,000"); + assert_eq!(format_with_commas(500_000_000_000), "500,000,000,000"); + } + + #[test] + fn test_replace() { + assert_eq!( + replace_kanji_numbers("そこに鳥一羽がいます"), + "そこに鳥1羽がいます" + ); + } +} diff --git a/src/asr/ja/date.rs b/src/asr/ja/date.rs new file mode 100644 index 0000000..c0af4fe --- /dev/null +++ b/src/asr/ja/date.rs @@ -0,0 +1,259 @@ +//! Date tagger for Japanese. +//! +//! Converts kanji dates to Arabic numeral form: +//! - "一月二十二日" → "1月22日" +//! - "七十年代" → "70年代" +//! - "三月一日水曜日" → "3月1日(水)" +//! 
- "五から九日" → "5〜9日" + +use super::cardinal; + +/// Day-of-week mappings: full form → abbreviated form +const WEEKDAYS: &[(&str, &str)] = &[ + ("月曜日", "(月)"), + ("火曜日", "(火)"), + ("水曜日", "(水)"), + ("木曜日", "(木)"), + ("金曜日", "(金)"), + ("土曜日", "(土)"), + ("日曜日", "(日)"), +]; + +/// Process date patterns in a string. +pub fn process(input: &str) -> String { + let mut result = input.to_string(); + + // Process day-of-week patterns first (before 日 processing) + // e.g., "三月一日水曜日" → "三月一日(水)" + for &(full, abbr) in WEEKDAYS { + result = result.replace(full, abbr); + } + + // Process range patterns: XからY日, XからY月, XからY年代 + result = process_ranges(&result); + + // Process 世紀 patterns + result = process_suffix(&result, "世紀"); + + // Process 年代 patterns + result = process_suffix(&result, "年代"); + + // Process 年 patterns (but not 年代) + result = process_year(&result); + + // Process 月 patterns (but not 月曜日 which is already handled) + result = process_suffix(&result, "月"); + + // Process 日 patterns (but not 日曜日 etc.) 
+ result = process_day(&result); + + result +} + +/// Process range patterns: "XからY日" → "X〜Y日" +fn process_ranges(input: &str) -> String { + let kara = "から"; + let mut result = String::new(); + let mut remaining = input; + + while let Some(kara_pos) = remaining.find(kara) { + let before_kara = &remaining[..kara_pos]; + let after_kara = &remaining[kara_pos + kara.len()..]; + + // Find kanji number before から + let before_chars: Vec = before_kara.chars().collect(); + let mut num_start = before_chars.len(); + while num_start > 0 && cardinal::is_kanji_numeral(before_chars[num_start - 1]) { + num_start -= 1; + } + + // Find kanji number + suffix after から + let after_chars: Vec = after_kara.chars().collect(); + let mut num_end = 0; + while num_end < after_chars.len() && cardinal::is_kanji_numeral(after_chars[num_end]) { + num_end += 1; + } + + // Check if followed by a date suffix (日, 月, 年代) + let after_num: String = after_chars[num_end..].iter().collect(); + let has_date_suffix = after_num.starts_with('日') + || after_num.starts_with('月') + || after_num.starts_with("年代"); + + if num_start < before_chars.len() && num_end > 0 && has_date_suffix { + let prefix: String = before_chars[..num_start].iter().collect(); + let num1_kanji: String = before_chars[num_start..].iter().collect(); + let num2_kanji: String = after_chars[..num_end].iter().collect(); + + if let (Some(n1), Some(n2)) = ( + cardinal::kanji_to_number(&num1_kanji), + cardinal::kanji_to_number(&num2_kanji), + ) { + result.push_str(&prefix); + result.push_str(&n1.to_string()); + result.push('〜'); + result.push_str(&n2.to_string()); + remaining = &after_kara[num2_kanji.len()..]; + continue; + } + } + + // No match, pass through + result.push_str(before_kara); + result.push_str(kara); + remaining = after_kara; + } + + result.push_str(remaining); + result +} + +/// Process generic suffix: find kanji number before suffix and convert. 
+fn process_suffix(input: &str, suffix: &str) -> String { + let mut result = String::new(); + let mut remaining = input; + + while let Some(pos) = remaining.find(suffix) { + let before = &remaining[..pos]; + let before_chars: Vec = before.chars().collect(); + + // Scan backwards for kanji number + let mut num_start = before_chars.len(); + while num_start > 0 && cardinal::is_kanji_numeral(before_chars[num_start - 1]) { + num_start -= 1; + } + + if num_start < before_chars.len() { + let prefix: String = before_chars[..num_start].iter().collect(); + let kanji: String = before_chars[num_start..].iter().collect(); + result.push_str(&prefix); + if let Some(num) = cardinal::kanji_to_number(&kanji) { + result.push_str(&num.to_string()); + } else { + result.push_str(&kanji); + } + } else { + result.push_str(before); + } + + result.push_str(suffix); + remaining = &remaining[pos + suffix.len()..]; + } + + result.push_str(remaining); + result +} + +/// Process 年 suffix, but avoid matching 年代 (already handled). 
+fn process_year(input: &str) -> String { + let mut result = String::new(); + let mut remaining = input; + + while let Some(pos) = remaining.find('年') { + let after_year = &remaining[pos + '年'.len_utf8()..]; + + // Skip if this is 年代 (already handled) + if after_year.starts_with('代') { + result.push_str(&remaining[..pos + '年'.len_utf8()]); + remaining = after_year; + continue; + } + + let before = &remaining[..pos]; + let before_chars: Vec = before.chars().collect(); + + let mut num_start = before_chars.len(); + while num_start > 0 && cardinal::is_kanji_numeral(before_chars[num_start - 1]) { + num_start -= 1; + } + + if num_start < before_chars.len() { + let prefix: String = before_chars[..num_start].iter().collect(); + let kanji: String = before_chars[num_start..].iter().collect(); + result.push_str(&prefix); + if let Some(num) = cardinal::kanji_to_number(&kanji) { + result.push_str(&num.to_string()); + } else { + result.push_str(&kanji); + } + } else { + result.push_str(before); + } + + result.push('年'); + remaining = after_year; + } + + result.push_str(remaining); + result +} + +/// Process 日 suffix, but avoid matching day-of-week abbreviations like (日). 
+fn process_day(input: &str) -> String { + let mut result = String::new(); + let mut remaining = input; + + while let Some(pos) = remaining.find('日') { + // Check if this 日 is part of a day-of-week abbreviation (日) + // or if it's preceded by ( — skip those + let before = &remaining[..pos]; + if before.ends_with('(') || before.ends_with('(') { + result.push_str(&remaining[..pos + '日'.len_utf8()]); + remaining = &remaining[pos + '日'.len_utf8()..]; + continue; + } + + let before_chars: Vec = before.chars().collect(); + + let mut num_start = before_chars.len(); + while num_start > 0 && cardinal::is_kanji_numeral(before_chars[num_start - 1]) { + num_start -= 1; + } + + if num_start < before_chars.len() { + let prefix: String = before_chars[..num_start].iter().collect(); + let kanji: String = before_chars[num_start..].iter().collect(); + result.push_str(&prefix); + if let Some(num) = cardinal::kanji_to_number(&kanji) { + result.push_str(&num.to_string()); + } else { + result.push_str(&kanji); + } + } else { + result.push_str(before); + } + + result.push('日'); + remaining = &remaining[pos + '日'.len_utf8()..]; + } + + result.push_str(remaining); + result +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + assert_eq!(process("一月"), "1月"); + assert_eq!(process("一月二十二日"), "1月22日"); + } + + #[test] + fn test_weekday() { + assert_eq!(process("三月一日水曜日"), "3月1日(水)"); + } + + #[test] + fn test_range() { + assert_eq!(process("五から九日"), "5〜9日"); + assert_eq!(process("七十から八十年代"), "70〜80年代"); + } + + #[test] + fn test_century() { + assert_eq!(process("二十一世紀"), "21世紀"); + } +} diff --git a/src/asr/ja/decimal.rs b/src/asr/ja/decimal.rs new file mode 100644 index 0000000..ceb6d9c --- /dev/null +++ b/src/asr/ja/decimal.rs @@ -0,0 +1,128 @@ +//! Decimal number tagger for Japanese. +//! +//! Converts spoken Japanese decimals to written form: +//! - "マイナス一点零六" → "-1.06" +//! - "五点三" → "5.3" + +use super::cardinal; + +/// Process decimal patterns in a string. 
+/// Handles: マイナスX点YZ → -X.YZ +pub fn process(input: &str) -> String { + let mut result = String::new(); + let mut remaining = input; + + while !remaining.is_empty() { + // Try to find マイナス or a kanji number followed by 点 + if let Some((before, decimal_str, after)) = find_decimal(remaining) { + result.push_str(before); + result.push_str(&decimal_str); + remaining = after; + } else { + result.push_str(remaining); + break; + } + } + + result +} + +/// Find the next decimal expression in the string. +/// Returns (before, converted_decimal, after). +fn find_decimal(input: &str) -> Option<(&str, String, &str)> { + // Look for マイナス followed by decimal, or plain decimal (X点Y) + let chars: Vec = input.chars().collect(); + let mut byte_pos = 0; + + for (i, &c) in chars.iter().enumerate() { + // Check for マイナス prefix + if c == 'マ' && input[byte_pos..].starts_with("マイナス") { + let minus_len = "マイナス".len(); + let after_minus = &input[byte_pos + minus_len..]; + if let Some((dec_str, dec_byte_len)) = parse_decimal_at(after_minus) { + let before = &input[..byte_pos]; + let after = &input[byte_pos + minus_len + dec_byte_len..]; + return Some((before, format!("-{}", dec_str), after)); + } + } + + // Check for kanji digit that could start a decimal + if cardinal::is_kanji_numeral(c) || c == '零' { + if let Some((dec_str, dec_byte_len)) = parse_decimal_at(&input[byte_pos..]) { + let before = &input[..byte_pos]; + let after = &input[byte_pos + dec_byte_len..]; + return Some((before, dec_str, after)); + } + } + + byte_pos += c.len_utf8(); + } + + None +} + +/// Try to parse a decimal number starting at the given position. +/// Returns (formatted_string, bytes_consumed). 
+fn parse_decimal_at(input: &str) -> Option<(String, usize)> { + let chars: Vec = input.chars().collect(); + if chars.is_empty() { + return None; + } + + // Find 点 position + let ten_pos = chars.iter().position(|&c| c == '点')?; + + // Integer part: kanji before 点 + let int_chars: Vec = chars[..ten_pos].to_vec(); + if int_chars.is_empty() { + return None; + } + + // All int chars must be kanji numerals + if !int_chars.iter().all(|&c| cardinal::is_kanji_numeral(c)) { + return None; + } + + let int_val = cardinal::kanji_to_number(&int_chars.iter().collect::())?; + + // Fractional part: individual kanji digits after 点 + let frac_start = ten_pos + 1; + let mut frac_end = frac_start; + while frac_end < chars.len() { + let c = chars[frac_end]; + if cardinal::kanji_digit(c).is_some() { + frac_end += 1; + } else { + break; + } + } + + if frac_end == frac_start { + return None; // No fractional digits + } + + let frac_digits: String = chars[frac_start..frac_end] + .iter() + .map(|&c| cardinal::kanji_digit(c).unwrap().to_string()) + .collect(); + + let total_bytes: usize = chars[..frac_end].iter().map(|c| c.len_utf8()).sum(); + + Some((format!("{}.{}", int_val, frac_digits), total_bytes)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + assert_eq!(process("マイナス一点零六"), "-1.06"); + assert_eq!(process("五点三"), "5.3"); + } + + #[test] + fn test_contextual() { + assert_eq!(process("答えはマイナス一点零六"), "答えは-1.06"); + } +} diff --git a/src/asr/ja/fraction.rs b/src/asr/ja/fraction.rs new file mode 100644 index 0000000..22ac293 --- /dev/null +++ b/src/asr/ja/fraction.rs @@ -0,0 +1,165 @@ +//! Fraction tagger for Japanese. +//! +//! Converts kanji fractions to Arabic numeral form: +//! - "八分の五" → "5/8" +//! - "マイナス八分の五" → "-5/8" +//! - "一と四分の三" → "1 3/4" +//! - "一荷四分の三" → "1 3/4" +//! +//! Japanese fractions use X分のY where X is denominator and Y is numerator. + +use super::cardinal; + +/// Process fraction patterns in a string. 
+pub fn process(input: &str) -> String { + let bun_no = "分の"; + let mut result = String::new(); + let mut remaining = input; + + while let Some(bun_no_pos) = remaining.find(bun_no) { + let before_bun_no = &remaining[..bun_no_pos]; + let after_bun_no = &remaining[bun_no_pos + bun_no.len()..]; + + // Parse denominator: kanji number immediately before 分の + let before_chars: Vec = before_bun_no.chars().collect(); + let mut denom_start = before_chars.len(); + while denom_start > 0 && cardinal::is_kanji_numeral(before_chars[denom_start - 1]) { + denom_start -= 1; + } + + if denom_start >= before_chars.len() { + // No kanji number before 分の, pass through + result.push_str(&remaining[..bun_no_pos + bun_no.len()]); + remaining = after_bun_no; + continue; + } + + let denom_kanji: String = before_chars[denom_start..].iter().collect(); + let denom = match cardinal::kanji_to_number(&denom_kanji) { + Some(d) => d, + None => { + result.push_str(&remaining[..bun_no_pos + bun_no.len()]); + remaining = after_bun_no; + continue; + } + }; + + // Parse numerator: kanji number immediately after 分の + let after_chars: Vec = after_bun_no.chars().collect(); + let mut numer_end = 0; + while numer_end < after_chars.len() && cardinal::is_kanji_numeral(after_chars[numer_end]) { + numer_end += 1; + } + + if numer_end == 0 { + // No kanji number after 分の, pass through + result.push_str(&remaining[..bun_no_pos + bun_no.len()]); + remaining = after_bun_no; + continue; + } + + let numer_kanji: String = after_chars[..numer_end].iter().collect(); + let numer = match cardinal::kanji_to_number(&numer_kanji) { + Some(n) => n, + None => { + result.push_str(&remaining[..bun_no_pos + bun_no.len()]); + remaining = after_bun_no; + continue; + } + }; + + let numer_byte_len: usize = after_chars[..numer_end].iter().map(|c| c.len_utf8()).sum(); + + // Build prefix before denominator + let prefix_before_denom: String = before_chars[..denom_start].iter().collect(); + + // Check for mixed number: XとY分のZ or X荷Y分のZ + 
if let Some((real_prefix, whole, negative)) = find_mixed_prefix(&prefix_before_denom) { + result.push_str(real_prefix); + if negative { + result.push_str(&format!("-{} {}/{}", whole, numer, denom)); + } else { + result.push_str(&format!("{} {}/{}", whole, numer, denom)); + } + } else if prefix_before_denom.ends_with("マイナス") { + // Negative fraction + let prefix = &prefix_before_denom[..prefix_before_denom.len() - "マイナス".len()]; + result.push_str(prefix); + result.push_str(&format!("-{}/{}", numer, denom)); + } else { + // Simple fraction + result.push_str(&prefix_before_denom); + result.push_str(&format!("{}/{}", numer, denom)); + } + + remaining = &after_bun_no[numer_byte_len..]; + } + + result.push_str(remaining); + result +} + +/// Check for mixed number prefix (XとY or X荷Y) in the text before the denominator. +/// Returns (text_before_whole, whole_number, is_negative) if found. +fn find_mixed_prefix(before_denom: &str) -> Option<(&str, i64, bool)> { + for separator in &["と", "荷"] { + if let Some(sep_pos) = before_denom.rfind(separator) { + let before_sep = &before_denom[..sep_pos]; + let before_sep_chars: Vec = before_sep.chars().collect(); + + // Find kanji number before separator + let mut num_start = before_sep_chars.len(); + while num_start > 0 && cardinal::is_kanji_numeral(before_sep_chars[num_start - 1]) { + num_start -= 1; + } + + if num_start < before_sep_chars.len() { + let kanji: String = before_sep_chars[num_start..].iter().collect(); + if let Some(whole) = cardinal::kanji_to_number(&kanji) { + let prefix = &before_sep[..before_sep.len() - kanji.len()]; + + let (real_prefix, is_negative) = if prefix.ends_with("マイナス") { + (&prefix[..prefix.len() - "マイナス".len()], true) + } else { + (prefix, false) + }; + + return Some((real_prefix, whole, is_negative)); + } + } + } + } + + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + assert_eq!(process("八分の五"), "5/8"); + assert_eq!(process("三分の一"), "1/3"); + } + + #[test] + fn 
test_negative() { + assert_eq!(process("マイナス八分の五"), "-5/8"); + } + + #[test] + fn test_mixed() { + assert_eq!(process("一と四分の三"), "1 3/4"); + assert_eq!(process("マイナス一荷四分の三"), "-1 3/4"); + } + + #[test] + fn test_contextual() { + assert_eq!(process("答えはマイナス八分の五"), "答えは-5/8"); + assert_eq!( + process("三分の一の人がその場を離れた"), + "1/3の人がその場を離れた" + ); + } +} diff --git a/src/asr/ja/mod.rs b/src/asr/ja/mod.rs new file mode 100644 index 0000000..ad7ef97 --- /dev/null +++ b/src/asr/ja/mod.rs @@ -0,0 +1,12 @@ +//! Japanese inverse text normalization. +//! +//! Converts kanji numerals and spoken-form Japanese to written form. +//! Uses a sentence-scanning approach: each processor scans the input +//! for its patterns and replaces kanji number spans in-place. + +pub mod cardinal; +pub mod date; +pub mod decimal; +pub mod fraction; +pub mod ordinal; +pub mod time; diff --git a/src/asr/ja/ordinal.rs b/src/asr/ja/ordinal.rs new file mode 100644 index 0000000..a37b9e5 --- /dev/null +++ b/src/asr/ja/ordinal.rs @@ -0,0 +1,113 @@ +//! Ordinal number tagger for Japanese. +//! +//! Converts kanji ordinals to Arabic numerals: +//! - "一番目" → "1番目" +//! - "第一" → "第1" + +use super::cardinal; + +/// Process ordinal patterns in a string. +/// Handles: X番目 → N番目, 第X → 第N +pub fn process(input: &str) -> String { + let mut result = input.to_string(); + + // Process 番目 patterns: find kanji numbers before 番目 + result = process_banme(&result); + + // Process 第 patterns: find kanji numbers after 第 + result = process_dai(&result); + + result +} + +/// Replace kanji numbers before 番目 with Arabic numerals. 
+fn process_banme(input: &str) -> String { + let suffix = "番目"; + let mut result = String::new(); + let mut remaining = input; + + while let Some(pos) = remaining.find(suffix) { + // Find the kanji number span ending just before 番目 + let before = &remaining[..pos]; + let chars: Vec = before.chars().collect(); + + // Scan backwards from end to find start of kanji number + let mut num_start = chars.len(); + while num_start > 0 && cardinal::is_kanji_numeral(chars[num_start - 1]) { + num_start -= 1; + } + + if num_start < chars.len() { + // Found kanji number before 番目 + let prefix: String = chars[..num_start].iter().collect(); + let kanji: String = chars[num_start..].iter().collect(); + result.push_str(&prefix); + if let Some(num) = cardinal::kanji_to_number(&kanji) { + result.push_str(&num.to_string()); + } else { + result.push_str(&kanji); + } + } else { + result.push_str(before); + } + + result.push_str(suffix); + remaining = &remaining[pos + suffix.len()..]; + } + + result.push_str(remaining); + result +} + +/// Replace kanji numbers after 第 with Arabic numerals. 
+fn process_dai(input: &str) -> String { + let prefix = "第"; + let mut result = String::new(); + let mut remaining = input; + + while let Some(pos) = remaining.find(prefix) { + result.push_str(&remaining[..pos]); + result.push_str(prefix); + + let after = &remaining[pos + prefix.len()..]; + let chars: Vec = after.chars().collect(); + + // Find end of kanji number span + let mut num_end = 0; + while num_end < chars.len() && cardinal::is_kanji_numeral(chars[num_end]) { + num_end += 1; + } + + if num_end > 0 { + let kanji: String = chars[..num_end].iter().collect(); + if let Some(num) = cardinal::kanji_to_number(&kanji) { + result.push_str(&num.to_string()); + } else { + result.push_str(&kanji); + } + remaining = &after[kanji.len()..]; + } else { + remaining = after; + } + } + + result.push_str(remaining); + result +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_banme() { + assert_eq!(process("一番目"), "1番目"); + assert_eq!(process("三千三百三十番目"), "3330番目"); + } + + #[test] + fn test_dai() { + assert_eq!(process("第一"), "第1"); + assert_eq!(process("第七万二千六"), "第72006"); + } +} diff --git a/src/asr/ja/time.rs b/src/asr/ja/time.rs new file mode 100644 index 0000000..1b31135 --- /dev/null +++ b/src/asr/ja/time.rs @@ -0,0 +1,144 @@ +//! Time tagger for Japanese. +//! +//! Converts kanji time expressions to Arabic numeral form: +//! - "七時一分" → "7時1分" +//! - "正午一分前" → "正午1分前" +//! - "零時" → "0時" + +use super::cardinal; + +/// Process time patterns in a string. +pub fn process(input: &str) -> String { + let mut result = input.to_string(); + + // Process 時 patterns (convert kanji before 時) + result = process_hour(&result); + + // Process 分 patterns (convert kanji before 分, but not X分の which is fractions) + result = process_minute(&result); + + result +} + +/// Process 時 suffix: convert kanji numbers before 時 to Arabic. 
+fn process_hour(input: &str) -> String { + let suffix = "時"; + let mut result = String::new(); + let mut remaining = input; + + while let Some(pos) = remaining.find(suffix) { + let before = &remaining[..pos]; + let before_chars: Vec = before.chars().collect(); + + // Scan backwards for kanji number + let mut num_start = before_chars.len(); + while num_start > 0 && cardinal::is_kanji_numeral(before_chars[num_start - 1]) { + num_start -= 1; + } + + // Also handle 零 (not in is_kanji_numeral but is a valid hour digit) + while num_start > 0 && before_chars[num_start - 1] == '零' { + num_start -= 1; + } + + if num_start < before_chars.len() { + let prefix: String = before_chars[..num_start].iter().collect(); + let kanji: String = before_chars[num_start..].iter().collect(); + result.push_str(&prefix); + + // Handle 零 specially + if kanji == "零" { + result.push('0'); + } else if let Some(num) = cardinal::kanji_to_number(&kanji) { + result.push_str(&num.to_string()); + } else { + result.push_str(&kanji); + } + } else { + result.push_str(before); + } + + result.push_str(suffix); + remaining = &remaining[pos + suffix.len()..]; + } + + result.push_str(remaining); + result +} + +/// Process 分 suffix: convert kanji numbers before 分 to Arabic. +/// Skip if followed by の (fraction pattern handled elsewhere). 
+fn process_minute(input: &str) -> String { + let suffix = "分"; + let mut result = String::new(); + let mut remaining = input; + + while let Some(pos) = remaining.find(suffix) { + let after_suffix = &remaining[pos + suffix.len()..]; + + // Skip if this is a fraction pattern (分の) + if after_suffix.starts_with('の') { + result.push_str(&remaining[..pos + suffix.len()]); + remaining = after_suffix; + continue; + } + + let before = &remaining[..pos]; + let before_chars: Vec = before.chars().collect(); + + // Scan backwards for kanji number + let mut num_start = before_chars.len(); + while num_start > 0 && cardinal::is_kanji_numeral(before_chars[num_start - 1]) { + num_start -= 1; + } + + if num_start < before_chars.len() { + let prefix: String = before_chars[..num_start].iter().collect(); + let kanji: String = before_chars[num_start..].iter().collect(); + result.push_str(&prefix); + if let Some(num) = cardinal::kanji_to_number(&kanji) { + result.push_str(&num.to_string()); + } else { + result.push_str(&kanji); + } + } else { + result.push_str(before); + } + + result.push_str(suffix); + remaining = after_suffix; + } + + result.push_str(remaining); + result +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + assert_eq!(process("七時一分"), "7時1分"); + assert_eq!(process("零時"), "0時"); + assert_eq!(process("三時"), "3時"); + } + + #[test] + fn test_modifiers() { + assert_eq!(process("九時十分前"), "9時10分前"); + assert_eq!(process("正午十分過ぎ"), "正午10分過ぎ"); + assert_eq!(process("七時五十分頃"), "7時50分頃"); + } + + #[test] + fn test_noon() { + assert_eq!(process("正午一分前"), "正午1分前"); + } + + #[test] + fn test_skip_fraction() { + // 分の should not be processed as time + assert_eq!(process("三分の一"), "三分の一"); + } +} diff --git a/src/asr/mod.rs b/src/asr/mod.rs index 0cd9a1c..bc00ec1 100644 --- a/src/asr/mod.rs +++ b/src/asr/mod.rs @@ -14,18 +14,11 @@ //! - punctuation: spoken punctuation //! 
- whitelist: pass-through words -pub mod cardinal; -pub mod date; -pub mod decimal; -pub mod electronic; -pub mod measure; -pub mod money; -pub mod ordinal; -pub mod punctuation; -pub mod telephone; -pub mod time; -pub mod whitelist; -pub mod word; - -// TODO: Add remaining taggers -// pub mod fraction; +// Languages +pub mod de; +pub mod en; +pub mod es; +pub mod fr; +pub mod hi; +pub mod ja; +pub mod zh; diff --git a/src/asr/zh/cardinal.rs b/src/asr/zh/cardinal.rs new file mode 100644 index 0000000..6cacf48 --- /dev/null +++ b/src/asr/zh/cardinal.rs @@ -0,0 +1,612 @@ +//! Cardinal number tagger for Chinese. +//! +//! Converts Chinese numerals to Arabic numerals: +//! - "一百" → "100" +//! - "一万" → "1万" +//! - "九千九百九十九" → "9,999" +//! - "一億零一萬一千一百一十一" → "100,011,111" +//! +//! Handles both simplified and traditional characters. +//! Numbers with only 万/億 scale and no sub-units preserve the scale character. + +/// Map a single Chinese digit to its value. +/// Handles standard, traditional, and financial (大写) forms. +pub fn zh_digit(c: char) -> Option { + match c { + '零' | '〇' => Some(0), + '一' | '壹' => Some(1), + '二' | '两' | '兩' | '贰' | '貳' => Some(2), + '三' | '叁' | '參' => Some(3), + '四' | '肆' => Some(4), + '五' | '伍' => Some(5), + '六' | '陆' | '陸' => Some(6), + '七' | '柒' => Some(7), + '八' | '捌' => Some(8), + '九' | '玖' => Some(9), + _ => None, + } +} + +/// Check if a character is a Chinese numeral (digit or scale). +pub fn is_zh_numeral(c: char) -> bool { + zh_digit(c).is_some() || is_scale(c) +} + +/// Check if a character is a scale multiplier. +fn is_scale(c: char) -> bool { + matches!( + c, + '十' | '拾' | '百' | '佰' | '千' | '仟' | '万' | '萬' | '亿' | '億' + ) +} + +/// Scale value for a scale character. +fn scale_value(c: char) -> Option { + match c { + '十' | '拾' => Some(10), + '百' | '佰' => Some(100), + '千' | '仟' => Some(1000), + '万' | '萬' => Some(10_000), + '亿' | '億' => Some(100_000_000), + _ => None, + } +} + +/// Check if char is 万 or 萬. 
+fn is_wan(c: char) -> bool { + c == '万' || c == '萬' +} + +/// Check if char is 亿 or 億. +fn is_yi(c: char) -> bool { + c == '亿' || c == '億' +} + +/// Parse a Chinese number string to an integer. +/// +/// Handles the full Chinese number system including: +/// - Standard digits: 一二三四五六七八九 +/// - Traditional: 壹贰叁肆伍陆柒捌玖 +/// - Scales: 十百千万億 +/// - 零 as placeholder between non-adjacent scales +/// - 两/兩 as alternate for 2 +pub fn zh_to_number(input: &str) -> Option { + let chars: Vec = input.chars().collect(); + if chars.is_empty() { + return None; + } + + // All characters must be Chinese numerals + if !chars.iter().all(|&c| is_zh_numeral(c)) { + return None; + } + + // Reject if the input is solely 万/萬/亿/億 (which appear in formatted output) + if chars.iter().all(|&c| is_wan(c) || is_yi(c)) { + return None; + } + + let mut result: i64 = 0; + let mut i = 0; + + // Process 億 group + if let Some(pos) = chars.iter().position(|&c| is_yi(c)) { + let group = if pos == 0 { + 1 + } else { + parse_sub_yi(&chars[..pos])? + }; + result += group * 100_000_000; + i = pos + 1; + + // Skip 零 after 億 + if i < chars.len() && chars[i] == '零' { + i += 1; + } + } + + // Process 万 group + let remaining = &chars[i..]; + if let Some(pos) = remaining.iter().position(|&c| is_wan(c)) { + let group = if pos == 0 { + 1 + } else { + parse_sub_wan(&remaining[..pos])? + }; + result += group * 10_000; + i += pos + 1; + + // Skip 零 after 万 + if i < chars.len() && chars[i] == '零' { + i += 1; + } + } + + // Process remaining (0-9999) + let remaining = &chars[i..]; + if !remaining.is_empty() { + result += parse_sub_wan(remaining)?; + } + + if result == 0 && !chars.iter().any(|&c| c == '零' || c == '〇') { + if chars.is_empty() { + return None; + } + } + + Some(result) +} + +/// Parse a sub-万 number (0-9999): 千百十 scale. 
+fn parse_sub_wan(chars: &[char]) -> Option { + if chars.is_empty() { + return None; + } + + // Handle single zero + if chars.len() == 1 && chars[0] == '零' { + return Some(0); + } + + let mut result: i64 = 0; + let mut i = 0; + + // Skip leading 零 + if i < chars.len() && chars[i] == '零' { + i += 1; + } + + // Process 千 + if let Some(pos) = chars[i..].iter().position(|&c| c == '千' || c == '仟') { + let pos = pos + i; + let multiplier = if pos == i { + 1 // bare 千 + } else if pos == i + 1 { + zh_digit(chars[i])? + } else { + return None; + }; + result += multiplier * 1000; + i = pos + 1; + + // Skip 零 + if i < chars.len() && chars[i] == '零' { + i += 1; + } + } + + // Process 百 + if i < chars.len() { + if let Some(pos) = chars[i..].iter().position(|&c| c == '百' || c == '佰') { + let pos = pos + i; + let multiplier = if pos == i { + 1 // bare 百 + } else if pos == i + 1 { + zh_digit(chars[i])? + } else { + return None; + }; + result += multiplier * 100; + i = pos + 1; + + // Skip 零 + if i < chars.len() && chars[i] == '零' { + i += 1; + } + } + } + + // Process 十 + if i < chars.len() { + if let Some(pos) = chars[i..].iter().position(|&c| c == '十' || c == '拾') { + let pos = pos + i; + let multiplier = if pos == i { + 1 // bare 十 + } else if pos == i + 1 { + zh_digit(chars[i])? + } else { + return None; + }; + result += multiplier * 10; + i = pos + 1; + } + } + + // Process remaining digit + if i < chars.len() { + if chars.len() - i == 1 { + if chars[i] == '零' { + // trailing zero, ignore + } else { + result += zh_digit(chars[i])?; + } + } else { + return None; // unexpected extra characters + } + } + + Some(result) +} + +/// Parse a sub-億 number (up to 9999万9999). +/// This handles the range above 万 but below 億. 
+fn parse_sub_yi(chars: &[char]) -> Option { + if chars.is_empty() { + return None; + } + + let mut result: i64 = 0; + let mut i = 0; + + // Process 万 group within the 億 group + if let Some(pos) = chars.iter().position(|&c| is_wan(c)) { + let group = if pos == 0 { + 1 + } else { + parse_sub_wan(&chars[..pos])? + }; + result += group * 10_000; + i = pos + 1; + + // Skip 零 after 万 + if i < chars.len() && chars[i] == '零' { + i += 1; + } + } + + // Process remaining sub-万 + let remaining = &chars[i..]; + if !remaining.is_empty() { + result += parse_sub_wan(remaining)?; + } + + Some(result) +} + +/// Format a number with comma separators. +pub fn format_with_commas(n: i64) -> String { + if n == 0 { + return "0".to_string(); + } + + let negative = n < 0; + let mut num = if negative { + (n as i128).abs() as u64 + } else { + n as u64 + }; + let mut groups: Vec = Vec::new(); + + while num > 0 { + let group = num % 1000; + groups.push(group.to_string()); + num /= 1000; + } + + groups.reverse(); + + if groups.is_empty() { + return "0".to_string(); + } + + let mut result = groups[0].clone(); + for g in &groups[1..] { + result.push(','); + result.push_str(&format!("{:03}", g.parse::().unwrap())); + } + + if negative { + format!("-{}", result) + } else { + result + } +} + +/// Determine the output format for a Chinese number expression. +/// +/// Chinese cardinal output follows these rules: +/// - If the number has only a 万/億-scale with no sub-units, preserve the scale char: +/// "一万" → "1万", "十万" → "10万", "一百万" → "100万" +/// "一億" → "1億", "十億" → "10億" +/// - If the number has sub-万 digits after 万, expand fully with commas: +/// "一万一千" → "11,000", "九十万五千八百二十五" → "905,825" +/// - If the number has sub-億 digits after 億 that go below 万, expand fully. +/// +/// Returns the formatted string. 
+pub fn format_zh_cardinal(input: &str) -> Option { + let chars: Vec = input.chars().collect(); + if chars.is_empty() { + return None; + } + + if !chars.iter().all(|&c| is_zh_numeral(c)) { + return None; + } + + // Reject if the input is solely 万/萬/亿/億 (which appear in formatted output) + if chars.iter().all(|&c| is_wan(c) || is_yi(c)) { + return None; + } + + // Find 億 and 万 positions + let yi_pos = chars.iter().position(|&c| is_yi(c)); + let wan_pos_after_yi = if let Some(yp) = yi_pos { + chars[yp + 1..] + .iter() + .position(|&c| is_wan(c)) + .map(|p| p + yp + 1) + } else { + chars.iter().position(|&c| is_wan(c)) + }; + + // Determine if we have sub-units after the highest scale + let has_yi = yi_pos.is_some(); + let has_wan = wan_pos_after_yi.is_some(); + + if has_yi { + let yp = yi_pos.unwrap(); + let yi_char = chars[yp]; // preserve original 億/亿 + let yi_multiplier_chars = &chars[..yp]; + + // Parse the 億 multiplier + let yi_mult = if yi_multiplier_chars.is_empty() { + 1 + } else { + parse_sub_wan(yi_multiplier_chars)? + }; + + // Check what comes after 億 + let after_yi_start = yp + 1; + let mut after_yi = &chars[after_yi_start..]; + + // Skip 零 + if !after_yi.is_empty() && after_yi[0] == '零' { + after_yi = &after_yi[1..]; + } + + if after_yi.is_empty() { + // Pure 億 number: N億 (with commas in multiplier if ≥1000) + let mult_str = if yi_mult >= 1000 { + format_with_commas(yi_mult) + } else { + yi_mult.to_string() + }; + return Some(format!("{}{}", mult_str, yi_char)); + } + + // Check if after_yi contains 万 + if let Some(wp) = after_yi.iter().position(|&c| is_wan(c)) { + let wan_char = after_yi[wp]; + let wan_mult_chars = &after_yi[..wp]; + let wan_mult = if wan_mult_chars.is_empty() { + 1 + } else { + parse_sub_wan(wan_mult_chars)? 
+ }; + + let after_wan_start = wp + 1; + let mut after_wan = &after_yi[after_wan_start..]; + + // Skip 零 + if !after_wan.is_empty() && after_wan[0] == '零' { + after_wan = &after_wan[1..]; + } + + if after_wan.is_empty() { + // 億 + 万 only, no sub-万: use mixed format like "N億" but with 万 in between + // Actually looking at test data, patterns like 一億一千萬 → 110,000,000 + // So when there's 億 AND 万, we always expand fully + let total = yi_mult * 100_000_000 + wan_mult * 10_000; + return Some(format_with_commas(total)); + } + + // Has sub-万 digits + let sub_wan = parse_sub_wan(after_wan)?; + let total = yi_mult * 100_000_000 + wan_mult * 10_000 + sub_wan; + return Some(format_with_commas(total)); + } + + // After 億 with no 万 — just sub-万 digits + let sub_wan = parse_sub_wan(after_yi)?; + let total = yi_mult * 100_000_000 + sub_wan; + return Some(format_with_commas(total)); + } + + if has_wan { + let wp = wan_pos_after_yi.unwrap(); + let wan_char = chars[wp]; // preserve original 万/萬 + let wan_mult_chars = &chars[..wp]; + + let wan_mult = if wan_mult_chars.is_empty() { + 1 + } else { + parse_sub_wan(wan_mult_chars)? + }; + + let after_wan_start = wp + 1; + let mut after_wan = &chars[after_wan_start..]; + + // Skip 零 + if !after_wan.is_empty() && after_wan[0] == '零' { + after_wan = &after_wan[1..]; + } + + if after_wan.is_empty() { + // Pure 万 number: N万 (with commas in multiplier if ≥1000) + let mult_str = if wan_mult >= 1000 { + format_with_commas(wan_mult) + } else { + wan_mult.to_string() + }; + return Some(format!("{}{}", mult_str, wan_char)); + } + + // Has sub-万 digits — expand fully + let sub_wan = parse_sub_wan(after_wan)?; + let total = wan_mult * 10_000 + sub_wan; + return Some(format_with_commas(total)); + } + + // No 万 or 億 — plain number + let num = parse_sub_wan(&chars)?; + Some(format_with_commas(num)) +} + +/// Format for ordinals: same as cardinal but no commas in expanded numbers, +/// and 万/億 multipliers are plain (no commas either). 
+pub fn format_zh_ordinal(input: &str) -> Option { + let chars: Vec = input.chars().collect(); + if chars.is_empty() { + return None; + } + + if !chars.iter().all(|&c| is_zh_numeral(c)) { + return None; + } + + // Find 億 and 万 positions + let yi_pos = chars.iter().position(|&c| is_yi(c)); + let wan_pos_after_yi = if let Some(yp) = yi_pos { + chars[yp + 1..] + .iter() + .position(|&c| is_wan(c)) + .map(|p| p + yp + 1) + } else { + chars.iter().position(|&c| is_wan(c)) + }; + + let has_yi = yi_pos.is_some(); + let has_wan = wan_pos_after_yi.is_some(); + + if has_yi { + let yp = yi_pos.unwrap(); + let yi_char = chars[yp]; + let yi_mult = if yp == 0 { + 1 + } else { + parse_sub_wan(&chars[..yp])? + }; + + let mut after_yi = &chars[yp + 1..]; + if !after_yi.is_empty() && after_yi[0] == '零' { + after_yi = &after_yi[1..]; + } + + if after_yi.is_empty() { + return Some(format!("{}{}", yi_mult, yi_char)); + } + + // Has stuff after 億 — expand fully (no commas) + let total = zh_to_number(input)?; + return Some(total.to_string()); + } + + if has_wan { + let wp = wan_pos_after_yi.unwrap(); + let wan_char = chars[wp]; + let wan_mult = if wp == 0 { + 1 + } else { + parse_sub_wan(&chars[..wp])? + }; + + let mut after_wan = &chars[wp + 1..]; + if !after_wan.is_empty() && after_wan[0] == '零' { + after_wan = &after_wan[1..]; + } + + if after_wan.is_empty() { + return Some(format!("{}{}", wan_mult, wan_char)); + } + + // Has sub-万 digits — expand fully (no commas) + let total = zh_to_number(input)?; + return Some(total.to_string()); + } + + // No 万 or 億 — plain number + let num = parse_sub_wan(&chars)?; + Some(num.to_string()) +} + +/// Format for money: no commas, no 万-preservation. Plain number output. +pub fn format_zh_money(input: &str) -> Option { + let num = zh_to_number(input)?; + Some(num.to_string()) +} + +/// Find and replace Chinese number spans in a string. 
+pub fn replace_zh_numbers(input: &str) -> String { + let chars: Vec = input.chars().collect(); + let mut result = String::new(); + let mut i = 0; + + while i < chars.len() { + if is_zh_numeral(chars[i]) { + let start = i; + while i < chars.len() && is_zh_numeral(chars[i]) { + i += 1; + } + let span: String = chars[start..i].iter().collect(); + if let Some(formatted) = format_zh_cardinal(&span) { + result.push_str(&formatted); + } else { + result.push_str(&span); + } + } else { + result.push(chars[i]); + i += 1; + } + } + + result +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + assert_eq!(format_zh_cardinal("一百"), Some("100".to_string())); + assert_eq!(format_zh_cardinal("两百"), Some("200".to_string())); + assert_eq!(format_zh_cardinal("九百五十一"), Some("951".to_string())); + } + + #[test] + fn test_wan_preserved() { + assert_eq!(format_zh_cardinal("一万"), Some("1万".to_string())); + assert_eq!(format_zh_cardinal("十万"), Some("10万".to_string())); + assert_eq!(format_zh_cardinal("一百万"), Some("100万".to_string())); + } + + #[test] + fn test_wan_expanded() { + assert_eq!(format_zh_cardinal("一万一千"), Some("11,000".to_string())); + assert_eq!( + format_zh_cardinal("九千九百九十九"), + Some("9,999".to_string()) + ); + } + + #[test] + fn test_yi() { + assert_eq!(format_zh_cardinal("一億"), Some("1億".to_string())); + assert_eq!( + format_zh_cardinal("一億一千萬"), + Some("110,000,000".to_string()) + ); + } + + #[test] + fn test_traditional() { + assert_eq!(format_zh_cardinal("十萬"), Some("10萬".to_string())); + } + + #[test] + fn test_commas() { + assert_eq!(format_with_commas(1), "1"); + assert_eq!(format_with_commas(1000), "1,000"); + assert_eq!(format_with_commas(905825), "905,825"); + } +} diff --git a/src/asr/zh/date.rs b/src/asr/zh/date.rs new file mode 100644 index 0000000..9179284 --- /dev/null +++ b/src/asr/zh/date.rs @@ -0,0 +1,138 @@ +//! Date tagger for Chinese. +//! +//! Converts Chinese date expressions to Arabic numeral form: +//! 
- "一七九八年五月三十日" → "1798年5月30日" +//! - "公元一八三五年" → "公元1835年" +//! - "公元前一九九四年一月二日" → "公元前1994年1月2日" +//! - "纪元前一九三四年一月二日" → "公元前1934年1月2日" +//! - "纪元二零五六年二月三日" → "公元2056年2月3日" +//! +//! Year digits are parsed individually (一七九八 → 1798), +//! month and day use compound parsing (三十 → 30). + +use super::cardinal; + +/// Process date patterns in a string. +pub fn process(input: &str) -> String { + let mut result = input.to_string(); + + // Normalize 纪元前 → 公元前, 纪元 → 公元 (must do 纪元前 first) + result = result.replace("纪元前", "公元前"); + result = result.replace("纪元", "公元"); + + // Process 年 patterns (year digits individually) + result = process_year(&result); + + // Process 月 patterns + result = process_suffix(&result, "月"); + + // Process 日 patterns + result = process_suffix(&result, "日"); + + result +} + +/// Process year: digits before 年 are parsed individually (one digit per kanji). +fn process_year(input: &str) -> String { + let suffix = "年"; + let mut result = String::new(); + let mut remaining = input; + + while let Some(pos) = remaining.find(suffix) { + let before = &remaining[..pos]; + let before_chars: Vec = before.chars().collect(); + + // Scan backwards for Chinese digits (individual year digits) + let mut num_start = before_chars.len(); + while num_start > 0 && cardinal::zh_digit(before_chars[num_start - 1]).is_some() { + num_start -= 1; + } + + if num_start < before_chars.len() { + let prefix: String = before_chars[..num_start].iter().collect(); + result.push_str(&prefix); + + // Convert each digit individually + for &c in &before_chars[num_start..] { + if let Some(d) = cardinal::zh_digit(c) { + result.push_str(&d.to_string()); + } else { + result.push(c); + } + } + } else { + result.push_str(before); + } + + result.push_str(suffix); + remaining = &remaining[pos + suffix.len()..]; + } + + result.push_str(remaining); + result +} + +/// Process generic suffix (月, 日): kanji number before suffix is compound-parsed. 
+fn process_suffix(input: &str, suffix: &str) -> String { + let mut result = String::new(); + let mut remaining = input; + + while let Some(pos) = remaining.find(suffix) { + let before = &remaining[..pos]; + let before_chars: Vec = before.chars().collect(); + + // Scan backwards for Chinese numerals + let mut num_start = before_chars.len(); + while num_start > 0 && cardinal::is_zh_numeral(before_chars[num_start - 1]) { + num_start -= 1; + } + + if num_start < before_chars.len() { + let prefix: String = before_chars[..num_start].iter().collect(); + let kanji: String = before_chars[num_start..].iter().collect(); + result.push_str(&prefix); + if let Some(num) = cardinal::zh_to_number(&kanji) { + result.push_str(&num.to_string()); + } else { + result.push_str(&kanji); + } + } else { + result.push_str(before); + } + + result.push_str(suffix); + remaining = &remaining[pos + suffix.len()..]; + } + + result.push_str(remaining); + result +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_full_date() { + assert_eq!(process("一七九八年五月三十日"), "1798年5月30日"); + } + + #[test] + fn test_partial() { + assert_eq!(process("五月三十日"), "5月30日"); + assert_eq!(process("一七九八年"), "1798年"); + assert_eq!(process("八月"), "8月"); + } + + #[test] + fn test_gongyuan() { + assert_eq!(process("公元一七九八年五月三十日"), "公元1798年5月30日"); + assert_eq!(process("公元前一七九八年"), "公元前1798年"); + } + + #[test] + fn test_jiyuan() { + assert_eq!(process("纪元前一九三四年一月二日"), "公元前1934年1月2日"); + assert_eq!(process("纪元二零五六年二月三日"), "公元2056年2月3日"); + } +} diff --git a/src/asr/zh/decimal.rs b/src/asr/zh/decimal.rs new file mode 100644 index 0000000..57a8c48 --- /dev/null +++ b/src/asr/zh/decimal.rs @@ -0,0 +1,143 @@ +//! Decimal number tagger for Chinese. +//! +//! Converts Chinese decimal expressions to Arabic numeral form: +//! - "一点零五六" → "1.056" +//! - "负五万点二四五" → "-50,000.245" +//! - "壹佰点叁肆" → "100.34" +//! +//! Handles: 点/點 as decimal point, 负/負 as negative prefix, +//! traditional/financial characters. 
+ +use super::cardinal; + +/// Process decimal patterns in a string. +pub fn process(input: &str) -> String { + let mut result = String::new(); + let mut remaining = input; + + while !remaining.is_empty() { + if let Some((before, dec_str, after)) = find_decimal(remaining) { + result.push_str(before); + result.push_str(&dec_str); + remaining = after; + } else { + result.push_str(remaining); + break; + } + } + + result +} + +/// Find the next decimal expression in the string. +fn find_decimal(input: &str) -> Option<(&str, String, &str)> { + let chars: Vec = input.chars().collect(); + let mut byte_pos = 0; + + for (_i, &c) in chars.iter().enumerate() { + // Check for 负/負 prefix + if c == '负' || c == '負' { + let after_neg = &input[byte_pos + c.len_utf8()..]; + if let Some((dec_str, dec_byte_len)) = parse_decimal_at(after_neg) { + let before = &input[..byte_pos]; + let after = &input[byte_pos + c.len_utf8() + dec_byte_len..]; + return Some((before, format!("-{}", dec_str), after)); + } + } + + // Check for Chinese digit that could start a decimal + if cardinal::is_zh_numeral(c) { + if let Some((dec_str, dec_byte_len)) = parse_decimal_at(&input[byte_pos..]) { + let before = &input[..byte_pos]; + let after = &input[byte_pos + dec_byte_len..]; + return Some((before, dec_str, after)); + } + } + + byte_pos += c.len_utf8(); + } + + None +} + +/// Try to parse a decimal number starting at the given position. +/// Returns (formatted_string, bytes_consumed). 
+fn parse_decimal_at(input: &str) -> Option<(String, usize)> { + let chars: Vec = input.chars().collect(); + if chars.is_empty() { + return None; + } + + // Find 点/點 position + let dian_pos = chars.iter().position(|&c| c == '点' || c == '點')?; + + // Integer part: Chinese numerals before 点 + let int_chars: Vec = chars[..dian_pos].to_vec(); + if int_chars.is_empty() { + return None; + } + + // All int chars must be Chinese numerals + if !int_chars.iter().all(|&c| cardinal::is_zh_numeral(c)) { + return None; + } + + // Parse integer part — fully expand (no 万-preservation for decimals) + let int_str: String = int_chars.iter().collect(); + let int_val = cardinal::zh_to_number(&int_str)?; + let int_formatted = cardinal::format_with_commas(int_val); + + // Fractional part: individual Chinese digits after 点 + let frac_start = dian_pos + 1; + let mut frac_end = frac_start; + while frac_end < chars.len() { + let c = chars[frac_end]; + if cardinal::zh_digit(c).is_some() { + frac_end += 1; + } else { + break; + } + } + + if frac_end == frac_start { + return None; // No fractional digits + } + + let frac_digits: String = chars[frac_start..frac_end] + .iter() + .map(|&c| cardinal::zh_digit(c).unwrap().to_string()) + .collect(); + + let total_bytes: usize = chars[..frac_end].iter().map(|c| c.len_utf8()).sum(); + + Some((format!("{}.{}", int_formatted, frac_digits), total_bytes)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + assert_eq!(process("一点零五六"), "1.056"); + assert_eq!(process("两百点一"), "200.1"); + } + + #[test] + fn test_negative() { + assert_eq!(process("负五万点二四五"), "-50,000.245"); + assert_eq!(process("负一点一"), "-1.1"); + } + + #[test] + fn test_traditional() { + assert_eq!(process("一點零零五"), "1.005"); + assert_eq!(process("負十點五"), "-10.5"); + } + + #[test] + fn test_financial() { + assert_eq!(process("壹佰点叁肆"), "100.34"); + assert_eq!(process("伍拾壹点肆"), "51.4"); + } +} diff --git a/src/asr/zh/fraction.rs b/src/asr/zh/fraction.rs new file 
mode 100644 index 0000000..b7cb78a --- /dev/null +++ b/src/asr/zh/fraction.rs @@ -0,0 +1,121 @@ +//! Fraction tagger for Chinese. +//! +//! Converts Chinese fractions to Arabic numeral form: +//! - "五分之一" → "1/5" +//! - "一又二分之一" → "1又1/2" +//! +//! Chinese fractions use X分之Y where X is denominator and Y is numerator. +//! Mixed numbers use X又Y分之Z → X又Z/Y. + +use super::cardinal; + +/// Process fraction patterns in a string. +pub fn process(input: &str) -> String { + let fen_zhi = "分之"; + let mut result = String::new(); + let mut remaining = input; + + while let Some(fz_pos) = remaining.find(fen_zhi) { + let before_fz = &remaining[..fz_pos]; + let after_fz = &remaining[fz_pos + fen_zhi.len()..]; + + // Parse denominator: Chinese numerals immediately before 分之 + let before_chars: Vec = before_fz.chars().collect(); + let mut denom_start = before_chars.len(); + while denom_start > 0 && cardinal::is_zh_numeral(before_chars[denom_start - 1]) { + denom_start -= 1; + } + + if denom_start >= before_chars.len() { + // No Chinese numeral before 分之, pass through + result.push_str(&remaining[..fz_pos + fen_zhi.len()]); + remaining = after_fz; + continue; + } + + let denom_kanji: String = before_chars[denom_start..].iter().collect(); + let denom = match cardinal::zh_to_number(&denom_kanji) { + Some(d) => d, + None => { + result.push_str(&remaining[..fz_pos + fen_zhi.len()]); + remaining = after_fz; + continue; + } + }; + + // Parse numerator: Chinese numerals immediately after 分之 + let after_chars: Vec = after_fz.chars().collect(); + let mut numer_end = 0; + while numer_end < after_chars.len() && cardinal::is_zh_numeral(after_chars[numer_end]) { + numer_end += 1; + } + + if numer_end == 0 { + result.push_str(&remaining[..fz_pos + fen_zhi.len()]); + remaining = after_fz; + continue; + } + + let numer_kanji: String = after_chars[..numer_end].iter().collect(); + let numer = match cardinal::zh_to_number(&numer_kanji) { + Some(n) => n, + None => { + 
result.push_str(&remaining[..fz_pos + fen_zhi.len()]); + remaining = after_fz; + continue; + } + }; + + let numer_byte_len: usize = after_chars[..numer_end].iter().map(|c| c.len_utf8()).sum(); + + // Build prefix before denominator + let prefix: String = before_chars[..denom_start].iter().collect(); + + // Check for mixed number: X又Y分之Z + if prefix.ends_with('又') { + let before_you = &prefix[..prefix.len() - '又'.len_utf8()]; + let by_chars: Vec = before_you.chars().collect(); + let mut whole_start = by_chars.len(); + while whole_start > 0 && cardinal::is_zh_numeral(by_chars[whole_start - 1]) { + whole_start -= 1; + } + + if whole_start < by_chars.len() { + let whole_kanji: String = by_chars[whole_start..].iter().collect(); + if let Some(whole) = cardinal::zh_to_number(&whole_kanji) { + let real_prefix: String = by_chars[..whole_start].iter().collect(); + result.push_str(&real_prefix); + result.push_str(&format!("{}又{}/{}", whole, numer, denom)); + remaining = &after_fz[numer_byte_len..]; + continue; + } + } + } + + // Simple fraction + result.push_str(&prefix); + result.push_str(&format!("{}/{}", numer, denom)); + remaining = &after_fz[numer_byte_len..]; + } + + result.push_str(remaining); + result +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + assert_eq!(process("五分之一"), "1/5"); + assert_eq!(process("二分之一"), "1/2"); + assert_eq!(process("十分之五"), "5/10"); + } + + #[test] + fn test_mixed() { + assert_eq!(process("三又五分之一"), "3又1/5"); + assert_eq!(process("一又二分之一"), "1又1/2"); + } +} diff --git a/src/asr/zh/mod.rs b/src/asr/zh/mod.rs new file mode 100644 index 0000000..06c7ed2 --- /dev/null +++ b/src/asr/zh/mod.rs @@ -0,0 +1,15 @@ +//! Chinese inverse text normalization. +//! +//! Converts Chinese numerals and spoken-form expressions to written form. +//! Uses a sentence-scanning approach: each processor scans the input +//! for its patterns and replaces Chinese number spans in-place. 
+ +pub mod cardinal; +pub mod date; +pub mod decimal; +pub mod fraction; +pub mod money; +pub mod ordinal; +pub mod time; +pub mod whitelist; +pub mod word; diff --git a/src/asr/zh/money.rs b/src/asr/zh/money.rs new file mode 100644 index 0000000..4b74a07 --- /dev/null +++ b/src/asr/zh/money.rs @@ -0,0 +1,212 @@ +//! Money tagger for Chinese. +//! +//! Converts Chinese currency expressions to symbolic form: +//! - "一千美元" → "US$1000" +//! - "一千元" → "¥1000" +//! - "一万美元" → "US$1万" +//! - "一点五万美元" → "US$1.5万" +//! - "一千万美元" → "US$1000万" + +use super::cardinal; + +/// Currency mapping: (Chinese name, symbol) +/// Order matters: longer names first to avoid partial matches. +/// "元" must be last since it's a suffix of "美元", "欧元", "日元", "韩元". +const CURRENCIES: &[(&str, &str)] = &[ + ("印度卢布", "₹"), + ("美元", "US$"), + ("欧元", "€"), + ("英镑", "£"), + ("韩元", "₩"), + ("日元", "JPY¥"), + ("元", "¥"), +]; + +/// Process money patterns in a string. +pub fn process(input: &str) -> String { + let mut result = input.to_string(); + + for &(name, symbol) in CURRENCIES { + result = process_currency(&result, name, symbol); + } + + result +} + +/// Process a single currency: find Chinese number + currency name and replace. 
+fn process_currency(input: &str, currency_name: &str, symbol: &str) -> String { + let mut result = String::new(); + let mut remaining = input; + + while let Some(pos) = remaining.find(currency_name) { + let before = &remaining[..pos]; + let before_chars: Vec = before.chars().collect(); + + // For 元: skip if preceded by 公 or 纪 (公元, 公元前, 纪元) + if currency_name == "元" { + if before.ends_with('公') || before.ends_with('纪') { + result.push_str(&remaining[..pos + currency_name.len()]); + remaining = &remaining[pos + currency_name.len()..]; + continue; + } + } + + // Scan backwards for Chinese numerals or decimal point characters + let mut num_start = before_chars.len(); + while num_start > 0 { + let c = before_chars[num_start - 1]; + if cardinal::is_zh_numeral(c) || c == '点' || c == '點' { + num_start -= 1; + } else { + break; + } + } + + if num_start < before_chars.len() { + let prefix: String = before_chars[..num_start].iter().collect(); + let number_chars: String = before_chars[num_start..].iter().collect(); + + result.push_str(&prefix); + + // Check if it contains a decimal point + if number_chars.contains('点') || number_chars.contains('點') { + if let Some(formatted) = format_money_decimal(&number_chars) { + result.push_str(&format!("{}{}", symbol, formatted)); + } else { + result.push_str(&format!("{}{}", symbol, number_chars)); + } + } else { + // Format for money: 万-preservation, no commas + if let Some(formatted) = format_money_cardinal(&number_chars) { + result.push_str(&format!("{}{}", symbol, formatted)); + } else { + result.push_str(&format!("{}{}", symbol, number_chars)); + } + } + } else { + result.push_str(before); + } + + remaining = &remaining[pos + currency_name.len()..]; + } + + result.push_str(remaining); + result +} + +/// Format a cardinal number for money: 万-preservation, no commas. 
+/// - "一千" → "1000" +/// - "一万" → "1万" +/// - "一千万" → "1000万" +/// - "五十万" → "50万" +fn format_money_cardinal(input: &str) -> Option { + let chars: Vec = input.chars().collect(); + if chars.is_empty() || !chars.iter().all(|&c| cardinal::is_zh_numeral(c)) { + return None; + } + + // Find 万 position + let wan_pos = chars.iter().position(|&c| c == '万' || c == '萬'); + + if let Some(wp) = wan_pos { + let wan_char = chars[wp]; + let wan_mult = if wp == 0 { + 1 + } else { + cardinal::zh_to_number(&chars[..wp].iter().collect::())? + }; + + let mut after_wan = &chars[wp + 1..]; + if !after_wan.is_empty() && after_wan[0] == '零' { + after_wan = &after_wan[1..]; + } + + if after_wan.is_empty() { + // Pure 万: N万 (no commas in multiplier) + return Some(format!("{}{}", wan_mult, wan_char)); + } + + // Has sub-万: expand fully without commas + let total = cardinal::zh_to_number(input)?; + return Some(total.to_string()); + } + + // No 万 — plain number without commas + let num = cardinal::zh_to_number(input)?; + Some(num.to_string()) +} + +/// Format a decimal number for money display. 
+/// e.g., "一点五万" → "1.5万" +fn format_money_decimal(input: &str) -> Option { + let dian_pos = input.find('点').or_else(|| input.find('點'))?; + let dian_char = if input.contains('点') { '点' } else { '點' }; + + let int_part = &input[..dian_pos]; + let after_dian = &input[dian_pos + dian_char.len_utf8()..]; + + // Parse integer part + let int_chars: Vec = int_part.chars().collect(); + if int_chars.is_empty() || !int_chars.iter().all(|&c| cardinal::is_zh_numeral(c)) { + return None; + } + let int_val = cardinal::zh_to_number(&int_chars.iter().collect::())?; + + // Parse fractional part — check if it ends with 万/萬 + let after_chars: Vec = after_dian.chars().collect(); + if after_chars.is_empty() { + return None; + } + + let last_char = *after_chars.last().unwrap(); + if last_char == '万' || last_char == '萬' { + let frac_chars = &after_chars[..after_chars.len() - 1]; + let frac_digits: String = frac_chars + .iter() + .filter_map(|&c| cardinal::zh_digit(c).map(|d| d.to_string())) + .collect(); + if frac_digits.is_empty() { + return None; + } + Some(format!("{}.{}{}", int_val, frac_digits, last_char)) + } else { + let frac_digits: String = after_chars + .iter() + .filter_map(|&c| cardinal::zh_digit(c).map(|d| d.to_string())) + .collect(); + if frac_digits.is_empty() { + return None; + } + Some(format!("{}.{}", int_val, frac_digits)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_usd() { + assert_eq!(process("一千美元"), "US$1000"); + assert_eq!(process("一万美元"), "US$1万"); + assert_eq!(process("一点五万美元"), "US$1.5万"); + assert_eq!(process("一千万美元"), "US$1000万"); + } + + #[test] + fn test_cny() { + assert_eq!(process("一千元"), "¥1000"); + assert_eq!(process("一万元"), "¥1万"); + } + + #[test] + fn test_jpy() { + assert_eq!(process("一千日元"), "JPY¥1000"); + } + + #[test] + fn test_skip_gongyuan() { + // 公元 should not match 元 currency + assert_eq!(process("公元"), "公元"); + } +} diff --git a/src/asr/zh/ordinal.rs b/src/asr/zh/ordinal.rs new file mode 100644 index 
0000000..0a83d16 --- /dev/null +++ b/src/asr/zh/ordinal.rs @@ -0,0 +1,69 @@ +//! Ordinal number tagger for Chinese. +//! +//! Converts Chinese ordinals to Arabic numerals: +//! - "第一百" → "第100" +//! - "第兩萬一千一百一十一" → "第21111" +//! +//! Uses 第 prefix. Numbers after 第 that have only 万/億-scale and no sub-units +//! still preserve the scale char (e.g., "第两万" → "第2万"). + +use super::cardinal; + +/// Process ordinal patterns in a string. +pub fn process(input: &str) -> String { + let prefix = "第"; + let mut result = String::new(); + let mut remaining = input; + + while let Some(pos) = remaining.find(prefix) { + result.push_str(&remaining[..pos]); + result.push_str(prefix); + + let after = &remaining[pos + prefix.len()..]; + let chars: Vec = after.chars().collect(); + + // Find end of Chinese numeral span + let mut num_end = 0; + while num_end < chars.len() && cardinal::is_zh_numeral(chars[num_end]) { + num_end += 1; + } + + if num_end > 0 { + let kanji: String = chars[..num_end].iter().collect(); + if let Some(formatted) = cardinal::format_zh_ordinal(&kanji) { + result.push_str(&formatted); + } else { + result.push_str(&kanji); + } + let byte_len: usize = chars[..num_end].iter().map(|c| c.len_utf8()).sum(); + remaining = &after[byte_len..]; + } else { + remaining = after; + } + } + + result.push_str(remaining); + result +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + assert_eq!(process("第一百"), "第100"); + assert_eq!(process("第五百"), "第500"); + } + + #[test] + fn test_wan_preserved() { + assert_eq!(process("第两万"), "第2万"); + assert_eq!(process("第十万"), "第10万"); + } + + #[test] + fn test_expanded() { + assert_eq!(process("第兩萬一千一百一十一"), "第21111"); + } +} diff --git a/src/asr/zh/time.rs b/src/asr/zh/time.rs new file mode 100644 index 0000000..9d4c83f --- /dev/null +++ b/src/asr/zh/time.rs @@ -0,0 +1,244 @@ +//! Time tagger for Chinese. +//! +//! Converts Chinese time expressions to formatted form: +//! - "五点五分" → "05:05" +//! 
- "十三点五分十秒" → "13:05:10" +//! - "五点半" → "5点半" +//! - "五点一刻" → "5点1刻" +//! - "五分钟" → "5分钟" +//! - "五秒钟" → "5秒钟" +//! +//! Rules: +//! - X点Y分 → HH:MM (zero-padded) +//! - X点Y分Z秒 → HH:MM:SS (zero-padded) +//! - X点半 → N点半 (preserved, just convert digit) +//! - X点Y刻 → N点N刻 (preserved, just convert digit) +//! - X点 (alone) → N点 (preserved) +//! - X分钟 → N分钟, X秒钟 → N秒钟 (duration, just convert digit) + +use super::cardinal; + +/// Process time patterns in a string. +pub fn process(input: &str) -> String { + let mut result = String::new(); + let mut remaining = input; + + while !remaining.is_empty() { + if let Some((before, time_str, after)) = find_time_expr(remaining) { + result.push_str(before); + result.push_str(&time_str); + remaining = after; + } else { + result.push_str(remaining); + break; + } + } + + result +} + +/// Find the next time expression in the string. +fn find_time_expr(input: &str) -> Option<(&str, String, &str)> { + let chars: Vec = input.chars().collect(); + let mut byte_pos = 0; + + for (i, &c) in chars.iter().enumerate() { + // Look for 分钟 pattern (duration) + if c == '分' && i > 0 { + let after_fen = &input[byte_pos + c.len_utf8()..]; + if after_fen.starts_with('钟') { + // X分钟 pattern + let before_chars = &chars[..i]; + let mut num_start = before_chars.len(); + while num_start > 0 && cardinal::is_zh_numeral(before_chars[num_start - 1]) { + num_start -= 1; + } + if num_start < before_chars.len() { + let prefix_bytes: usize = chars[..num_start].iter().map(|c| c.len_utf8()).sum(); + let kanji: String = before_chars[num_start..].iter().collect(); + if let Some(num) = cardinal::zh_to_number(&kanji) { + let before = &input[..prefix_bytes]; + let after = &input[byte_pos + c.len_utf8() + '钟'.len_utf8()..]; + return Some((before, format!("{}分钟", num), after)); + } + } + } + } + + // Look for 秒钟 pattern (duration) + if c == '秒' && i > 0 { + let after_miao = &input[byte_pos + c.len_utf8()..]; + if after_miao.starts_with('钟') { + let before_chars = &chars[..i]; + 
let mut num_start = before_chars.len(); + while num_start > 0 && cardinal::is_zh_numeral(before_chars[num_start - 1]) { + num_start -= 1; + } + if num_start < before_chars.len() { + let prefix_bytes: usize = chars[..num_start].iter().map(|c| c.len_utf8()).sum(); + let kanji: String = before_chars[num_start..].iter().collect(); + if let Some(num) = cardinal::zh_to_number(&kanji) { + let before = &input[..prefix_bytes]; + let after = &input[byte_pos + c.len_utf8() + '钟'.len_utf8()..]; + return Some((before, format!("{}秒钟", num), after)); + } + } + } + } + + // Look for 点 as time separator (X点Y分) + if (c == '点' || c == '點') && i > 0 { + // Check if preceded by Chinese numerals + let before_chars = &chars[..i]; + let mut num_start = before_chars.len(); + while num_start > 0 && cardinal::is_zh_numeral(before_chars[num_start - 1]) { + num_start -= 1; + } + + if num_start < before_chars.len() { + let hour_kanji: String = before_chars[num_start..].iter().collect(); + if let Some(hour) = cardinal::zh_to_number(&hour_kanji) { + let prefix_bytes: usize = chars[..num_start].iter().map(|c| c.len_utf8()).sum(); + let after_dian = &chars[i + 1..]; + + // Check what follows 点 + if let Some(time_result) = parse_after_dian(hour, after_dian) { + let before = &input[..prefix_bytes]; + let consumed_bytes: usize = chars[num_start..i + 1 + time_result.1] + .iter() + .map(|c| c.len_utf8()) + .sum(); + let after = &input[prefix_bytes + consumed_bytes..]; + return Some((before, time_result.0, after)); + } + } + } + } + + byte_pos += c.len_utf8(); + } + + None +} + +/// Parse what comes after 点 in a time expression. +/// Returns (formatted_time, chars_consumed_after_dian). 
+fn parse_after_dian(hour: i64, after_dian: &[char]) -> Option<(String, usize)> { + if after_dian.is_empty() { + // X点 alone + return Some((format!("{}点", hour), 0)); + } + + // Check for 半 + if after_dian[0] == '半' { + return Some((format!("{}点半", hour), 1)); + } + + // Check for X刻 + let mut num_end = 0; + while num_end < after_dian.len() && cardinal::is_zh_numeral(after_dian[num_end]) { + num_end += 1; + } + + if num_end > 0 && num_end < after_dian.len() && after_dian[num_end] == '刻' { + let kanji: String = after_dian[..num_end].iter().collect(); + if let Some(quarter) = cardinal::zh_to_number(&kanji) { + return Some((format!("{}点{}刻", hour, quarter), num_end + 1)); + } + } + + // Check for Y分 (and optional Z秒) + if num_end > 0 && num_end < after_dian.len() && after_dian[num_end] == '分' { + let min_kanji: String = after_dian[..num_end].iter().collect(); + if let Some(minute) = cardinal::zh_to_number(&min_kanji) { + let after_fen = &after_dian[num_end + 1..]; + + // Check for seconds + let mut sec_end = 0; + while sec_end < after_fen.len() && cardinal::is_zh_numeral(after_fen[sec_end]) { + sec_end += 1; + } + + if sec_end > 0 && sec_end < after_fen.len() && after_fen[sec_end] == '秒' { + let sec_kanji: String = after_fen[..sec_end].iter().collect(); + if let Some(second) = cardinal::zh_to_number(&sec_kanji) { + // HH:MM:SS + let total_consumed = num_end + 1 + sec_end + 1; + return Some(( + format!("{:02}:{:02}:{:02}", hour, minute, second), + total_consumed, + )); + } + } + + // HH:MM only + let total_consumed = num_end + 1; + return Some((format!("{:02}:{:02}", hour, minute), total_consumed)); + } + } + + // Check for 零Y分 pattern (e.g., 十三点零五分) + if !after_dian.is_empty() && after_dian[0] == '零' { + let rest = &after_dian[1..]; + let mut num_end2 = 0; + while num_end2 < rest.len() && cardinal::is_zh_numeral(rest[num_end2]) { + num_end2 += 1; + } + if num_end2 > 0 && num_end2 < rest.len() && rest[num_end2] == '分' { + let min_kanji: String = 
rest[..num_end2].iter().collect(); + if let Some(minute) = cardinal::zh_to_number(&min_kanji) { + let total_consumed = 1 + num_end2 + 1; // 零 + digits + 分 + return Some((format!("{:02}:{:02}", hour, minute), total_consumed)); + } + } + } + + // Check if what follows looks like decimal digits (not time) + // If digits follow 点 without a time suffix, this is a decimal, not time + if !after_dian.is_empty() && cardinal::zh_digit(after_dian[0]).is_some() { + return None; // Let the decimal processor handle this + } + + // X点 alone (no following digits or time suffixes) + Some((format!("{}点", hour), 0)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_hhmm() { + assert_eq!(process("五点五分"), "05:05"); + assert_eq!(process("十三点五分"), "13:05"); + } + + #[test] + fn test_hhmmss() { + assert_eq!(process("一点五分十秒"), "01:05:10"); + assert_eq!(process("十三点五分十秒"), "13:05:10"); + } + + #[test] + fn test_half() { + assert_eq!(process("五点半"), "5点半"); + } + + #[test] + fn test_quarter() { + assert_eq!(process("五点一刻"), "5点1刻"); + assert_eq!(process("两点三刻"), "2点3刻"); + } + + #[test] + fn test_alone() { + assert_eq!(process("六点"), "6点"); + assert_eq!(process("十点"), "10点"); + } + + #[test] + fn test_duration() { + assert_eq!(process("五分钟"), "5分钟"); + assert_eq!(process("五秒钟"), "5秒钟"); + } +} diff --git a/src/asr/zh/whitelist.rs b/src/asr/zh/whitelist.rs new file mode 100644 index 0000000..6264bc3 --- /dev/null +++ b/src/asr/zh/whitelist.rs @@ -0,0 +1,54 @@ +//! Whitelist tagger for Chinese ITN. +//! +//! Maps Chinese terms to their abbreviation/acronym forms: +//! - "人力资源" → "HR" +//! 
- "自动取款机" → "ATM" + +/// Whitelist entries: (Chinese term, abbreviation) +const WHITELIST: &[(&str, &str)] = &[ + ("人力资源", "HR"), + ("自动取款机", "ATM"), + ("首席执行官", "CEO"), + ("美国研究生入学考试", "GRE"), + ("研究生管理专业入学考试", "GMAT"), + ("全球定位系统", "GPS"), + ("刷卡机", "POS机"), + ("数位多功能光碟", "DVD"), + ("镭射唱片", "CD"), + ("通用串行总线", "USB"), + ("统一资源定位符", "URL"), + ("虚拟专用网络", "VPN"), + ("网络互联协议", "IP"), + ("脱氧核糖核酸", "DNA"), + ("核糖核酸", "RNA"), + ("平均学分绩点", "GPA"), + ("发光二极管", "LED"), + ("可移植文档格式", "PDF"), + ("社会性网络服务", "SNS"), + ("博士", "PhD"), +]; + +/// Process whitelist replacements in the input string. +pub fn process(input: &str) -> String { + let mut result = input.to_string(); + // Apply longest matches first to avoid partial matches + let mut sorted: Vec<&(&str, &str)> = WHITELIST.iter().collect(); + sorted.sort_by(|a, b| b.0.len().cmp(&a.0.len())); + + for &(term, abbr) in &sorted { + result = result.replace(term, abbr); + } + result +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic() { + assert_eq!(process("人力资源"), "HR"); + assert_eq!(process("自动取款机"), "ATM"); + assert_eq!(process("博士"), "PhD"); + } +} diff --git a/src/asr/zh/word.rs b/src/asr/zh/word.rs new file mode 100644 index 0000000..a36db8c --- /dev/null +++ b/src/asr/zh/word.rs @@ -0,0 +1,10 @@ +//! Word tagger for Chinese ITN. +//! +//! Pass-through: returns input unchanged. +//! This module exists for completeness — the word test cases verify +//! that non-numeric Chinese text passes through unmodified. + +/// Process word patterns (pass-through). 
+pub fn process(input: &str) -> String { + input.to_string() +} diff --git a/src/lib.rs b/src/lib.rs index b3c6987..18edb90 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -23,7 +23,7 @@ pub mod tts; #[cfg(feature = "ffi")] pub mod ffi; -use asr::{ +use asr::en::{ cardinal, date, decimal, electronic, measure, money, ordinal, punctuation, telephone, time, whitelist, word, }; @@ -109,10 +109,430 @@ pub fn normalize(input: &str) -> String { input.to_string() } -/// Normalize with language selection (future use). -pub fn normalize_with_lang(input: &str, _lang: &str) -> String { - // TODO: Language-specific ITN taggers - normalize(input) +/// Normalize with language selection. +/// +/// Supports language-specific ITN taggers for converting spoken-form +/// ASR output to written form in different languages. +/// +/// Supported languages: "en" (default), "fr" (French), "de" (German), +/// "es" (Spanish), "hi" (Hindi), "ja" (Japanese), "zh" (Chinese). +pub fn normalize_with_lang(input: &str, lang: &str) -> String { + let input = input.trim(); + + match lang { + "en" => normalize(input), + "fr" => normalize_lang_fr(input), + "de" => normalize_lang_de(input), + "es" => normalize_lang_es(input), + "hi" => normalize_lang_hi(input), + "ja" => normalize_lang_ja(input), + "zh" => normalize_lang_zh(input), + _ => normalize(input), // Default to English + } +} + +/// Strip trailing punctuation from input: "vingt!" 
→ ("vingt", "!") +fn strip_trailing_punctuation(input: &str) -> Option<(&str, &str)> { + let punct_chars = ['!', '?', '.', ',', ';', ':', '…']; + let trimmed = input.trim(); + for &p in &punct_chars { + if trimmed.ends_with(p) { + let text = trimmed[..trimmed.len() - p.len_utf8()].trim(); + let punct = &trimmed[trimmed.len() - p.len_utf8()..]; + if !text.is_empty() { + return Some((text, punct)); + } + } + } + None +} + +// ── French ITN ────────────────────────────────────────────────────────── + +/// ITN for French +fn normalize_lang_fr(input: &str) -> String { + // Try full input first + if let Some(result) = try_fr_taggers(input) { + return result; + } + + // Try stripping trailing punctuation: "vingt!" → try "vingt" then append " !" + if let Some((text, punct)) = strip_trailing_punctuation(input) { + if let Some(result) = try_fr_taggers(text) { + return format!("{} {}", result, punct); + } + } + + // Try partial number normalization: "quarante trois" → "40 trois" + // Only when input has exactly 2 space-separated tokens + if let Some(result) = try_fr_partial_cardinal(input) { + return result; + } + + // No match - return original + input.to_string() +} + +/// Try all French ITN taggers on the input +fn try_fr_taggers(input: &str) -> Option { + if let Some(result) = custom_rules::parse(input) { + return Some(result); + } + if let Some(result) = asr::fr::whitelist::parse(input) { + return Some(result); + } + if let Some(result) = asr::fr::punctuation::parse(input) { + return Some(result); + } + if let Some(result) = asr::fr::word::parse(input) { + return Some(result); + } + if let Some(result) = asr::fr::time::parse(input) { + return Some(result); + } + if let Some(result) = asr::fr::date::parse(input) { + return Some(result); + } + if let Some(result) = asr::fr::money::parse(input) { + return Some(result); + } + if let Some(result) = asr::fr::measure::parse(input) { + return Some(result); + } + if let Some(result) = asr::fr::electronic::parse(input) { + return 
Some(result); + } + if let Some(result) = asr::fr::ordinal::parse(input) { + return Some(result); + } + if let Some(result) = asr::fr::decimal::parse(input) { + return Some(result); + } + if let Some(num) = asr::fr::cardinal::parse(input) { + return Some(num); + } + // Telephone last since it can match numbers + if let Some(result) = asr::fr::telephone::parse(input) { + return Some(result); + } + None +} + +/// Try partial cardinal normalization for French. +/// "quarante trois" → "40 trois" (normalize first word if it's a tens/hundreds number) +fn try_fr_partial_cardinal(input: &str) -> Option { + let tokens: Vec<&str> = input.split_whitespace().collect(); + if tokens.len() != 2 { + return None; + } + + // Only convert the first token if it's a standalone number ≥ 10 + let first = tokens[0]; + let first_lower = first.to_lowercase(); + if let Some(num) = asr::fr::cardinal::words_to_number(&first_lower) { + if num >= 10 { + return Some(format!("{} {}", num, tokens[1])); + } + } + + None +} + +// ── German ITN ────────────────────────────────────────────────────────── + +/// ITN for German +fn normalize_lang_de(input: &str) -> String { + // Try full input first + if let Some(result) = try_de_taggers(input) { + return result; + } + + // Try stripping trailing punctuation: "zwanzig!" → try "zwanzig" then append " !" 
+ if let Some((text, punct)) = strip_trailing_punctuation(input) { + if let Some(result) = try_de_taggers(text) { + return format!("{} {}", result, punct); + } + } + + // No match - return original + input.to_string() +} + +/// Try all German ITN taggers on the input +fn try_de_taggers(input: &str) -> Option { + if let Some(result) = custom_rules::parse(input) { + return Some(result); + } + if let Some(result) = asr::de::whitelist::parse(input) { + return Some(result); + } + if let Some(result) = asr::de::punctuation::parse(input) { + return Some(result); + } + if let Some(result) = asr::de::time::parse(input) { + return Some(result); + } + if let Some(result) = asr::de::date::parse(input) { + return Some(result); + } + if let Some(result) = asr::de::money::parse(input) { + return Some(result); + } + if let Some(result) = asr::de::measure::parse(input) { + return Some(result); + } + if let Some(result) = asr::de::electronic::parse(input) { + return Some(result); + } + if let Some(result) = asr::de::ordinal::parse(input) { + return Some(result); + } + if let Some(result) = asr::de::fraction::parse(input) { + return Some(result); + } + if let Some(result) = asr::de::decimal::parse(input) { + return Some(result); + } + if let Some(num) = asr::de::cardinal::parse(input) { + return Some(num); + } + // Telephone last since it can match digit sequences + if let Some(result) = asr::de::telephone::parse(input) { + return Some(result); + } + None +} + +// ── Spanish ITN ───────────────────────────────────────────────────────── + +/// ITN for Spanish +fn normalize_lang_es(input: &str) -> String { + // Try full input first + if let Some(result) = try_es_taggers(input) { + return result; + } + + // Try stripping trailing punctuation: "veinte!" → try "veinte" then append " !" 
+ if let Some((text, punct)) = strip_trailing_punctuation(input) { + if let Some(result) = try_es_taggers(text) { + return format!("{} {}", result, punct); + } + } + + // No match - return original + input.to_string() +} + +/// Try all Spanish ITN taggers on the input +fn try_es_taggers(input: &str) -> Option { + if let Some(result) = custom_rules::parse(input) { + return Some(result); + } + if let Some(result) = asr::es::whitelist::parse(input) { + return Some(result); + } + if let Some(result) = asr::es::punctuation::parse(input) { + return Some(result); + } + if let Some(result) = asr::es::word::parse(input) { + return Some(result); + } + if let Some(result) = asr::es::time::parse(input) { + return Some(result); + } + if let Some(result) = asr::es::date::parse(input) { + return Some(result); + } + if let Some(result) = asr::es::money::parse(input) { + return Some(result); + } + if let Some(result) = asr::es::measure::parse(input) { + return Some(result); + } + if let Some(result) = asr::es::electronic::parse(input) { + return Some(result); + } + if let Some(result) = asr::es::ordinal::parse(input) { + return Some(result); + } + if let Some(result) = asr::es::fraction::parse(input) { + return Some(result); + } + if let Some(result) = asr::es::decimal::parse(input) { + return Some(result); + } + if let Some(num) = asr::es::cardinal::parse(input) { + return Some(num); + } + // Telephone last since it can match digit sequences + if let Some(result) = asr::es::telephone::parse(input) { + return Some(result); + } + None +} + +/// Decompose precomposed Devanagari nukta characters to base + nukta. +/// This ensures consistent matching regardless of input encoding. 
+fn decompose_devanagari_nukta(input: &str) -> String { + let mut out = String::with_capacity(input.len() + 16); + for c in input.chars() { + match c { + '\u{0958}' => { + out.push('\u{0915}'); + out.push('\u{093C}'); + } // क़ + '\u{0959}' => { + out.push('\u{0916}'); + out.push('\u{093C}'); + } // ख़ + '\u{095A}' => { + out.push('\u{0917}'); + out.push('\u{093C}'); + } // ग़ + '\u{095B}' => { + out.push('\u{091C}'); + out.push('\u{093C}'); + } // ज़ + '\u{095C}' => { + out.push('\u{0921}'); + out.push('\u{093C}'); + } // ड़ + '\u{095D}' => { + out.push('\u{0922}'); + out.push('\u{093C}'); + } // ढ़ + '\u{095E}' => { + out.push('\u{092B}'); + out.push('\u{093C}'); + } // फ़ + '\u{095F}' => { + out.push('\u{092F}'); + out.push('\u{093C}'); + } // य़ + _ => out.push(c), + } + } + out +} + +/// ITN for Hindi. +/// +/// Hindi ITN uses a sentence-scanning approach. Each processor scans the +/// full input for its patterns and replaces Hindi number word spans in-place. +/// Order matters — more specific patterns (money, measure, time, date) +/// run before generic cardinal replacement. +fn normalize_lang_hi(input: &str) -> String { + // Normalize precomposed nukta characters to decomposed form + let input = decompose_devanagari_nukta(input); + let mut result = input; + + // 1. Whitelist (abbreviations: डॉक्टर→डॉ., etc.) + result = asr::hi::whitelist::process(&result); + + // 2. Money (number + currency name → symbol + digits) + result = asr::hi::money::process(&result); + + // 3. Date (day + month [+ year], ranges, eras) + result = asr::hi::date::process(&result); + + // 4. Time (X बजे/घंटा + मिनट/सेकंड) + // Before measure so "X घंटा Y मिनट" isn't caught as measure + result = asr::hi::time::process(&result); + + // 5. Measure (number + unit → digits + symbol) + result = asr::hi::measure::process(&result); + + // 6. Fractions (X बटा Y, X सही Y बटा Z) + result = asr::hi::fraction::process(&result); + + // 7. 
Ordinal (Xवां, Xवीं, Xवें) + result = asr::hi::ordinal::process(&result); + + // 8. Decimal (X दशमलव Y) + result = asr::hi::decimal::process(&result); + + // 9. Cardinal — convert compound number words (with scale words) and + // single number words to Devanagari digits. Must run BEFORE + // telephone/address so compound numbers like "एक सौ" are grouped. + result = asr::hi::cardinal::process(&result); + + // 10. Telephone (digit-by-digit sequences ≥ 4 Devanagari digits) + result = asr::hi::telephone::process(&result); + + // 11. Address (digit-by-digit with हाइफ़न/बटा, comma-separated digits) + result = asr::hi::address::process(&result); + + result +} + +// ── Japanese ITN ──────────────────────────────────────────────────────── + +/// ITN for Japanese. +/// +/// Japanese ITN uses a sentence-scanning approach: each processor scans the +/// full input for its patterns and replaces kanji number spans in-place. +/// Order matters — more specific patterns (fractions, decimals, dates, times) +/// run before generic cardinal replacement. +fn normalize_lang_ja(input: &str) -> String { + let mut result = input.to_string(); + + // 1. Fractions first (X分のY) — before time which also uses 分 + result = asr::ja::fraction::process(&result); + + // 2. Decimals (X点Y) — before cardinal swallows the kanji + result = asr::ja::decimal::process(&result); + + // 3. Dates (年月日, 世紀, 年代, weekdays, ranges) + result = asr::ja::date::process(&result); + + // 4. Time (時, 分) — after fractions to avoid 分の collision + result = asr::ja::time::process(&result); + + // 5. Ordinals (番目, 第) + result = asr::ja::ordinal::process(&result); + + // 6. Cardinal — catch remaining standalone kanji number spans + result = asr::ja::cardinal::replace_kanji_numbers(&result); + + result +} + +// ── Chinese ITN ───────────────────────────────────────────────────────── + +/// ITN for Chinese. +/// +/// Chinese ITN uses a sentence-scanning approach similar to Japanese. 
+/// Each processor scans the full input for its patterns and replaces +/// Chinese number spans in-place. +/// Order matters — whitelist, money, and specific patterns run before cardinal. +fn normalize_lang_zh(input: &str) -> String { + let mut result = input.to_string(); + + // 1. Whitelist (abbreviation mappings) + result = asr::zh::whitelist::process(&result); + + // 2. Money (before decimal to catch currency-specific decimal patterns like 一点五万美元) + result = asr::zh::money::process(&result); + + // 3. Fractions (X分之Y) — before time which also uses 分 + result = asr::zh::fraction::process(&result); + + // 4. Time (X点Y分, X分钟, X秒钟) — before decimal so 点 with 分/刻/半 isn't consumed as decimal + result = asr::zh::time::process(&result); + + // 5. Decimals (X点Y) + result = asr::zh::decimal::process(&result); + + // 6. Dates (年月日, 公元/纪元) + result = asr::zh::date::process(&result); + + // 7. Ordinals (第X) + result = asr::zh::ordinal::process(&result); + + // 8. Cardinal — catch remaining standalone Chinese number spans + result = asr::zh::cardinal::replace_zh_numbers(&result); + + result } // ── Multi-language TN helpers ────────────────────────────────────────── diff --git a/tests/common/mod.rs b/tests/common/mod.rs index c641a64..94f68b0 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -27,13 +27,22 @@ pub fn parse_test_file(path: &Path) -> Vec<(String, String)> { pub fn run_test_file(path: &Path, normalize_fn: F) -> TestResults where F: Fn(&str) -> String, +{ + run_test_file_with_compare(path, normalize_fn, |a, b| a == b) +} + +/// Run all test cases with a custom comparison function. 
+pub fn run_test_file_with_compare(path: &Path, normalize_fn: F, compare_fn: C) -> TestResults +where + F: Fn(&str) -> String, + C: Fn(&str, &str) -> bool, { let cases = parse_test_file(path); let mut results = TestResults::new(cases.len()); for (input, expected) in &cases { let result = normalize_fn(input); - if result == *expected { + if compare_fn(&result, expected) { results.passed += 1; } else { results.failures.push(TestFailure { diff --git a/tests/data/de/cardinal.txt b/tests/data/de/cardinal.txt new file mode 100644 index 0000000..0b20642 --- /dev/null +++ b/tests/data/de/cardinal.txt @@ -0,0 +1,62 @@ +ein hundert~100 +einhundert~100 +ein hundert und zwei~102 +einhundertzwei~102 +ein hundert und zwanzig~120 +ein hundert und elf~111 +ein tausend~1000 +eintausend~1000 +ein hundert zwanzig~120 +ein tausend zwanzig~1020 +eintausendzwanzig~1020 +neun billionen sieben hundert neun und achtzig milliarden drei hundert zwei und achtzig millionen fünf hundert sechs und dreißig tausend ein hundert dreißig~9789382536130 +zwei hundert vier und fünfzig~254 +ein hundert sieben und vierzig tausend vier hundert ein und fünfzig~147451 +eine million ein hundert sechs und fünfzig tausend ein hundert drei und siebzig~1156173 +eine milliarde fünf hundert drei und neunzig millionen zwei und siebzig tausend neun hundert ein und sechzig~1593072961 +sieben und neunzig billiarden acht hundert acht billionen zwei hundert vier und sechzig milliarden sieben hundert zwei und siebzig millionen sieben hundert zwei und neunzig tausend fünf~97808264772792005 +zehn billiarden zehn billionen zehn millionen ein hundert tausend zehn~10010000010100010 +zehn billiarden zehn billionen zehn millionen einhunderttausendzehn~10010000010100010 +minus fünf und zwanzig tausend sieben und dreißig~-25037 +minus fünf und zwanzig tausend sieben und dreißig~-25037 +minus fünfundzwanzigtausendsiebenunddreißig~-25037 +eine billiarde zwei hundert vier und sechzig billionen drei hundert eins milliarden neun 
hundert acht und dreißig millionen ein hundert vier~1264301938000104 +eine billiarde zweihundertvierundsechzig billionen dreihunderteins milliarden neunhundertachtunddreißig millionen einhundertvier~1264301938000104 +minus sechzig~-60 +sechs und vierzig tausend sechs hundert vier und sechzig~46664 +sechzig~60 +null~null +eins~eins +ein~ein +eine~eine +einer~einer +zwei~zwei +neun~neun +zehn~10 +elf~11 +zwölf~12 +dreizehn~13 +vierzehn~14 +fünfzehn~15 +sechzehn~16 +siebzehn~17 +achtzehn~18 +zwanzig~20 +dreißig~30 +vierzig~40 +fünfzig~50 +sechzig~60 +siebzig~70 +achtzig~80 +neunzig~90 +zwei millionen drei~2000003 +ein tausend dreizehn~1013 +ein tausend eins~1001 +ein tausend ein hundert~1100 +ein tausend sechs und zwanzig~1026 +ein tausend ein hundert sechs und zwanzig~1126 +achtzehn millionen vier hundert fünfzig tausend neun hundert neunzig~18450990 +achtzehn millionen neun hundert vierzig tausend sieben hundert zwei und zwanzig~18940722 +achtzehn millionen sechs hundert neunzig tausend neun hundert sechzehn~18690916 +achtzehn millionen sechshundertneunzigtausendneunhundertsechzehn~18690916 +achtzehn tausend acht hundert achtzig~18880 diff --git a/tests/data/de/date.txt b/tests/data/de/date.txt new file mode 100644 index 0000000..e994b93 --- /dev/null +++ b/tests/data/de/date.txt @@ -0,0 +1,22 @@ +vierundzwanzigster juli zwei tausend dreizehn~24. Jul. 2013 +vier und zwanzigster juli zwei tausend dreizehn~24. Jul. 2013 +neunzehn achtzig~1980 +neunzehnachtzig~1980 +neunzehnhundertachtzig~1980 +neunzehn hundert achtzig~1980 +neunzehn achtziger~19 achtziger +zwei tausend zwanzig~2020 +zwanzig zwanzig~2020 +zwei tausend neun~2009 +vierzehnter januar~14. Jan. +januarzweitausendneun~januarzweitausendneun +januar zweitausendneun~Jan. 2009 +erster januar~1. Jan. +dreißigster juni~30. Jun. 
+neunzehn siebzehn~1917 +neunzehn hundert siebzehn~1917 +neunzehn hundert vierundneunzig~1994 +neunzehn hundert vier und neunzig~1994 +neunzehn vierundneunzig~1994 +zwei tausend drei~2003 +ein tausend acht~1008 diff --git a/tests/data/de/decimal.txt b/tests/data/de/decimal.txt new file mode 100644 index 0000000..9db4974 --- /dev/null +++ b/tests/data/de/decimal.txt @@ -0,0 +1,10 @@ +null komma zwei millionen~0,2 millionen +eine million~1 million +eins komma zwei millionen~1,2 millionen +achtzehn milliarden~18 milliarden +vier hundert sechzig millionen~460 millionen +ein hundert zwanzig millionen~120 millionen +zehn millionen~10 millionen +minus sechzig komma zwei vier null null~-60,2400 +acht hundert achtzehn komma drei null drei~818,303 +achthundertachtzehn komma drei null drei~818,303 \ No newline at end of file diff --git a/tests/data/de/electronic.txt b/tests/data/de/electronic.txt new file mode 100644 index 0000000..fc9b0fc --- /dev/null +++ b/tests/data/de/electronic.txt @@ -0,0 +1,9 @@ +c d f at a b c punkt e d u~cdf@abc.edu +a b c at g mail punkt a b c~abc@gmail.abc +a b c at a b c punkt com~abc@abc.com +a s d f eins zwei drei at a b c punkt com~asdf123@abc.com +a eins b zwei at a b c punkt com~a1b2@abc.com +a b drei bindestrich s d d bindestrich drei at g mail punkt com~ab3-sdd-3@gmail.com +h t t p s doppelpunkt slash slash w w w punkt a b c punkt com~https://www.abc.com +w w w punkt a b c punkt com~www.abc.com +h t t p s doppelpunkt slash slash w w w punkt a b c punkt com slash a b fragezeichen gleichheitszeichen drei bindestrich slash a b s slash eins~https://www.abc.com/ab?=3-/abs/1 \ No newline at end of file diff --git a/tests/data/de/fraction.txt b/tests/data/de/fraction.txt new file mode 100644 index 0000000..5802cd4 --- /dev/null +++ b/tests/data/de/fraction.txt @@ -0,0 +1,34 @@ +null nulltel~0/0 +ein halb~1/2 +vier halbe~4/2 +ein drittel~1/3 +ein viertel~1/4 +ein fünftel~1/5 +ein sechstel~1/6 +ein siebtel~1/7 +ein achtel~1/8 +zwei neuntel~2/9 +ein 
ein halb~1 1/2 +ein sechstel~1/6 +ein zehntel~1/10 +ein elftel~1/11 +ein zehntel~1/10 +ein zwölftel~1/12 +ein dreizehntel~1/13 +ein vierzehntel~1/14 +ein fünfzehntel~1/15 +ein sechzehntel~1/16 +ein siebzehntel~1/17 +ein achtzehntel~1/18 +ein neunzehntel~1/19 +ein zwanzigstel~1/20 +ein dreißigstel~1/30 +ein vierzigstel~1/40 +ein fünfzigstel~1/50 +ein sechzigstel~1/60 +ein siebzigstel~1/70 +ein achtzigstel~1/80 +ein neunzigstel~1/90 +ein ein hundertstel~1/100 +ein zwei und zwanzigstel~1/22 +minus ein zwei und zwanzigstel~-1/22 \ No newline at end of file diff --git a/tests/data/de/measure.txt b/tests/data/de/measure.txt new file mode 100644 index 0000000..f454760 --- /dev/null +++ b/tests/data/de/measure.txt @@ -0,0 +1,28 @@ +zwei hundert meter~200 m +sechs und fünfzig komma drei pro quadrat kilometer~56,3 /km² +zwei hundert kilometer pro stunde~200 km/h +zwei und vierzig tausend zwei hundert neun und fünfzig pro quadrat meter~42259 /m² +minus sechs und sechzig kilogramm~-66 kg +minus sechsundsechzig kilogramm~-66 kg +zwei kilowattstunden~2 kwh +eins komma null null null null zwei acht kubik zentimeter~1,000028 cm³ +eins komma eins zentimeter~1,1 cm +drei stunden~3 h +eine stunde~1 h +ein millivolt~1 mv +eine million millivolt~1 million mv +zwei kubik meter~2 m³ +neunzig gramm~90 g +neunzig millionen gramm~90 millionen g +neunzig komma vier millionen gramm~90,4 millionen g +vier hundert vierzig milliliter~440 ml +drei hundert mikrometer~300 μm +fünf und sechzig tausend quadrat kilometer~65000 km² +zwei kilometer pro stunde~2 km/h +zwei millionen kilometer pro stunde~2 millionen km/h +zwei komma zwei millionen kilometer pro stunde~2,2 millionen km/h +sechzig komma zwei vier null null kilogramm~60,2400 kg +null fuß~0 ft +ein halb fuß~1/2 ft +ein ein halb fuß~1 1/2 ft +minus ein ein halb fuß~-1 1/2 ft \ No newline at end of file diff --git a/tests/data/de/money.txt b/tests/data/de/money.txt new file mode 100644 index 0000000..6e236d8 --- /dev/null +++ 
b/tests/data/de/money.txt @@ -0,0 +1,23 @@ +zwei dollar~$2 +ein dollar~$1 +eine million dollar~$1 million +zwei komma null null null eins dollar~$2,0001 +zwei komma null eins dollar~$2,01 +zwei komma null null dollar~$2,00 +ein cent~€0,01 +zwei cent~€0,02 +zwanzig cent~€0,20 +zweiundzwanzig cent~€0,22 +einhundert cent~100 cent +zwei dollar zwanzig~$2,20 +zweidollarzwanzig~zweidollarzwanzig +zwei dollar und zwanzig cent~$2,20 +zwei euro und zwanzig cent~€2,20 +zwei pfund und zwanzig pence~£2,20 +zwei euro zwanzig cent~€2,20 +zwei millionen euro~€2 millionen +zwei komma zwei null null millionen euro~€2,200 millionen +zwei komma zwei null eins millionen euro~€2,201 millionen +zwei komma zwei eins millionen euro~€2,21 millionen +zwei pfund und ein penny~£2,01 +zwei pfund und ein hundert penny~£2 und 100 penny \ No newline at end of file diff --git a/tests/data/de/ordinal.txt b/tests/data/de/ordinal.txt new file mode 100644 index 0000000..c85bb28 --- /dev/null +++ b/tests/data/de/ordinal.txt @@ -0,0 +1,20 @@ +ein hundertste~100. +fünf und zwanzig tausend ein hundert elftem~25111. +fünfundzwanzigtausendeinhundertelftem~25111. +zweite~zweite +nullte~nullte +erster~erster +zweiter~zweiter +dritter~dritter +vierter~vierter +zehnter~10. +elftem~11. +dreizehntem~13. +ein und zwanzigstes~21. +drei und zwanzigstes~23. +dreiundzwanzigstes~23. +ein hundert elftes~111. +ein tausendstem~1000. +dem ein tausendstem~dem 1000. +ein hundert ein und zwanzigste~121. +einhunderteinundzwanzigste~121. 
diff --git a/tests/data/de/telephone.txt b/tests/data/de/telephone.txt new file mode 100644 index 0000000..204d3e4 --- /dev/null +++ b/tests/data/de/telephone.txt @@ -0,0 +1 @@ +null vier eins eins eins zwei drei vier eins zwei drei vier~(0411) 1234-1234 \ No newline at end of file diff --git a/tests/data/de/time.txt b/tests/data/de/time.txt new file mode 100644 index 0000000..0eb0bdb --- /dev/null +++ b/tests/data/de/time.txt @@ -0,0 +1,24 @@ +acht uhr~8 Uhr +vier und zwanzig uhr~24 Uhr +vierundzwanziguhr~24 Uhr +vierundzwanzig uhr~24 Uhr +vierundzwanziguhrzweiundzwanzigest~24:22 Uhr est +vierundzwanziguhrzweiundzwanzig e s t~24:22 Uhr est +zwölf uhr mittags~12 Uhr mittags +achtzehn uhr~18 Uhr +acht uhr sieben~08:07 Uhr +null uhr siebzehn~00:17 Uhr +halb zwölf~11:30 Uhr +viertel vor zwölf~11:45 Uhr +drei vor zwölf~11:57 Uhr +zwei und zwanzig vor zwölf~11:38 Uhr +zweiundzwanzig vor zwölf~11:38 Uhr +drei nach zwölf~12:03 Uhr +viertel nach zwölf~12:15 Uhr +zehn nach zwölf~12:10 Uhr +zehn vor zwölf~11:50 Uhr +viertel nach zwölf nachts~12:15 Uhr nachts +null uhr null minuten null sekunden~00:00:00 Uhr +ein uhr eine minute eine sekunde e s t~01:01:01 Uhr est +zwei uhr zwei minuten drei und zwanzig sekunden~02:02:23 Uhr +zwei uhr zwei minuten dreiundzwanzig sekunden~02:02:23 Uhr \ No newline at end of file diff --git a/tests/data/de/whitelist.txt b/tests/data/de/whitelist.txt new file mode 100644 index 0000000..4f3767f --- /dev/null +++ b/tests/data/de/whitelist.txt @@ -0,0 +1,6 @@ +doktor dao~Dr. dao +miss smith~Ms. smith +misses smith~Mrs. smith +mister dao~Mr. dao +ich mag essen zum beispiel eis~ich mag essen z.B. eis +Chanel nummer fünf~Chanel Nr. fünf diff --git a/tests/data/de/word.txt b/tests/data/de/word.txt new file mode 100644 index 0000000..98ddaf3 --- /dev/null +++ b/tests/data/de/word.txt @@ -0,0 +1,49 @@ +~ +yahoo!~yahoo! +zwanzig!~20 ! 
+x ~x +—~— +aaa~aaa +aabach~aabach +aabenraa~aabenraa +aabye~aabye +aaccessed~aaccessed +aach~aach +aachen's~aachen's +aadri~aadri +aafia~aafia +aagaard~aagaard +aagadu~aagadu +aagard~aagard +aagathadi~aagathadi +aaghart's~aaghart's +aagnes~aagnes +aagomoni~aagomoni +aagon~aagon +aagoo~aagoo +aagot~aagot +aahar~aahar +aahh~aahh +aahperd~aahperd +aaibinterstate~aaibinterstate +aajab~aajab +aakasa~aakasa +aakervik~aakervik +aakirkeby~aakirkeby +aalam~aalam +aalbaek~aalbaek +aaldiu~aaldiu +aalem~aalem +a'ali~a'ali +aalilaassamthey~aalilaassamthey +aalin~aalin +aaliyan~aaliyan +aaliyan's~aaliyan's +aamadu~aamadu +aamara~aamara +aambala~aambala +aamera~aamera +aamer's~aamer's +aamina~aamina +aaminah~aaminah +aamjiwnaang~aamjiwnaang diff --git a/tests/data/es/cardinal.txt b/tests/data/es/cardinal.txt new file mode 100644 index 0000000..8e6d277 --- /dev/null +++ b/tests/data/es/cardinal.txt @@ -0,0 +1,51 @@ +doscientos cincuenta y uno~251 +novecientos noventa y nueve millones novecientos noventa y nueve mil novecientos noventa y nueve~999999999 +cero~cero +uno~uno +una~una +dos~dos +nueve~nueve +diez~10 +, uno~, uno +, diez~, 10 +menos veintitrés~-23 +cien~100 +ciento uno~101 +ciento un~101 +ciento una~101 +mil y uno~1001 +mil uno~1001 +nueve billones setecientos ochenta y nueve mil trescientos ochenta y dos millones quinientos treinta y seis mil ciento treinta~9789382536130 +doscientos cincuenta y cuatro~254 +ciento cuarenta y siete mil cuatrocientos cincuenta y uno~147451 +un millón ciento cincuenta y seis mil ciento setenta y tres~1156173 +mil quinientos noventa y tres millones setenta y dos mil novecientos sesenta y uno~1593072961 +noventa y siete mil ochocientos ocho billones doscientos sesenta y cuatro mil setecientos setenta y dos millones setecientos noventa y dos mil cinco~97808264772792005 +diecisiete mil ochocientos cincuenta y cinco trillones treinta y seis mil seiscientos cincuenta y siete billones siete mil quinientos noventa y seis millones ciento diez mil 
novecientos cuarenta y nueve~17855036657007596110949 +diez mil diez billones diez millones cien mil diez~10010000010100010 +menos veinticinco mil treinta y siete~-25037 +mil doscientos sesenta y cuatro billones trescientos un mil novecientos treinta y ocho millones ciento cuatro~1264301938000104 +menos sesenta~-60 +cuarenta y seis mil seiscientos sesenta y cuatro~46664 +sesenta~60 +dos millones tres~2000003 +mil trece~1013 +mil cien~1100 +mil veintiséis~1026 +mil ciento veintiséis~1126 +dieciocho millones cuatrocientos cincuenta mil novecientos noventa~18450990 +dieciocho millones novecientos cuarenta mil setecientos veintidós~18940722 +dieciocho millones seiscientos noventa mil novecientos dieciséis~18690916 +dieciocho mil ochocientos ochenta~18880 +un millardo uno~1000000001 +mil millones uno~1000000001 +mil millones ciento uno~1000000101 +mil millones mil ciento uno~1000001101 +mil millones diez mil ciento uno~1000010101 +mil un millón diez mil ciento uno~1001010101 +dos millardos cincuenta y dos~2000000052 +muchas millones~muchas millones +mil billones uno~1000000000000001 +mil trillones uno~1000000000000000000001 +veintiacuátro~veintiacuátro +entre dieciséis mil y dieciocho mil~entre 16000 y 18000 \ No newline at end of file diff --git a/tests/data/es/cardinal_cased.txt b/tests/data/es/cardinal_cased.txt new file mode 100644 index 0000000..15514ae --- /dev/null +++ b/tests/data/es/cardinal_cased.txt @@ -0,0 +1,30 @@ +Doscientos cincuenta y uno~251 +Novecientos noventa y nueve millones novecientos noventa y nueve mil novecientos noventa y nueve~999999999 +Cero~Cero +Uno~Uno +una~una +dos~dos +Nueve~Nueve +Diez~10 +, uno~, uno +, diez~, 10 +Menos veintitrés~-23 +cien~100 +ciento uno~101 +ciento un~101 +ciento una~101 +mil y uno~1001 +Mil una~1001 +nueve billones setecientos ochenta y nueve mil trescientos ochenta y dos millones quinientos treinta y seis mil ciento treinta~9789382536130 +Doscientos cincuenta y cuatro~254 +ciento cuarenta y siete mil cuatrocientos 
cincuenta y uno~147451 +Un Millón ciento cincuenta y seis mil ciento setenta y tres~1156173 +Mil quinientos noventa y tres millones setenta y dos mil novecientos sesenta y uno~1593072961 +noventa y siete mil ochocientos ocho billones doscientos sesenta y cuatro mil setecientos setenta y dos millones setecientos noventa y dos mil cinco~97808264772792005 +diecisiete mil ochocientos cincuenta y cinco trillones treinta y seis mil seiscientos cincuenta y siete billones siete mil quinientos noventa y seis millones ciento diez mil novecientos cuarenta y nueve~17855036657007596110949 +diez mil diez billones diez millones cien mil diez~10010000010100010 +Menos veinticinco mil treinta y siete~-25037 +mil doscientos sesenta y cuatro billones trescientos un mil novecientos treinta y ocho millones ciento cuatro~1264301938000104 +menos sesenta~-60 +cuarenta y seis mil seiscientos sesenta y cuatro~46664 +sesenta~60 \ No newline at end of file diff --git a/tests/data/es/date.txt b/tests/data/es/date.txt new file mode 100644 index 0000000..c56c0da --- /dev/null +++ b/tests/data/es/date.txt @@ -0,0 +1,8 @@ +primero de enero~1 de enero +uno de enero~1 de enero +el uno de diciembre~el 1 de diciembre +el primero de diciembre~el 1 de diciembre +domingo veintiséis de octubre~domingo 26 de octubre +treinta y uno de diciembre de mil novecientos noventa y dos~31 de diciembre de 1992 +siglo diecinueve~siglo xix +doscientos tres antes de cristo~203 a. c. 
\ No newline at end of file diff --git a/tests/data/es/date_cased.txt b/tests/data/es/date_cased.txt new file mode 100644 index 0000000..98bfd6f --- /dev/null +++ b/tests/data/es/date_cased.txt @@ -0,0 +1,8 @@ +Primero De Enero~1 de Enero +Uno de enero~1 de Enero +el uno de Diciembre~el 1 de Diciembre +El primero de diciembre~El 1 de diciembre +Domingo Veintiséis De Octubre~Domingo 26 de Octubre +treinta y uno de diciembre de mil novecientos noventa y dos~31 de diciembre de 1992 +Siglo diecinueve~Siglo xix +doscientos tres antes de Cristo~203 A. C. \ No newline at end of file diff --git a/tests/data/es/decimal.txt b/tests/data/es/decimal.txt new file mode 100644 index 0000000..7da1204 --- /dev/null +++ b/tests/data/es/decimal.txt @@ -0,0 +1,29 @@ +uno coma dos seis~1,26 +menos uno coma dos seis~-1,26 +uno coma veintiséis~1,26 +cero coma dos seis~0,26 +cero coma veintiséis~0,26 +tres coma ciento cuarenta y uno~3,141 +tres coma cero ciento cuarenta y uno~3,0141 +tres coma ciento cuarenta y uno cincuenta y nueve~3,14159 +tres coma catorce ciento cincuenta y nueve~3,14159 +tres coma catorce quince noventa y dos sesenta y cinco tres~3,141592653 +tres coma catorce quince cero noventa y dos sesenta y cinco treinta y cinco~3,14150926535 +tres coma catorce quince cero novecientos veintiséis cero quinientos treinta y cinco~3,141509260535 +cuatrocientos millones~400 millones +uno punto treinta y tres~1.33 +uno punto treinta y tres millones~1.33 millones +cero coma seis millones~0,6 millones +mil ochocientos veinticuatro millón~1824 millón +mil ochocientos veinticuatro millones~1824 millones +punto dos seis~.26 +un millón~1 millón +dos millones~2 millones +un millardo~1 millardo +dos millardos~2 millardos +un billón~1 billón +dos billones~2 billones +un trillón~1 trillón +dos trillones~2 trillones +un cuatrillón~1 cuatrillón +dos cuatrillones~2 cuatrillones diff --git a/tests/data/es/decimal_cased.txt b/tests/data/es/decimal_cased.txt new file mode 100644 index 
0000000..81a91bb --- /dev/null +++ b/tests/data/es/decimal_cased.txt @@ -0,0 +1,6 @@ +Uno coma dos seis~1,26 +Menos uno coma dos seis~-1,26 +Uno Coma Veintiséis~1,26 +Cero coma Dos seis~0,26 +cero coma veintiséis~0,26 +tres coma ciento cuarenta y uno~3,141 \ No newline at end of file diff --git a/tests/data/es/electronic.txt b/tests/data/es/electronic.txt new file mode 100644 index 0000000..36fef75 --- /dev/null +++ b/tests/data/es/electronic.txt @@ -0,0 +1,16 @@ +a punto b c arroba g mail punto com~a.bc@gmail.com +c d f arroba a b c punto e d u~cdf@abc.edu +a b c arroba g mail punto a b c~abc@gmail.abc +a b c arroba a b c punto com~abc@abc.com +a s d f uno dos tres arroba a b c punto com~asdf123@abc.com +a uno b dos arroba a b c punto com~a1b2@abc.com +a b tres punto s d d punto tres arroba g mail punto com~ab3.sdd.3@gmail.com +hache te te pe ese dos puntos barra barra doble ve doble ve doble ve punto n vidia punto com~https://www.nvidia.com +doble ve doble ve doble ve punto n vidia punto com~www.nvidia.com +doble ve doble ve doble ve punto nvidia punto com~www.nvidia.com +w w w punto nvidia punto com~www.nvidia.com +doble ve doble ve doble ve punto a b c punto es barra e f g~www.abc.es/efg +doble ve doble ve doble ve punto a b c punto e s~www.abc.es +doble ve doble ve doble ve punto a b c punto es barra e f g signo de interrogación i d signo igual a b c~www.abc.es/efg?id=abc +doble ve doble ve doble ve punto a b c punto gob~www.abc.gob +doble ve doble ve doble ve punto a b c punto d e f~www.abc.def \ No newline at end of file diff --git a/tests/data/es/electronic_cased.txt b/tests/data/es/electronic_cased.txt new file mode 100644 index 0000000..2d3f26b --- /dev/null +++ b/tests/data/es/electronic_cased.txt @@ -0,0 +1,5 @@ +A punto B C Arroba G mail punto com~A.BC@gmail.com +c d f Arroba a b c Punto e d u~cdf@abc.edu +W W W Punto N vidia Punto com~www.nvidia.com +Doble ve doble ve doble ve punto a b c punto es barra e f g~www.abc.es/efg +Doble Ve Doble Ve Doble Ve 
Punto a b c Punto e s~www.abc.es \ No newline at end of file diff --git a/tests/data/es/fraction.txt b/tests/data/es/fraction.txt new file mode 100644 index 0000000..885ed7f --- /dev/null +++ b/tests/data/es/fraction.txt @@ -0,0 +1,12 @@ +medio~medio +un cuarto~un cuarto +ocho tercios~8/3 +dos quintos~2/5 +diez treintavos~10/30 +tres vigésimos~3/20 +once cientounavos~11/101 +un décimo~1/10 +un cuarentiunavo~1/41 +dos y dos tercios~2 2/3 +menos cuatro y un quinto~-4 1/5 +menos diez veinteavos~-10/20 \ No newline at end of file diff --git a/tests/data/es/measure.txt b/tests/data/es/measure.txt new file mode 100644 index 0000000..6b80918 --- /dev/null +++ b/tests/data/es/measure.txt @@ -0,0 +1,20 @@ +doscientos metros~200 m +tres horas~3 h +una hora~1 h +doscientos cuarenta y cinco millas por hora~245 mph +dos kilos~2 kg +sesenta coma dos cuatro cero cero kilogramos~60,2400 kg +menos sesenta coma veinticuatro cero cero kilogramos~-60,2400 kg +ocho coma cinco dos por ciento~8,52 % +menos ocho coma cinco dos por ciento~-8,52 % +uno porciento~1 % +tres centímetros~3 cm +cuatro segundos~4 s +cinco litros~5 l +tres metros cúbicos~3 m³ +dos kilómetros por hora~2 kph +diez grados farenheit~10 ° F +dos metros y medio~2 1/2 m +tres quintos de metro~3/5 m +menos tres y medio metros por hora~-3 1/2 m/h +dos más dos es igual a cuatro~2 + 2 = 4 \ No newline at end of file diff --git a/tests/data/es/measure_cased.txt b/tests/data/es/measure_cased.txt new file mode 100644 index 0000000..ad28add --- /dev/null +++ b/tests/data/es/measure_cased.txt @@ -0,0 +1,11 @@ +Doscientos metros~200 m +tres horas~3 h +una hora~1 h +Doscientos cuarenta y cinco Millas Por Hora~245 mph +Dos Kilos~2 kg +sesenta coma dos cuatro cero cero kilogramos~60,2400 kg +Menos sesenta coma veinticuatro cero cero kilogramos~-60,2400 kg +menos Ocho Coma Cinco Dos por ciento~-8,52 % +uno Porciento~1 % +tres centímetros~3 cm +dos más dos es igual a cuatro~2 + 2 = 4 \ No newline at end of file diff --git 
a/tests/data/es/money.txt b/tests/data/es/money.txt new file mode 100644 index 0000000..47611f9 --- /dev/null +++ b/tests/data/es/money.txt @@ -0,0 +1,24 @@ +doce dólares y cinco centavos~$12,05 +doce dólares y cinco céntimos~$12,05 +setenta y cinco dólares sesenta y tres~$75,63 +setenta y cinco dólares y sesenta y tres centavos~$75,63 +setenta y cinco dólares con sesenta y tres centavos~$75,63 +setenta y cinco dólares con sesenta y tres~$75,63 +veintinueve dólares cincuenta centavos~$29,50 +un dólar~$1 +veinticinco centavos~$0,25 +veinticinco céntimos~€0,25 +doce euros y cinco centavos~€12,05 +doce dólares estadounidenses y cinco centavos~US$12,05 +doce dólares americanos y cinco centavos~US$12,05 +doce pesos y cinco centavos~$12,05 +doce yenes y cinco centavos~¥12,05 +dos dólares y sesenta y tres dólares~$2 y $63 +diez pesetas~₧10 +un colón~₡1 +un chon~₩0,01 +tres wones con veinte~₩3,20 +cien quetzales~q100 +nueve punto cinco millones de pesos~$9.5 millones +catorce millones quinientos mil pesos mexicanos~Mex$14500000 +diez pesos mexicanos~Mex$10 \ No newline at end of file diff --git a/tests/data/es/money_cased.txt b/tests/data/es/money_cased.txt new file mode 100644 index 0000000..a57e606 --- /dev/null +++ b/tests/data/es/money_cased.txt @@ -0,0 +1,6 @@ +doce dólares y cinco centavos~$12,05 +Doce Dólares Y Cinco Céntimos~$12,05 +setenta y cinco Dólares sesenta y tres~$75,63 +Veintinueve dólares cincuenta centavos~$29,50 +Catorce millones quinientos mil Pesos mexicanos~Mex$14500000 +diez pesos Mexicanos~Mex$10 \ No newline at end of file diff --git a/tests/data/es/ordinal.txt b/tests/data/es/ordinal.txt new file mode 100644 index 0000000..b224775 --- /dev/null +++ b/tests/data/es/ordinal.txt @@ -0,0 +1,30 @@ +primero~primero +tercera~tercera +primer~primer +tercer~tercer +noveno~noveno +novena~novena +décimo~10.º +décima~10.ª +undécimo~11.º +undécima~11.ª +decimoprimero~11.º +décimo primero~11.º +decimoprimer~11.ᵉʳ +décimo primer~11.ᵉʳ +decimoprimera~11.ª 
+décima primera~11.ª +(technically ungrammatical) décimo primera~(technically ungrammatical) 11.ª +decimotercero~13.º +vigésimo primero~21.º +vigésima primera~21.ª +(technically ungrammatical) vigésimo primera~(technically ungrammatical) 21.ª +vigésimo primer~21.ᵉʳ +vigesimosegundo~22.º +vigésimo segundo~22.º +vigesimosegunda~22.ª +vigésima segunda~22.ª +vigésimo tercero~23.º +centésimo undécimo~111.º +centésimo trigésimo cuarto~134.º +vigesimoctavo~28.º \ No newline at end of file diff --git a/tests/data/es/ordinal_cased.txt b/tests/data/es/ordinal_cased.txt new file mode 100644 index 0000000..0dd13fd --- /dev/null +++ b/tests/data/es/ordinal_cased.txt @@ -0,0 +1,11 @@ +primero~primero +Tercera~Tercera +Primer~Primer +tercer~tercer +Décima~10.ª +undécimo~11.º +Decimoprimer~11.ᵉʳ +Décimo primer~11.ᵉʳ +Décima Primera~11.ª +(technically ungrammatical) décimo primera~(technically ungrammatical) 11.ª +decimotercero~13.º \ No newline at end of file diff --git a/tests/data/es/telephone.txt b/tests/data/es/telephone.txt new file mode 100644 index 0000000..18899ba --- /dev/null +++ b/tests/data/es/telephone.txt @@ -0,0 +1,9 @@ +uno dos tres uno dos tres cinco seis siete ocho~123-123-5678 +uno veintitrés uno veintitrés cincuenta y seis setenta y ocho~123-123-5678 +uno dos tres cuatro cinco seis siete ocho nueve~123-456-789 +uno veintitrés cuatro cincuenta y seis siete ochenta y nueve~123-456-789 +uno dos tres cuatro cinco seis siete ocho~1234-5678 +doce treinta y cuatro cincuenta y seis setenta y ocho~1234-5678 +triple tres uno dos tres cinco seis siete ocho~333-123-5678 +más uno uno dos tres uno dos tres cinco seis siete ocho~+1-123-123-5678 +más cincuenta y cuatro uno dos tres uno dos tres cinco seis siete ocho extensión doce~+54-123-123-5678 ext. 
12 \ No newline at end of file diff --git a/tests/data/es/telephone_cased.txt b/tests/data/es/telephone_cased.txt new file mode 100644 index 0000000..068867d --- /dev/null +++ b/tests/data/es/telephone_cased.txt @@ -0,0 +1,6 @@ +Uno dos tres uno dos tres cinco seis siete ocho~123-123-5678 +uno veintitrés uno veintitrés cincuenta y seis setenta y ocho~123-123-5678 +Uno Dos Tres Cuatro Cinco Seis Siete Ocho Nueve~123-456-789 +Triple tres uno dos tres cinco seis siete ocho~333-123-5678 +Más uno uno dos tres uno dos tres cinco seis siete ocho~+1-123-123-5678 +más cincuenta y cuatro uno dos tres uno dos tres cinco seis siete ocho Extensión doce~+54-123-123-5678 ext. 12 \ No newline at end of file diff --git a/tests/data/es/time.txt b/tests/data/es/time.txt new file mode 100644 index 0000000..e74a63f --- /dev/null +++ b/tests/data/es/time.txt @@ -0,0 +1,25 @@ +las dieciséis cincuenta~las 16:50 +la una~la una +las dos~las dos +las tres personas~las tres personas +las dos a eme~las 2:00 a.m. +la una pe eme~la 1:00 p.m. +la una y diez~la 1:10 +la una y diez a eme~la 1:10 a.m. +la una y diez pe eme~la 1:10 p.m. +la una diez~la 1:10 +la una con diez~la 1:10 +la una y cuarto~la 1:15 +la una y media~la 1:30 +las dos menos veinte~la 1:40 +las dos menos cuarto~la 1:45 +cuarto para las dos~la 1:45 +un cuarto para las dos~la 1:45 +las veintitrés y media~las 23:30 +las veintitrés y cincuenta y nueve~las 23:59 +las dos de la tarde~las 2:00 p.m. +cuarto para las cero~las 23:45 +cuarto para las veinticuatro~las 23:45 +diez para las doce~las 11:50 +dos y media de la tarde~2:30 p.m. +la una de la tarde u t c más cuatro~la 1:00 p.m. UTC+4 diff --git a/tests/data/es/time_cased.txt b/tests/data/es/time_cased.txt new file mode 100644 index 0000000..ba450d7 --- /dev/null +++ b/tests/data/es/time_cased.txt @@ -0,0 +1,9 @@ +las dieciséis cincuenta~las 16:50 +la una~la una +Las dos~Las dos +Las tres personas~Las tres personas +Las Dos a eme~Las 2:00 a.m. +la una Pe Eme~la 1:00 P.M. 
+la una y diez~la 1:10 +la una y Diez a eme~la 1:10 a.m. +La Una Y Diez pe eme~La 1:10 p.m. \ No newline at end of file diff --git a/tests/data/es/whitelist.txt b/tests/data/es/whitelist.txt new file mode 100644 index 0000000..d6aa321 --- /dev/null +++ b/tests/data/es/whitelist.txt @@ -0,0 +1,5 @@ +usted~Ud. +ustedes~Uds. +habla usted español~habla Ud. español +hablan ustedes español~hablan Uds. español +estados unidos~EE. UU. \ No newline at end of file diff --git a/tests/data/es/word.txt b/tests/data/es/word.txt new file mode 100644 index 0000000..80b5275 --- /dev/null +++ b/tests/data/es/word.txt @@ -0,0 +1,49 @@ +~ +yahoo!~yahoo! +veinte!~20 ! +x ~x +—~— +aaa~aaa +aabach~aabach +aabenraa~aabenraa +aabye~aabye +aaccessed~aaccessed +aach~aach +aachen's~aachen's +aadri~aadri +aafia~aafia +aagaard~aagaard +aagadu~aagadu +aagard~aagard +aagathadi~aagathadi +aaghart's~aaghart's +aagnes~aagnes +aagomoni~aagomoni +aagon~aagon +aagoo~aagoo +aagot~aagot +aahar~aahar +aahh~aahh +aahperd~aahperd +aaibinterstate~aaibinterstate +aajab~aajab +aakasa~aakasa +aakervik~aakervik +aakirkeby~aakirkeby +aalam~aalam +aalbaek~aalbaek +aaldiu~aaldiu +aalem~aalem +a'ali~a'ali +aalilaassamthey~aalilaassamthey +aalin~aalin +aaliyan~aaliyan +aaliyan's~aaliyan's +aamadu~aamadu +aamara~aamara +aambala~aambala +aamera~aamera +aamer's~aamer's +aamina~aamina +aaminah~aaminah +aamjiwnaang~aamjiwnaang diff --git a/tests/data/es/word_cased.txt b/tests/data/es/word_cased.txt new file mode 100644 index 0000000..3868101 --- /dev/null +++ b/tests/data/es/word_cased.txt @@ -0,0 +1,11 @@ +~ +Yahoo!~Yahoo! +Veinte!~20 ! 
+X ~X +—~— +AAA~AAA +Aabach~Aabach +aabenraa~aabenraa +Aachen's~Aachen's +aadri~aadri +aaliyan's~aaliyan's \ No newline at end of file diff --git a/tests/data/fr/cardinal.txt b/tests/data/fr/cardinal.txt new file mode 100644 index 0000000..9cebe04 --- /dev/null +++ b/tests/data/fr/cardinal.txt @@ -0,0 +1,106 @@ +cent~100 +dix-huit~18 +vingt et un~21 +vingt-et-un~21 +trente et un~31 +trente-et-un~31 +quarante-trois~43 +quarante trois~40 trois +cinquante et un~51 +cinquante-et-un~51 +soixante et un~61 +soixante-et-un~61 +soixante-dix~70 +soixante-douze~72 +quatre-vingts~80 +quatre-vingt-dix-huit~98 +cent~100 +cent deux~102 +cent-deux~102 +cent vingt~120 +cent-vingt~120 +deux-cents~200 +deux cent neuf~209 +deux-cent-neuf~209 +cent onze~111 +cent-onze~111 +mille~1000 +cent vingt~120 +cent-vingt~120 +mille vingt~1020 +mille-vingt~1020 +neuf billion sept cent quatre-vingt-neuf milliard trois cent quatre-vingt-deux million cinq cent trente-six mille cent trente~9789382536130 +neuf-billion-sept-cent-quatre-vingt-neuf-milliard-trois-cent-quatre-vingt-deux-million-cinq-cent-trente-six-mille-cent-trente~9789382536130 +deux cent cinquante-quatre~254 +deux-cent-cinquante-quatre~254 +cent quarante-sept mille quatre cent cinquante et une~147451 +cent-quarante-sept-mille-quatre-cent-cinquante-et-une~147451 +un million cent cinquante-six mille cent soixante-treize~1156173 +un-million-cent-cinquante-six-mille-cent-soixante-treize~1156173 +un milliard cinq cent quatre-vingt-treize million soixante-douze mille neuf cent soixante et un~1593072961 +un-milliard-cinq-cent-quatre-vingt-treize-million-soixante-douze-mille-neuf-cent-soixante-et-un~1593072961 +un milliard cinq cent quatre-vingt-treize million septante-deux mille neuf cent soixante et un~1593072961 +un-milliard-cinq-cent-quatre-vingt-treize-million-septante-deux-mille-neuf-cent-soixante-et-un~1593072961 +quatre-vingt-dix-sept billiard huit cent huit billion deux cent soixante-quatre milliard sept cent soixante-douze million 
sept cent quatre-vingt-douze mille cinq~97808264772792005 +quatre-vingt-dix-sept-billiard-huit-cent-huit-billion-deux-cent-soixante-quatre-milliard-sept-cent-soixante-douze-million-sept-cent-quatre-vingt-douze-mille-cinq~97808264772792005 +dix billiard dix billion dix million cent mille dix~10010000010100010 +dix-billiard-dix-billion-dix-million-cent-mille-dix~10010000010100010 +moins vingt-cinq mille trente-sept~-25037 +moins vingt-cinq-mille-trente-sept~-25037 +moins dix-neuf cent trente-sept~-1937 +moins dix-neuf-cent-trente-sept~-1937 +un billiard deux cent soixante-quatre billion trois cent un milliard neuf cent trente-huit million cent quatre~1264301938000104 +un-billiard-deux-cent-soixante-quatre-billion-trois-cent-un-milliard-neuf-cent-trente-huit-million-cent-quatre~1264301938000104 +moins soixante~-60 +quarante-six mille six cent soixante-quatre~46664 +quarante-six-mille-six-cent-soixante-quatre~46664 +soixante~60 +zéro~zéro +un~un +une~une +deux~deux +neuf~neuf +dix~10 +onze~11 +douze~12 +treize~13 +quatorze~14 +quinze~15 +seize~16 +dix-sept~17 +dix-huit~18 +vingt~20 +trente~30 +quarante~40 +cinquante~50 +soixante~60 +soixante-dix~70 +septante~70 +quatre-vingts~80 +huitante~80 +quatre-vingt-dix~90 +deux million dix~2000010 +deux-million-dix~2000010 +mille treize~1013 +mille-treize~1013 +mille un~1001 +mille-un~1001 +mille cent~1100 +mille-cent~1100 +onze cents~1100 +onze-cents~1100 +dix-huit mille treize~18013 +dix-huit-mille-treize~18013 +mille vingt-six~1026 +mille-vingt-six~1026 +mille cent vingt-six~1126 +mille-cent-vingt-six~1126 +onze cent vingt-six~1126 +onze-cent-vingt-six~1126 +dix-huit million quatre cent cinquante mille neuf cent quatre-vingt-dix~18450990 +dix-huit-million-quatre-cent-cinquante-mille-neuf-cent-quatre-vingt-dix~18450990 +dix-huit-million-quatre-cent-cinquante-mille-neuf-cent-nonante~18450990 +dix-huit mille huit cent quatre-vingts~18880 +dix-huit-mille-huit-cent-quatre-vingts~18880 +dix-huit mille huit cent huitante~18880 
+dix-huit-mille-huit-cent-huitante~18880 \ No newline at end of file diff --git a/tests/data/fr/date.txt b/tests/data/fr/date.txt new file mode 100644 index 0000000..b31c11c --- /dev/null +++ b/tests/data/fr/date.txt @@ -0,0 +1,6 @@ +vingt-quatre juillet deux-mille-treize~24 juillet 2013 +vingt-quatre juillet~24 juillet +quatorze janvier~14 janvier +premier janvier~1ᵉʳ janvier +trente juin~30 juin +dix-huit mai dix-neuf cent trente~18 mai 1930 \ No newline at end of file diff --git a/tests/data/fr/decimal.txt b/tests/data/fr/decimal.txt new file mode 100644 index 0000000..6e14ac0 --- /dev/null +++ b/tests/data/fr/decimal.txt @@ -0,0 +1,15 @@ +zéro virgule deux million~0,2 million +dix-huit milliards~18 milliards +quatre cent soixante millions~460 millions +quatre-cent-soixante millions~460 millions +quatre-cent-soixante-millions~460 millions +cent vingt millions~120 millions +cent-vingt-millions~120 millions +cent vingt millions~120 millions +dix billions~10 billions +dix-billions~10 billions +moins soixante virgule deux quatre zéro zéro~-60,240 0 +huit cent dix-huit virgule trois zéro trois~818,303 +huit-cent-dix-huit virgule trois zéro trois~818,303 +huit-cent-dix-huit virgule trente trois~818,303 +mille-huit-cent-dix-huit virgule trois zéro trois trois quatre~1 818,303 34 \ No newline at end of file diff --git a/tests/data/fr/electronic.txt b/tests/data/fr/electronic.txt new file mode 100644 index 0000000..f70075b --- /dev/null +++ b/tests/data/fr/electronic.txt @@ -0,0 +1,10 @@ +a point b c arobase g mail point com~a.bc@gmail.com +a point b c at g mail point com~a.bc@gmail.com +c d f at a b c point e d u~cdf@abc.edu +a b c at g mail point a b c~abc@gmail.abc +a b c arobase g mail point a b c~abc@gmail.abc +a b c at a b c point com~abc@abc.com +a s d f un deux trois at a b c point com~asdf123@abc.com +a un b deux arobase a b c point com~a1b2@abc.com +a b trois point s d d point trois at g mail point com~ab3.sdd.3@gmail.com +a b trois point s d d point trois 
arobase g mail point com~ab3.sdd.3@gmail.com \ No newline at end of file diff --git a/tests/data/fr/measure.txt b/tests/data/fr/measure.txt new file mode 100644 index 0000000..af99890 --- /dev/null +++ b/tests/data/fr/measure.txt @@ -0,0 +1,15 @@ +deux cents mètres~200 m +cinquante-six virgule trois par kilomètre carré~56,3 /km² +deux-cents kilomètres par heure~200 km/h +deux-cents kilomètres heure~200 km/h +quarante-deux-mille-deux-cent-cinquante-neuf par mètre carré~42 259 /m² +moins soixante-six kilogrammes~-66 kg +un virgule zéro zéro zéro zéro vingt-huit centimètre cube~1,000 028 cm³ +cinquante minutes~50 min +deux mètres cubes~2 m³ +quatre-vingt-dix grammes~90 g +quatre-cent-quarante millilitres~440 ml +trois cents micromètres~300 µm +soixante-cinq kilomètres carrés~65 km² +deux kilomètres par heure~2 km/h +soixante virgule vingt-quatre zéro zéro kilogrammes~60,240 0 kg \ No newline at end of file diff --git a/tests/data/fr/money.txt b/tests/data/fr/money.txt new file mode 100644 index 0000000..9d67e8f --- /dev/null +++ b/tests/data/fr/money.txt @@ -0,0 +1,22 @@ +deux dollars~2 $ +un centime~0,01 € +vingt centimes~0,20 € +vingt-deux centimes~0,22 € +deux dollars vingt~2,20 $ +deux euros et vingt centimes~2,20 € +vingt euros~20 € +un franc suisse~1 CHF +vingt euro cinq~20,05 € +un euro~1 € +deux euro~2 € +cinq euro et soixante~5,60 € +cinquante centimes~0,50 € +quatre-vingt mille won~80 000 ₩ +quatre-vingt-mille won~80 000 ₩ +quatre-vingt-millions de wons~80 millions de wons +trois livre~3 £ +trois pence~0,03 £ +zéro euro~0 € +zéro euro quatre-vingt~0,80 € +deux-millions de dollars~2 millions de dollars +quatre virgule quatre-vingt milliards d'euros~4,80 milliards d'euros \ No newline at end of file diff --git a/tests/data/fr/ordinal.txt b/tests/data/fr/ordinal.txt new file mode 100644 index 0000000..5d5c8ef --- /dev/null +++ b/tests/data/fr/ordinal.txt @@ -0,0 +1,23 @@ +centième~100ᵉ +centièmes~100ᵉˢ +vingt-cinq-mille-cent-onzième~25111ᵉ +première~1ʳᵉ 
+premières~1ʳᵉˢ +premier~1ᵉʳ +premiers~1ᵉʳˢ +second~2ᵈ +seconds~2ᵈˢ +seconde~2ᵈᵉ +secondes~2ᵈᵉˢ +deuxième~2ᵉ +troisième~3ᵉ +quatrième~4ᵉ +onzièmes~11ᵉˢ +treizième~13ᵉ +vingt-et-unième~21ᵉ +vingt-troisièmes~23ᵉˢ +cent-onzième~111ᵉ +cent onzième~111ᵉ +millième~1000ᵉ +dix-neuvième siècle~XIXᵉ siècle +vingtième siècle~XXᵉ siècle \ No newline at end of file diff --git a/tests/data/fr/telephone.txt b/tests/data/fr/telephone.txt new file mode 100644 index 0000000..d8ffa6b --- /dev/null +++ b/tests/data/fr/telephone.txt @@ -0,0 +1,5 @@ +zéro deux douze trente-deux trente trente~02 12 32 30 30 +zéro deux une deux trois deux trois zéro trois zéro~02 12 32 30 30 +deux douze trente-deux trente trente~02 12 32 30 30 +deux une deux trois deux trois zéro trois zéro~02 12 32 30 30 +double neuf douze trente-deux trente trente~99 12 32 30 30 \ No newline at end of file diff --git a/tests/data/fr/time.txt b/tests/data/fr/time.txt new file mode 100644 index 0000000..a838131 --- /dev/null +++ b/tests/data/fr/time.txt @@ -0,0 +1,18 @@ +huit heures~8 h +huit heures du matin~8 h +huit heures du soir~20 h +minuit~0 h +deux heures de l'après-midi~14 h +quatorze heures~14 h +midi~12 h +dix-huit heures~18 h +huit heures sept~8 h 07 +minuit dix-sept~0 h 17 +douze heures~12 h +onze heures et demie~11 h 30 +midi moins le quart~11 h 45 +onze heures et trois quarts~11 h 45 +midi moins trois~11 h 57 +onze heures cinquante-sept~11 h 57 +onze heures trente-huit~11 h 38 +midi moins vingt-deux~11 h 38 \ No newline at end of file diff --git a/tests/data/fr/whitelist.txt b/tests/data/fr/whitelist.txt new file mode 100644 index 0000000..8535bfd --- /dev/null +++ b/tests/data/fr/whitelist.txt @@ -0,0 +1,8 @@ +docteur~Dʳ +docteures~Dʳᵉˢ +monsieur~M. +messieurs~MM. 
+madame~Mᵐᵉ +mesdames~Mᵐᵉˢ +mademoiselle~Mˡˡᵉ +mademoiselles~Mˡˡᵉˢ \ No newline at end of file diff --git a/tests/data/fr/word.txt b/tests/data/fr/word.txt new file mode 100644 index 0000000..66f3445 --- /dev/null +++ b/tests/data/fr/word.txt @@ -0,0 +1,49 @@ +~ +yahoo!~yahoo! +vingt!~20 ! +x ~x +—~— +aaa~aaa +aabach~aabach +aabenraa~aabenraa +aabye~aabye +aaccessed~aaccessed +aach~aach +aachen's~aachen's +aadri~aadri +aafia~aafia +aagaard~aagaard +aagadu~aagadu +aagard~aagard +aagathadi~aagathadi +aaghart's~aaghart's +aagnes~aagnes +aagomoni~aagomoni +aagon~aagon +aagoo~aagoo +aagot~aagot +aahar~aahar +aahh~aahh +aahperd~aahperd +aaibinterstate~aaibinterstate +aajab~aajab +aakasa~aakasa +aakervik~aakervik +aakirkeby~aakirkeby +aalam~aalam +aalbaek~aalbaek +aaldiu~aaldiu +aalem~aalem +a'ali~a'ali +aalilaassamthey~aalilaassamthey +aalin~aalin +aaliyan~aaliyan +aaliyan's~aaliyan's +aamadu~aamadu +aamara~aamara +aambala~aambala +aamera~aamera +aamer's~aamer's +aamina~aamina +aaminah~aaminah +aamjiwnaang~aamjiwnaang diff --git a/tests/data/hi/address.txt b/tests/data/hi/address.txt new file mode 100644 index 0000000..69447a6 --- /dev/null +++ b/tests/data/hi/address.txt @@ -0,0 +1,25 @@ +सात शून्य शून्य ओक स्ट्रीट~७०० ओक स्ट्रीट +एक एक जंगल रोड~११ जंगल रोड +तीन शून्य एक पार्क एवेन्यू~३०१ पार्क एवेन्यू +गली नंबर एक सात जीएकगढ़~गली नंबर १७ जीएकगढ़ +अदनान अपार्टमेंट फ्लैट नंबर पाँच पाँच~अदनान अपार्टमेंट फ्लैट नंबर ५५ +प्लॉट नंबर आठ बालाजी मार्केट~प्लॉट नंबर ८ बालाजी मार्केट +बूथ सात शून्य, सेक्टर आठ, चंडीगढ़~बूथ ७०, सेक्टर ८, चंडीगढ़ +दो दो दो एक सदर्न स्ट्रीट~२२२१ सदर्न स्ट्रीट +छह दो पाँच स्कूल स्ट्रीट~६२५ स्कूल स्ट्रीट +पाँच शून्य छह स्टेट रोड~५०६ स्टेट रोड +छह छह हाइफ़न चार, पार्कहर्स्ट रोड~६६-४, पार्कहर्स्ट रोड +एक चार बटा तीन, मथुरा रोड~१४/३, मथुरा रोड +अमरावती छह पाँच पाँच नौ तीन शून्य~अमरावती ६५५९३० +अमरावती चार छह आठ दो पाँच दो~अमरावती ४६८२५२ +शिमला, हिमाचल प्रदेश पाँच नौ तीन नौ आठ आठ~शिमला, हिमाचल प्रदेश ५९३९८८ +रांची, झारखंड सात तीन छह पाँच पाँच सात~रांची, 
झारखंड ७३६५५७ +कोहिमा, नागालैंड चार चार आठ तीन सात सात~कोहिमा, नागालैंड ४४८३७७ +मुंबई, महाराष्ट्र आठ तीन नौ चार आठ आठ~मुंबई, महाराष्ट्र ८३९४८८ +मुंबई, महाराष्ट्र दो नौ शून्य नौ तीन सात~मुंबई, महाराष्ट्र २९०९३७ +गांधीनगर, गुजरात आठ शून्य आठ तीन सात चार~गांधीनगर, गुजरात ८०८३७४ +रायपुर, छत्तीसगढ़ एक एक शून्य छह तीन पाँच~रायपुर, छत्तीसगढ़ ११०६३५ +भोपाल, मध्य प्रदेश सात पाँच एक दो दो पाँच~भोपाल, मध्य प्रदेश ७५१२२५ +अगरतला, त्रिपुरा नौ एक पाँच तीन शून्य पाँच~अगरतला, त्रिपुरा ९१५३०५ +लखनऊ, उत्तर प्रदेश आठ शून्य दो चार आठ एक~लखनऊ, उत्तर प्रदेश ८०२४८१ +श्रीनगर, जम्मू और कश्मीर नौ छह चार पाँच दो तीन~श्रीनगर, जम्मू और कश्मीर ९६४५२३ diff --git a/tests/data/hi/cardinal.txt b/tests/data/hi/cardinal.txt new file mode 100644 index 0000000..4a72216 --- /dev/null +++ b/tests/data/hi/cardinal.txt @@ -0,0 +1,54 @@ +चार चौके~४ चौके +छः खिलाड़ी आउट~६ खिलाड़ी आउट +वनप्लस आठ प्रो~वनप्लस ८ प्रो +पाँच चार्जर~५ चार्जर +चार ओवर में सत्रह रन~४ ओवर में १७ रन +पाँच चॉकलेट्स नौ टॉफ़िज़~५ चॉकलेट्स ९ टॉफ़िज़ +दस हजार निन्यानवे~१००९९ +एक लाख एक~१००००१ +एक सौ~१०० +तीन सौ नौ~३०९ +सात सौ अट्ठानवे~७९८ +पाँच हज़ार~५००० +आठ हज़ार चार~८००४ +नौ हज़ार सोलह~९०१६ +उन्नीस सौ बारह~१९१२ +दो हज़ार दो सौ बाईस~२२२२ +चौदह हज़ार~१४००० +अठारह हज़ार छह~१८००६ +छब्बीस हज़ार इक्कीस~२६०२१ +छियानवे हज़ार आठ सौ ग्यारह~९६८११ +चार लाख~४००००० +दो लाख दो~२००००२ +सात लाख बीस~७०००२० +नौ लाख तीन सौ इक्कीस~९००३२१ +आठ लाख पाँच हज़ार तीन सौ इक्कीस~८०५३२१ +तेईस लाख~२३००००० +पन्द्रह लाख एक~१५००००१ +सत्ताईस लाख आठ सौ बीस~२७००८२० +इक्यानवे लाख इकतीस हज़ार आठ सौ उनतीस~९१३१८२९ +तीन करोड़~३००००००० +एक करोड़ एक~१००००००१ +सात करोड़ तेरह~७०००००१३ +चार करोड़ नौ सौ ग्यारह~४००००९११ +छः करोड़ पाँच हज़ार नौ सौ ग्यारह~६०००५९११ +छः करोड़ पच्चीस हज़ार नौ सौ ग्यारह~६००२५९११ +तीन करोड़ एक लाख पच्चीस हज़ार नौ सौ ग्यारह~३०१२५९११ +दो करोड़ सत्रह लाख पच्चीस हज़ार नौ सौ ग्यारह~२१७२५९११ +तीस करोड़~३०००००००० +अट्ठानवे लाख छिहत्तर हज़ार सात सौ नवासी~९८७६७८९ +तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ~२३४५५६७ +एक करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ~१२३४५५६७ +एक 
करोड़ इक्कीस लाख इक्कीस हज़ार दो सौ बारह~१२१२१२१२ +एक अरब बारह करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ~११२२३४५५६७ +एक अरब दो करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ~१०२२३४५५६७ +ग्यारह अरब दो करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ~११०२२३४५५६७ +इक्यावन अरब दो करोड़ तेईस लाख पैंतालीस हज़ार पाँच सौ सड़सठ~५१०२२३४५५६७ +सवा सात सौ~७२५ +साढ़े सात सौ~७५० +साढ़े सात हज़ार~७५०० +सवा सात हज़ार~७२५० +डेढ़ सौ~१५० +ढाई सौ~२५० +साढ़े सोलह सौ~१६५० +सवा सोलह सौ~१६२५ diff --git a/tests/data/hi/date.txt b/tests/data/hi/date.txt new file mode 100644 index 0000000..402361d --- /dev/null +++ b/tests/data/hi/date.txt @@ -0,0 +1,42 @@ +छः मई~६ मई +तीस जून~३० जून +पच्चीस मार्च दो हज़ार दस~२५ मार्च, २०१० +तीस मार्च उन्नीस सौ नब्बे~३० मार्च, १९९० +मार्च तीस उन्नीस सौ नब्बे~मार्च ३०, १९९० +उन्नीस जून दो हज़ार पाँच~१९ जून, २००५ +पन्द्रह जून दो हज़ार उन्नीस~१५ जून, २०१९ +आठ जनवरी~८ जनवरी +अठारह जुलाई~१८ जुलाई +छब्बीस नवंबर~२६ नवंबर +तीन अप्रैल~३ अप्रैल +चार जनवरी~४ जनवरी +एक अक्टूबर~१ अक्टूबर +तेरह सितंबर~१३ सितंबर +मार्च दो हज़ार दस~मार्च २०१० +दस मार्च~१० मार्च +बारह दिसंबर~१२ दिसंबर +दिसंबर बारह~दिसंबर १२ +एक सितंबर~१ सितंबर +तीन फ़रवरी~३ फ़रवरी +सात जून~७ जून +सत्ताईस जुलाई दो हज़ार ग्यारह~२७ जुलाई, २०११ +जुलाई सत्ताईस~जुलाई २७ +वर्ष दो हज़ार उन्नीस~वर्ष २०१९ +सन उन्नीस सौ नब्बे~सन १९९० +उन्नीस सौ नब्बे से उन्नीस सौ इक्यानबे~१९९०-१९९१ +दो हज़ार पाँच से दो हज़ार उन्नीस~२००५-२०१९ +दो हज़ार पाँच से उन्नीस~२००५-१९ +चौंतीस सौ ईसा पूर्व~३४०० ई.पू. +उन्नीस सौ बीस ईस्वी~१९२० ई. +पच्चीस जनवरी अठारह सौ तिरेपन ईसवी~२५ जनवरी, १८५३ ई. +इकत्तीस मई उन्नीस सौ नब्बे ईसवी~३१ मई, १९९० ई. +पच्चीस ईसा पूर्व~२५ ई.पू. +मार्च की दो~मार्च २ +फ़रवरी की बीस~फ़रवरी २० +उन्नीस सौ नब्बे से उन्नीस सौ इक्यानबे ईसवी~१९९०-१९९१ ई. +दो हज़ार पाँच से दो हज़ार उन्नीस ईसा पूर्व~२००५-२०१९ ई.पू. 
+दसवें शताब्दी~१०वें शताब्दी +अठाहरवीं शताब्दी~१८वीं शताब्दी +एक हज़ार एकवीं शताब्दी~१००१वीं शताब्दी +एक सौ उन्नीसवां शताब्दी~११९वां शताब्दी +उन्नीस सौ बीस से छब्बीस तक~१९२०-२६ तक \ No newline at end of file diff --git a/tests/data/hi/decimal.txt b/tests/data/hi/decimal.txt new file mode 100644 index 0000000..5b8d866 --- /dev/null +++ b/tests/data/hi/decimal.txt @@ -0,0 +1,13 @@ +दो सौ छह दशमलव दो नौ~२०६.२९ +एक सौ एक दशमलव छह~१०१.६ +एक सौ नौ दशमलव आठ~१०९.८ +एक सौ आठ दशमलव सात पाँच~१०८.७५ +एक सौ दस दशमलव सात पाँच~११०.७५ +एक सौ दो दशमलव तीन~१०२.३ +एक सौ छह दशमलव पाँच~१०६.५ +साढ़े तीन सौ दशमलव दो दो~३५०.२२ +सवा तीन सौ दशमलव दो~३२५.२ +साढ़े चार सौ दशमलव सात पाँच~४५०.७५ +सवा चार सौ दशमलव सात पाँच~४२५.७५ +ढाई सौ दशमलव छह~२५०.६ +डेढ़ सौ दशमलव सात पाँच~१५०.७५ diff --git a/tests/data/hi/fraction.txt b/tests/data/hi/fraction.txt new file mode 100644 index 0000000..21ceff6 --- /dev/null +++ b/tests/data/hi/fraction.txt @@ -0,0 +1,31 @@ +एक सौ नौ बटा एक सौ चौबीस~१०९/१२४ +एक सौ एक बटा दो~१०१/२ +दो सौ एक बटा दो~२०१/२ +एक सौ एक बटा चार~१०१/४ +दो सौ बटा पाँच सौ~२००/५०० +दो सौ बटा बारह~२००/१२ +एक सौ तेईस बटा एक सौ पच्चीस~१२३/१२५ +छह सौ बासठ बटा एक~६६२/१ +एक सौ पाँच बटा सात~१०५/७ +छह सौ चौवन बटा तीन~६५४/३ +एक सौ तैंतीस सही एक बटा दो~१३३ १/२ +एक सौ तैंतीस सही दो बटा तीन~१३३ २/३ +एक सही छह बटा छह~१ ६/६ +दो सही एक बटा छह~२ १/६ +तीन सही तीन बटा चार~३ ३/४ +एक सौ बीस सही तीन बटा चार~१२० ३/४ +एक सौ बीस सही पिछत्तर बटा नब्बे~१२० ७५/९० +तीन सही तीन बटा चार~३ ३/४ +सवा चौरासी~८४ १/४ +डेढ़~१ १/२ +ढाई~२ १/२ +आधा~१/२ +साढ़े~१/२ +सवा~१/४ +पौन~३/४ +पौना~३/४ +सवा पैंतीस~३५ १/४ +साढ़े चार सौ बटा दस~४५०/१० +तीन चौथाई~३/४ +दो तिहाई~२/३ +एक चौथाई~१/४ diff --git a/tests/data/hi/measure.txt b/tests/data/hi/measure.txt new file mode 100644 index 0000000..21615f1 --- /dev/null +++ b/tests/data/hi/measure.txt @@ -0,0 +1,48 @@ +दो सौ छह दशमलव दो नौ ग्राम~२०६.२९ g +दो सौ छह ग्राम~२०६ g +इक्कीस दशमलव शून्य सेल्सियस~२१.० °C +इक्कीस सेल्सियस~२१ °C +बारह हज़ार तेरह दशमलव सात सात सात डेसिग्राम~१२०१३.७७७ dg +बारह 
हज़ार तेरह डेसिग्राम~१२०१३ dg +चार सौ उनतीस दशमलव एक कैल्विन~४२९.१ K +चार सौ उनतीस कैल्विन~४२९ K +बाईस दशमलव शून्य पाँच मिलिग्राम~२२.०५ mg +बाईस मिलिग्राम~२२ mg +नौ हज़ार दशमलव शून्य शून्य मीट्रिक टन~९०००.०० t +पच्चीस दशमलव एक किग्रा~२५.१ kg +पच्चीस किग्रा~२५ kg +बानवे हज़ार तीन सौ तिरानवे दशमलव शून्य शून्य चार मिलीमीटर~९२३९३.००४ mm +बानवे हज़ार तीन सौ तिरानवे मिलीमीटर~९२३९३ mm +सात दशमलव सात इंच~७.७ in +पाँच सौ दशमलव आठ नौ तीन माइक्रॉन~५००.८९३ µm +पाँच सौ माइक्रॉन~५०० µm +पच्चीस सौ दशमलव छः छः फुट~२५००.६६ ft +पच्चीस सौ फुट~२५०० ft +छप्पन हज़ार तीस दशमलव दो वर्गसेंटीमीटर~५६०३०.२ cm² +छप्पन हज़ार तीस वर्ग सेंटीमीटर~५६०३० cm² +छियासठ दशमलव एक एकड़~६६.१ ac +छियासठ एकड़~६६ ac +चौंतीस सौ नौ दशमलव सात पाँच क्यूबिकमिलीमीटर~३४०९.७५ mm³ +छे सौ अठारह दशमलव दो दो लीटर~६१८.२२ L +चार हज़ार दशमलव शून्य शून्य गैलन~४०००.०० gal +चार हज़ार गैलन~४००० gal +तैंतीस दशमलव तीन तीन किलोमीटर प्रति घंटा~३३.३३ km/h +चौदह हज़ार इकहत्तर दशमलव नौ नौ पिंट~१४०७१.९९ pt +बहत्तर दशमलव आठ तीन मील प्रति घंटा~७२.८३ mi/h +बहत्तर मील प्रति घंटा~७२ mi/h +पौने ग्यारह घंटे~१०.७५ h +साढ़े सात वर्ष~७.५ yr +सवा ग्यारह सौ मीटर~११२५ m +पौने चार सौ हेक्टेयर~३७५ ha +साढ़े दस घन फीट~१०.५ ft³ +पौने पांच सौ किलोमीटर~४७५ km +ढाई सौ गैलन~२५० gal +डेढ़ दर्जन~१.५ doz +साढ़े सात ऐंपीयर~७.५ A +पौने तीन हजार एकड़~२७५० ac +साढ़े बारह वर्ग माइक्रोमीटर~१२.५ µm² +ढाई महीने~२.५ mo +दो बाई दो~२x२ +दो बाई दो~२x२ +पाँच बाई पाँच~५x५ +बाईस बाई पाँच घन फीट~२२x५ ft³ diff --git a/tests/data/hi/money.txt b/tests/data/hi/money.txt new file mode 100644 index 0000000..8821940 --- /dev/null +++ b/tests/data/hi/money.txt @@ -0,0 +1,50 @@ +तैंतीस अल्जीरियाई दिनार~دج३३ +बारह हज़ार तेरह डॉलर~$१२०१३ +चौदह हज़ार इकहत्तर दशमलव नौ नौ बेलारूसी रूबल~br१४०७१.९९ +छे सौ अठारह चीनी युआन~元६१८ +अट्ठाईस सौ दशमलव शून्य आठ आर्मेनियाई ड्राम~֏२८००.०८ +पच्चीस सौ छः अरूबान फ्लोरिन~ƒ२५०६ +बहत्तर त्रिनिदाद और टोबैगो डॉलर~tt$७२ +छियासठ तुर्की लिरा~₺६६ +चार सौ उनतीस युगांडा शिलिंग~ush४२९ +बाईस दशमलव शून्य पाँच यूक्रेनी ग्रिव्ना~₴२२.०५ +पच्चीस वॉन~₩२५ +छप्पन हज़ार तीस 
वेनेजुएलन बोलिवार~bs.५६०३० +चौंतीस सौ नौ साइप्रस पाउंड~cyp३४०९ +बानवे हज़ार तीन सौ तिरानवे दशमलव शून्य शून्य चार लिलांगेनी~l९२३९३.००४ +छे सौ अठारह बहरीन दिरहम~.د.ب६१८ +दो सौ छह रुपये दो सौ छह पैसे~₹२०६.२०६ +अड़तीस रुपिया~₹३८ +इक्यानबे सौ रुपेया और दो सौ पैसा~₹९१००.२०० +नौ हज़ार दशमलव शून्य शून्य पैसे~p९०००.०० +चौदह हज़ार इकहत्तर अजरबैजानी मनात~₼१४०७१ +इकहत्तर हज़ार इकहत्तर बिटकॉइन~₿७१०७१ +बत्तीस बुरुंडी फ्रैंक~fbu३२ +पन्द्रह सौ कैमन आइलैंड्स डॉलर~ci$१५०० +छह सौ पच्चीस रुपये दो पैसे~₹६२५.२ +साढ़े सात सौ डॉलर~$७५० +सवा दो सौ यूक्रेनी ग्रिव्ना~₴२२५ +साढ़े छः लाख रुपए~₹६५०००० +सवा छः लाख अल्जीरियाई दिनार~دج६२५००० +सवा पंद्रह लाख युगांडा शिलिंग~ush१५२५००० +साढ़े पंद्रह लाख रुपए~₹१५५०००० +साढ़े पाँच हज़ार लीरा~₺५५०० +ढाई सौ यूरो~€२५० +ढाई हजार बुरुंडी फ्रैंक~fbu२५०० +ढाई करोड़ रुपए~₹२५०००००० +ढाई लाख रुपए~₹२५०००० +डेढ़ सौ यूरो~€१५० +डेढ़ हजार रुपए~₹१५०० +डेढ़ करोड़ रुपए~₹१५०००००० +डेढ़ लाख रुपए~₹१५०००० +पौने तीन सौ रुपए~₹२७५ +पौने पंद्रह सौ रुपए~₹१४७५ +पौने तीन हजार रुपए~₹२७५० +पौने पंद्रह हजार यूरो~€१४७५० +पौने पैंतालिस हजार यूरो~€४४७५० +पौने तीन लाख रुपए~₹२७५००० +पौने पंद्रह लाख रुपए~₹१४७५००० +पौने पैंतालिस लाख रुपए~₹४४७५००० +पौने तीन करोड़ रुपए~₹२७५००००० +पौने पंद्रह करोड़ रुपए~₹१४७५००००० +पौने पैंतालिस करोड़ रुपए~₹४४७५००००० diff --git a/tests/data/hi/ordinal.txt b/tests/data/hi/ordinal.txt new file mode 100644 index 0000000..3a65fdf --- /dev/null +++ b/tests/data/hi/ordinal.txt @@ -0,0 +1,13 @@ +एक हज़ार एकवीं~१००१वीं +सौवां~१००वां +एक सौ एकवां~१०१वां +दसवां~१०वां +दसवीं~१०वीं +दसवें~१०वें +एक सौ उन्नीसवां~११९वां +एक सौ उन्नीसवीं~११९वीं +एक सौ उन्नीसवें~११९वें +अट्ठानवे सौ छब्बीसवीं~९८२६वीं +अट्ठानवेवीं~९८वीं +निन्यानवेवां~९९वां +छे सौ चालीसवीं~६४०वीं \ No newline at end of file diff --git a/tests/data/hi/telephone.txt b/tests/data/hi/telephone.txt new file mode 100644 index 0000000..3b84a33 --- /dev/null +++ b/tests/data/hi/telephone.txt @@ -0,0 +1,28 @@ +एक एक एक एक एक एक~११११११ +पाँच शून्य शून्य शून्य एक दो~५०००१२ +एक दो तीन चार पाँच छह~१२३४५६ +चार शून्य शून्य 
शून्य एक शून्य~४०००१० +सात पाँच शून्य शून्य शून्य दो~७५०००२ +आठ आठ शून्य नौ नौ शून्य~८८०९९० +नौ आठ सात छह पाँच चार तीन दो एक शून्य~९८७६५४३२१० +सात शून्य एक दो तीन चार पाँच छह सात आठ~७०१२३४५६७८ +आठ आठ आठ सात सात सात छह छह छह छह~८८८७७७६६६६ +छह दो नौ शून्य एक पाँच सात तीन चार आठ~६२९०१५७३४८ +नौ नौ आठ आठ सात सात छह छह पाँच पाँच~९९८८७७६६५५ +प्लस नौ एक नौ आठ सात छह पाँच चार तीन दो एक शून्य~+९१ ९८७६५४३२१० +प्लस नौ एक सात शून्य एक दो तीन चार पाँच छह सात आठ~+९१ ७०१२३४५६७८ +प्लस नौ एक आठ आठ आठ सात सात सात छह छह छह छह~+९१ ८८८७७७६६६६ +प्लस नौ एक एक एक एक एक एक एक एक एक एक एक~+९१ ११११११११११ +शून्य दो शून्य दो चार तीन सात एक पाँच चार दो~०२०२४३७१५४२ +शून्य एक एक दो छह एक दो तीन चार पाँच छह~०११२६१२३४५६ +चार चार दो दो आठ आठ छह छह चार चार~४४२२८८६६४४ +शून्य आठ शून्य चार एक दो तीन चार पाँच छह सात~०८०४१२३४५६७ +दो दो छह छह पांच चार तीन दो एक शून्य~२२६६५४३२१० +zero one three three six two three four five six seven~०१३३६२३४५६७ +zero one three four two three two one five four eight~०१३४२३२१५४८ +एक दो तीन चार~१२३४ +पाँच शून्य शून्य नौ~५००९ +चार चार चार चार~४४४४ +सात आठ नौ एक~७८९१ +एक शून्य दो शून्य~१०२० +नौ आठ सात छह~९८७६ \ No newline at end of file diff --git a/tests/data/hi/time.txt b/tests/data/hi/time.txt new file mode 100644 index 0000000..8ec5e4d --- /dev/null +++ b/tests/data/hi/time.txt @@ -0,0 +1,25 @@ +एक बजे सात मिनट~१:०७ +दो बजकर ग्यारह मिनट~२:११ +दो बजके इकतालीस मिनट~२:४१ +बारह बजकर चौवन मिनट~१२:५४ +ग्यारह बजे~११:०० +सात बजे~७:०० +चार बजके नौ मिनट~४:०९ +आठ बजकर पैंतालीस मिनट~८:४५ +छः बजके पाँच मिनट~६:०५ +छह बजे~६:०० +बारह पन्द्रह~१२:१५ +दस अठारह~१०:१८ +चार बजे पाँच सेकंड~४:००:०५ +नौ घंटा दो सेकंड~९:००:०२ +सोलह घंटा एक मिनट सत्ताईस सेकंड~१६:०१:२७ +दस बजकर चौवन मिनट आठ सेकंड~१०:५४:०८ +तीन मिनट उन्नीस सेकंड~००:०३:१९ +ढाई बजे~२:३० +डेढ़ बजे~१:३० +डेढ़ घंटा~१:३० +साढ़े पाँच बजे~५:३० +सवा चार बजे~४:१५ +साढ़े ग्यारह~११:३० +पौने पाँच~४:४५ +पौने तीन घंटा~२:४५ diff --git a/tests/data/hi/whitelist.txt b/tests/data/hi/whitelist.txt new file mode 100644 index 0000000..68f4fd7 --- /dev/null 
+++ b/tests/data/hi/whitelist.txt @@ -0,0 +1,8 @@ +मास्टर निखिल तनिष~मा. निखिल तनिष +पाव~१/४ +श्रीमती ज्योत्सना~स्मि. ज्योत्सना +डॉक्टर~डॉ. +आधा कप चाय~१/२ कप चाय +श्रीमान भारत कुमार~श्री. भारत कुमार +डॉक्टर प्रशांत~डॉ. प्रशांत +कुमारी~कु. diff --git a/tests/data/hi/word.txt b/tests/data/hi/word.txt new file mode 100644 index 0000000..ce044e7 --- /dev/null +++ b/tests/data/hi/word.txt @@ -0,0 +1,15 @@ +नींद~नींद +याहू!~याहू! +-~- +आआआ~आआआ +आकाशगंगा~आकाशगंगा +लटरपटर~लटरपटर +कच्चा-पक्का~कच्चा-पक्का +गुब्बारा~गुब्बारा +चिट्ठी~चिट्ठी +ढूंढना~ढूंढना +लोहे का!~लोहे का! +टाटा~टाटा +~ +झ~झ +संगीत~संगीत \ No newline at end of file diff --git a/tests/data/ja/cardinal.txt b/tests/data/ja/cardinal.txt new file mode 100644 index 0000000..c763a3d --- /dev/null +++ b/tests/data/ja/cardinal.txt @@ -0,0 +1,28 @@ +一~1 +百~100 +五千億~500,000,000,000 +五万~50,000 +五兆~5,000,000,000,000 +十一兆~11,000,000,000,000 +十一兆一~11,000,000,000,001 +九十九兆~99,000,000,000,000 +一兆~1,000,000,000,000 +一兆一~1,000,000,000,001 +一兆十~1,000,000,000,010 +一兆百~1,000,000,000,100 +一兆千~1,000,000,001,000 +一兆一万~1,000,000,010,000 +一兆十万~1,000,000,100,000 +一兆百万~1,000,001,000,000 +一兆一千万~1,000,010,000,000 +そこに鳥一羽がいます~そこに鳥1羽がいます +これから百数えてください~これから100数えてください +生産に掛かる費用は五千億になります~生産に掛かる費用は500,000,000,000になります +お年玉五万あげる~お年玉50,000あげる +五兆円分の株式を買った~5,000,000,000,000円分の株式を買った +今年の収益は十一兆になる~今年の収益は11,000,000,000,000になる +隣の会社の年収益は九十九兆だそうだ~隣の会社の年収益は99,000,000,000,000だそうだ +政府は一兆の赤字で困っている~政府は1,000,000,000,000の赤字で困っている +兵士五百人を派遣する~兵士500人を派遣する +お寺に一万寄付した~お寺に10,000寄付した +クラスに二十人いる~クラスに20人いる \ No newline at end of file diff --git a/tests/data/ja/date.txt b/tests/data/ja/date.txt new file mode 100644 index 0000000..c22d263 --- /dev/null +++ b/tests/data/ja/date.txt @@ -0,0 +1,31 @@ +一日~1日 +一月~1月 +一月一日~1月1日 +一月二十二日~1月22日 +七十から八十年代~70〜80年代 +七十年代~70年代 +七月~7月 +七月二十三日~7月23日 +八月四日~8月4日 +五から九日~5〜9日 +九月~9月 +三から四月~3〜4月 +三月一日水曜日~3月1日(水) +四月三十日日曜日~4月30日(日) +三月二十日~3月20日 +三月~3月 +九十年代~90年代 +九月~9月 +九月五日~9月5日 +二十一世紀~21世紀 +二十一日月曜日~21日(月) +今日は一月二十二日~今日は1月22日 
+毎月の三十日はゴミの日~毎月の30日はゴミの日 +七十年代はロックがはやってた~70年代はロックがはやってた +二十一世紀でやることじゃない~21世紀でやることじゃない +正月は一月一日から始まる~正月は1月1日から始まる +五から九日は休みの日~5〜9日は休みの日 +誕生日は千九百九十九年三月二十日~誕生日は1999年3月20日 +八月と九月はまだ夏~8月と9月はまだ夏 +放送日は三月一日水曜日~放送日は3月1日(水) +十月になるとすずしい~10月になるとすずしい diff --git a/tests/data/ja/decimal.txt b/tests/data/ja/decimal.txt new file mode 100644 index 0000000..d5080ac --- /dev/null +++ b/tests/data/ja/decimal.txt @@ -0,0 +1,32 @@ +マイナス一点零六~-1.06 +マイナス七点零零六~-7.006 +マイナス三十九点五七四~-39.574 +マイナス三点八六~-3.86 +マイナス九十二点一五七四~-92.1574 +マイナス九点零三八~-9.038 +マイナス二点八七四一~-2.8741 +マイナス二百三十一点四六零九~-231.4609 +マイナス五十二点一八~-52.18 +マイナス五点三~-5.3 +マイナス五百七十九点三零零二~-579.3002 +マイナス八十六点四~-86.4 +マイナス八点四零九~-8.409 +マイナス八百二十一点七九五四~-821.7954 +マイナス八百五十二点七~-852.7 +マイナス六十一点零七~-61.07 +マイナス六点八一四~-6.814 +マイナス六百五十七点三零二四~-657.3024 +マイナス四十二点六零五~-42.605 +マイナス四百八十九点零五二一~-489.0521 +答えはマイナス一点零六~答えは-1.06 +計算の結果はマイナス七点零零六~計算の結果は-7.006 +マイナス二点八七四はかなり悪いスコア~-2.874はかなり悪いスコア +五点三は平均点~5.3は平均点 +テストの点数は八十六点四~テストの点数は86.4 +マイナス三十九点五七四は低すぎる~-39.574は低すぎる +答えはマイナス一点零六~答えは-1.06 +計算の結果はマイナス八十六点四~計算の結果は-86.4 +マイナス五十二点一八はかなり悪いスコア~-52.18はかなり悪いスコア +六点八一四は平均点~6.814は平均点 +テストの点数は九十二点一五七四~テストの点数は92.1574 +マイナス七点零零六は低すぎる~-7.006は低すぎる diff --git a/tests/data/ja/fraction.txt b/tests/data/ja/fraction.txt new file mode 100644 index 0000000..32f80e8 --- /dev/null +++ b/tests/data/ja/fraction.txt @@ -0,0 +1,34 @@ +マイナス一と四分の三~-1 3/4 +一と四分の三~1 3/4 +マイナス一分の九~-9/1 +マイナス一分の六十~-60/1 +マイナス一分の百二十三~-123/1 +マイナス一荷四分の三~-1 3/4 +マイナス七百二十分の一~-1/720 +マイナス三十二分の三十一~-31/32 +マイナス三百九十七分の四~-4/397 +マイナス三百五十分の一~-1/350 +マイナス九十八分の四百七十一~-471/98 +マイナス二と五分の三~-2 3/5 +マイナス二十分の九~-9/20 +マイナス二十分の二十一~-21/20 +マイナス二十四分の一~-1/24 +マイナス二百二十分の一~-1/220 +マイナス二百五十二分の百四十七~-147/252 +マイナス二百五十六分の一~-1/256 +マイナス二荷五分の三~-2 3/5 +マイナス五分の七~-7/5 +マイナス五分の八~-8/5 +マイナス五分の十四~-14/5 +マイナス五分の百三十二~-132/5 +マイナス八分の五~-5/8 +答えはマイナス八分の五~答えは-5/8 +三分の一の人がその場を離れた~1/3の人がその場を離れた +約二分の一を削る~約1/2を削る +十分の三を削って吟醸をつくる~3/10を削って吟醸をつくる +一人三分の一ぐらい取る~1人1/3ぐらい取る +答えは九分の一~答えは1/9 +三分の二の人がその場を離れた~2/3の人がその場を離れた +約十分の一を削る~約1/10を削る +三分の一を削って吟醸をつくる~1/3を削って吟醸をつくる 
+一人二分の一とぐらい取る~1人1/2とぐらい取る \ No newline at end of file diff --git a/tests/data/ja/ordinal.txt b/tests/data/ja/ordinal.txt new file mode 100644 index 0000000..873f3f9 --- /dev/null +++ b/tests/data/ja/ordinal.txt @@ -0,0 +1,65 @@ +一万一番目~10001番目 +一万番目~10000番目 +一番目~1番目 +七十番目~70番目 +七千番目~7000番目 +七番目~7番目 +七百番目~700番目 +三十番目~30番目 +三千三百三十番目~3330番目 +三千番目~3000番目 +三番目~3番目 +三百番目~300番目 +九十番目~90番目 +九千九百九十九番目~9999番目 +九千番目~9000番目 +九番目~9番目 +九百番目~900番目 +二十番目~20番目 +二千二百番目~2200番目 +二千番目~2000番目 +二番目~2番目 +二百番目~200番目 +五十番目~50番目 +五千番目~5000番目 +五番目~5番目 +五百番目~500番目 +八十番目~80番目 +八千番目~8000番目 +第一~第1 +第一万~第10000 +第一万一~第10001 +第一万九千~第19000 +第一万九千八百~第19800 +第七~第7 +第七万~第70000 +第七万二千六~第72006 +第七十~第70 +第七十二~第72 +第七千~第7000 +第七千八十九~第7089 +第七百~第700 +第七百三十~第730 +第七百三十五~第735 +第七百九~第709 +第三~第3 +第三万四~第30004 +第三十~第30 +第三千~第3000 +第三千三百二十二~第3322 +第三千十七~第3017 +第三千四百一~第3401 +第三百~第300 +第九~第9 +第九万~第90000 +第九十~第90 +五番目私の席~5番目私の席 +第七班に任務を任せる~第7班に任務を任せる +この角からまっすぐ行って三番目の交差点で曲がる~この角からまっすぐ行って3番目の交差点で曲がる +田中君は二番目の席~田中君は2番目の席 +トップから数えて第七十二~トップから数えて第72 +八番目私の席~8番目私の席 +第十三班に任務を任せる~第13班に任務を任せる +この角からまっすぐ行って二番目の交差点で曲がる~この角からまっすぐ行って2番目の交差点で曲がる +田中君は五番目の席~田中君は5番目の席 +トップから数えて第89~トップから数えて第89 diff --git a/tests/data/ja/time.txt b/tests/data/ja/time.txt new file mode 100644 index 0000000..6a50821 --- /dev/null +++ b/tests/data/ja/time.txt @@ -0,0 +1,40 @@ +七時一分~7時1分 +七時四分~7時4分 +九時五十八分~9時58分 +九時十分前~9時10分前 +九時四十分~9時40分 +五時二十六分~5時26分 +六時五十五分~6時55分 +三時~3時 +三時~3時 +正午一分前~正午1分前 +正午十分過ぎ~正午10分過ぎ +九時三十分~9時30分 +七時五十分頃~7時50分頃 +一時~1時 +一時十分~1時10分 +三時~3時 +十七時~17時 +二十時~20時 +二十一時~21時 +二時~2時 +十二時三十分~12時30分 +零時~0時 +零時一分前~0時1分前 +二時~2時 +十二時~12時 +二十時~20時 +二十三時~23時 +二十四時~24時 +零時~0時 +四時~4時 +毎日五時に起きる~毎日5時に起きる +九時四十分の予約になります~9時40分の予約になります +現在の時間は十二時三十分~現在の時間は12時30分 +ちょうど零時になった~ちょうど0時になった +四時で店を閉める~4時で店を閉める +毎日六時に起きる~毎日6時に起きる +十時三十分の予約になります~10時30分の予約になります +現在の時間は十時三分~現在の時間は10時3分 +ちょうど一時になった~ちょうど1時になった +七時で店を閉める~7時で店を閉める diff --git a/tests/data/zh/cardinal.txt b/tests/data/zh/cardinal.txt new file mode 100644 index 0000000..02d3dcb --- /dev/null +++ 
b/tests/data/zh/cardinal.txt @@ -0,0 +1,130 @@ +一百~100 +一百零一~101 +一百一十一~111 +两百~200 +九百~900 +九百五十~950 +九百五十一~951 +一千~1,000 +一千零一~1,001 +一千一百~1,100 +一千一百零一~1,101 +一千零五十~1,050 +一千一百一十~1,110 +一千一百十~1,110 +一千一百一十一~1,111 +两千~2,000 +九千九百九十九~9,999 +一万一千~11,000 +一万一千一百~11,100 +一万一千一百一十~11,110 +一万一千一百一十一~11,111 +一万零一百~10,100 +一万零一百五十~10,150 +一万零一百五十一~10,151 +一万零一~10,001 +一万零五十~10,050 +一万零五十一~10,051 +一万~1万 +两万~2万 +三万~3万 +四万~4万 +五万~5万 +六万~6万 +七万~7万 +八万~8万 +九万~9万 +十万~10万 +十萬~10萬 +九十万~90万 +九十一万~91万 +九十万五千八百二十五~905,825 +九十一万五千八百二十五~915,825 +十一万~11万 +十万一千一百一十一~101,111 +十万一千一百~101,100 +十万一千~101,000 +十万零一百~100,100 +十万零十~100,010 +十万零一~100,001 +一百万~100万 +一百一十万~110万 +一百一十一万~111万 +两百万~200万 +两百一十万~210万 +两百零一万~201万 +一百一十九万~119万 +一百一十九万九千~1,199,000 +一百一十九万九千九百~1,199,900 +一百一十九万九千九百九十~1,199,990 +一百一十九万九千九百九十九~1,199,999 +一百一十九万零九~1,190,009 +一百一十九万零九百九十一~1,190,991 +一千万~1,000万 +一千一百万~1,100万 +一千一百一十万~1,110万 +一千一百一十一万~1,111万 +一千一百一十一万九千~11,119,000 +一千一百一十一万九千一百~11,119,100 +一千一百一十一万九千一百二十~11,119,120 +一千一百一十一万九千一百二十一~11,119,121 +一千一百一十一万零一~11,110,001 +一千一百一十一万零一十~11,110,010 +一千一百一十一万零一百~11,110,100 +一千零一十万零一百~10,100,100 +一千零一十一万零一百~10,110,100 +一千零一万零一百~10,010,100 +一億~1億 +一億一千萬~110,000,000 +一億一千一百萬~111,000,000 +一億一千一百一十萬~111,100,000 +一億一千一百一十一萬~111,110,000 +一億零一百萬~101,000,000 +一億零一百一十萬~101,100,000 +一億零一百一十一萬~101,110,000 +一億零一十萬~100,100,000 +一億零一十一萬~100,110,000 +一億零一萬~100,010,000 +一億零一萬一千~100,011,000 +一億零一萬一千一百~100,011,100 +一億零一萬一千一百一~100,011,101 +一億零一萬一千一百一十一~100,011,111 +一億零一萬一千一百零五~100,011,105 +一億零一萬一千零五~100,011,005 +十億~10億 +十一億~11億 +十一億九千萬~1,190,000,000 +十一億九千一百萬~1,191,000,000 +十一億九千一百一十萬~1,191,100,000 +十一億九千一百一十一萬~1,191,110,000 +十一億零一百一十萬~1,101,100,000 +十一億零一十萬~1,100,100,000 +十一億零一萬~1,100,010,000 +十一億零十萬~1,100,100,000 +十一億零九千~1,100,009,000 +十一億零九百~1,100,000,900 +十一億零九十~1,100,000,090 +十一億零九~1,100,000,009 +一百億~100億 +一百一十億~110億 +一百一十一億~111億 +一百一十一億九千萬~11,190,000,000 +一百一十一億九千九百萬~11,199,000,000 +一百一十一億九千九百一十萬~11,199,100,000 +一百一十一億九千九百一十一萬~11,199,110,000 +一百一十一億九千九百一十一萬九千~11,199,119,000 
+一百一十一億九千九百一十一萬九千九百一十一~11,199,119,911 +一百零一億~101億 +一百零一億零九百萬~10,109,000,000 +一百零一億零九十萬~10,100,900,000 +一百零一億零九萬~10,100,090,000 +一百零一億零九萬零一百~10,100,090,100 +一千億~1,000億 +一千一百億~1,100億 +一千零五十億~1,050億 +一千零五億~1,005億 +一千億九千萬~100,090,000,000 +一千億零九百萬~100,009,000,000 +一千億零九十萬~100,000,900,000 +一千億零九萬~100,000,090,000 +一千億零九十萬零五百~100,000,900,500 \ No newline at end of file diff --git a/tests/data/zh/date.txt b/tests/data/zh/date.txt new file mode 100644 index 0000000..5404b8e --- /dev/null +++ b/tests/data/zh/date.txt @@ -0,0 +1,31 @@ +一七九八年五月三十日~1798年5月30日 +五月三十日~5月30日 +一七九八年五月~1798年5月 +八月~8月 +一七九八年~1798年 +十九日~19日 +一九九四年一月二日~1994年1月2日 +一九九五年二月三日~1995年2月3日 +二零零零年三月五日~2000年3月5日 +二零零一年四月六日~2001年4月6日 +公元一七九八年五月三十日~公元1798年5月30日 +公元一八三五年~公元1835年 +公元一八三四年八月~公元1834年8月 +公元一九九四年一月二日~公元1994年1月2日 +公元一九九五年二月三日~公元1995年2月3日 +公元二零零零年三月五日~公元2000年3月5日 +公元二零零一年四月六日~公元2001年4月6日 +公元前一七九八年~公元前1798年 +公元前二八零九年~公元前2809年 +公元前一九九四年一月二日~公元前1994年1月2日 +公元前一九九五年二月三日~公元前1995年2月3日 +公元前二零零零年三月五日~公元前2000年3月5日 +公元前二零零一年四月六日~公元前2001年4月6日 +纪元前一九三四年一月二日~公元前1934年1月2日 +纪元前一九九八年三月三日~公元前1998年3月3日 +纪元前二零零零年三月五日~公元前2000年3月5日 +纪元前二零零一年四月六日~公元前2001年4月6日 +纪元一二三四年一月二日~公元1234年1月2日 +纪元二零五六年二月三日~公元2056年2月3日 +纪元二零零零年三月五日~公元2000年3月5日 +纪元二零零一年四月六日~公元2001年4月6日 \ No newline at end of file diff --git a/tests/data/zh/decimal.txt b/tests/data/zh/decimal.txt new file mode 100644 index 0000000..a73dc30 --- /dev/null +++ b/tests/data/zh/decimal.txt @@ -0,0 +1,42 @@ +一点零~1.0 +十五点零~15.0 +一百点零~100.0 +一百零一点五~101.5 +一点零五六~1.056 +一点零零五六~1.0056 +一点零零零五六~1.00056 +两百点一~200.1 +三千点五~3,000.5 +四万点六~40,000.6 +一點零零五~1.005 +九十九點零零零五~99.0005 +一百點五七三五~100.5735 +一千五百点零一~1,500.01 +负五万点二四五~-50,000.245 +负十五万点三七九~-150,000.379 +负一点一~-1.1 +负十点五~-10.5 +負十點五~-10.5 +負九十九點九五~-99.95 +負一百五十點一二~-150.12 +負一千五百零九點五一~-1,509.51 +負五萬點三~-50,000.3 +負五點零一~-5.01 +負十點零零一~-10.001 +負十點零零零三~-10.0003 +負一百點零零零零四~-100.00004 +一点一二三四五六七八九~1.123456789 +负五点一零二~-5.102 +负三点一二零三~-3.1203 +负十点一二三零五~-10.12305 +伍拾壹点肆~51.4 +壹佰点叁肆~100.34 +贰拾点伍陆~20.56 +柒拾捌点玖~78.9 +负叁拾壹点肆~-31.4 +负壹佰点叁肆~-100.34 
+负贰拾点伍陆~-20.56 +负柒拾点玖~-70.9 +負贰拾点叁肆~-20.34 +負玖点玖~-9.9 +負壹佰贰拾点叁肆~-120.34 \ No newline at end of file diff --git a/tests/data/zh/fraction.txt b/tests/data/zh/fraction.txt new file mode 100644 index 0000000..473f1df --- /dev/null +++ b/tests/data/zh/fraction.txt @@ -0,0 +1,20 @@ +五分之一~1/5 +二分之一~1/2 +三分之一~1/3 +十分之一~1/10 +一百分之一~1/100 +一千分之一~1/1000 +五分之二~2/5 +三分之二~2/3 +十分之五~5/10 +一千分之五~5/1000 +三又五分之一~3又1/5 +一又二分之一~1又1/2 +一又三分之一~1又1/3 +三又十分之一~3又1/10 +五十又一百分之一~50又1/100 +三又一千分之五~3又5/1000 +六又十分之五~6又5/10 +八又七分之五~8又5/7 +九又四分之三~9又3/4 +五分之四~4/5 diff --git a/tests/data/zh/money.txt b/tests/data/zh/money.txt new file mode 100644 index 0000000..2504e7d --- /dev/null +++ b/tests/data/zh/money.txt @@ -0,0 +1,49 @@ +一千美元~US$1000 +五千美元~US$5000 +一万美元~US$1万 +一点五万美元~US$1.5万 +五十万美元~US$50万 +一百万美元~US$100万 +一千万美元~US$1000万 +一千元~¥1000 +五千元~¥5000 +一万元~¥1万 +一千五万元~¥1005万 +五十万元~¥50万 +一百万元~¥100万 +一千万元~¥1000万 +一千欧元~€1000 +五千欧元~€5000 +一万欧元~€1万 +一点五万欧元~€1.5万 +五十万欧元~€50万 +一百万欧元~€100万 +一千万欧元~€1000万 +一千英镑~£1000 +五千英镑~£5000 +一万英镑~£1万 +一点五万英镑~£1.5万 +五十万英镑~£50万 +一百万英镑~£100万 +一千万英镑~£1000万 +一千韩元~₩1000 +五千韩元~₩5000 +一万韩元~₩1万 +一点五万韩元~₩1.5万 +五十万韩元~₩50万 +一百万韩元~₩100万 +一千万韩元~₩1000万 +一千印度卢布~₹1000 +五千印度卢布~₹5000 +一万印度卢布~₹1万 +一点五万印度卢布~₹1.5万 +五十万印度卢布~₹50万 +一百万印度卢布~₹100万 +一千万印度卢布~₹1000万 +一千日元~JPY¥1000 +五千日元~JPY¥5000 +一万日元~JPY¥1万 +一点五万日元~JPY¥1.5万 +五十万日元~JPY¥50万 +一百万日元~JPY¥100万 +一千万日元~JPY¥1000万 diff --git a/tests/data/zh/ordinal.txt b/tests/data/zh/ordinal.txt new file mode 100644 index 0000000..828ec62 --- /dev/null +++ b/tests/data/zh/ordinal.txt @@ -0,0 +1,57 @@ +第一百~第100 +第五百~第500 +第兩萬一千一百一十一~第21111 +第一百~第100 +第二百~第200 +第兩千~第2000 +第两万~第2万 +第十万~第10万 +第一百万~第100万 +第一千万~第1000万 +第一亿~第1亿 +第一百零一~第101 +第十亿~第10亿 +第五十万~第50万 +第一百一十一~第111 +第十万一千一百一十一~第101111 +第十万一千一百~第101100 +第十万一千~第101000 +第十万零一百~第100100 +第十万零十~第100010 +第十万零一~第100001 +第一百万~第100万 +第一百一十万~第110万 +第一百一十一万~第111万 +第两百万~第200万 +第两百一十万~第210万 +第两百零一万~第201万 +第一百一十九万~第119万 +第一百一十九万九千~第1199000 +第一百一十九万九千九百~第1199900 +第一百一十九万九千九百九十~第1199990 +第一百一十九万九千九百九十九~第1199999 
+第一百一十九万零九~第1190009 +第一百一十九万零九十~第1190090 +第一百一十九万零九十一~第1190091 +第一百一十九万零九百九十一~第1190991 +第一千万~第1000万 +第一千一百万~第1100万 +第一千一百一十万~第1110万 +第一千一百一十一万~第1111万 +第一千一百一十一万九千~第11119000 +第一千一百一十一万九千一百~第11119100 +第一千一百一十一万九千一百二十~第11119120 +第一千一百一十一万九千一百二十一~第11119121 +第一千一百一十一万零一~第11110001 +第一千一百一十一万零一十~第11110010 +第一千一百一十一万零一百~第11110100 +第一千零一十万零一百~第10100100 +第一千零一十一万零一百~第10110100 +第一千零一万零一百~第10010100 +第一億~第1億 +第一億一千萬~第110000000 +第一億一千一百萬~第111000000 +第一億一千一百一十萬~第111100000 +第一億一千一百一十一萬~第111110000 +第一億零一百萬~第101000000 +第一億零一百一十萬~第101100000 \ No newline at end of file diff --git a/tests/data/zh/time.txt b/tests/data/zh/time.txt new file mode 100644 index 0000000..01b2a5d --- /dev/null +++ b/tests/data/zh/time.txt @@ -0,0 +1,23 @@ +五点五分~05:05 +五点一刻~5点1刻 +两点二刻~2点2刻 +三点三刻~3点3刻 +六点~6点 +五点五分~05:05 +五点半~5点半 +五点一刻~5点1刻 +两点三刻~2点3刻 +三点三刻~3点3刻 +五点五分~05:05 +两点一刻~2点1刻 +三点二刻~3点2刻 +四点~4点 +一点五分十秒~01:05:10 +十三点五分十秒~13:05:10 +十点~10点 +五分钟~5分钟 +五秒钟~5秒钟 +十三点五分~13:05 +十三点零五分~13:05 +五点二十五分~05:25 +十一点三十四分~11:34 \ No newline at end of file diff --git a/tests/data/zh/whitelist.txt b/tests/data/zh/whitelist.txt new file mode 100644 index 0000000..f36dc42 --- /dev/null +++ b/tests/data/zh/whitelist.txt @@ -0,0 +1,21 @@ +人力资源~HR +自动取款机~ATM +人力资源~HR +首席执行官~CEO +美国研究生入学考试~GRE +研究生管理专业入学考试~GMAT +全球定位系统~GPS +刷卡机~POS机 +数位多功能光碟~DVD +镭射唱片~CD +通用串行总线~USB +统一资源定位符~URL +虚拟专用网络~VPN +网络互联协议~IP +脱氧核糖核酸~DNA +核糖核酸~RNA +平均学分绩点~GPA +发光二极管~LED +可移植文档格式~PDF +社会性网络服务~SNS +博士~PhD diff --git a/tests/data/zh/word.txt b/tests/data/zh/word.txt new file mode 100644 index 0000000..1d0cac2 --- /dev/null +++ b/tests/data/zh/word.txt @@ -0,0 +1,21 @@ +你好~你好 +年级~年级 +秘密~秘密 +键盘~键盘 +借口~借口 +学生~学生 +人力~人力 +转移~转移 +徘徊~徘徊 +冤枉~冤枉 +浏览~浏览 +珍藏~珍藏 +患难 ~患难 +湿~湿 +眼眶~眼眶 +遗产~遗产 +流浪~流浪 +信仰~信仰 +戒指~戒指 +义无反顾~义无反顾 +交换~交换 diff --git a/tests/de_tests.rs b/tests/de_tests.rs new file mode 100644 index 0000000..beaca01 --- /dev/null +++ b/tests/de_tests.rs @@ -0,0 +1,166 @@ +//! German inverse text normalization tests. +//! +//! 
Test cases sourced from NVIDIA NeMo text processing: +//! https://github.com/NVIDIA/NeMo-text-processing + +mod common; + +use std::path::Path; +use text_processing_rs::normalize_with_lang; + +fn normalize_de(input: &str) -> String { + normalize_with_lang(input, "de") +} + +fn print_failures(results: &common::TestResults) { + for f in &results.failures { + println!( + " FAIL: '{}' => '{}' (expected '{}')", + f.input, f.got, f.expected + ); + } +} + +#[test] +fn test_cardinal() { + let results = common::run_test_file(Path::new("tests/data/de/cardinal.txt"), normalize_de); + println!( + "cardinal: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_ordinal() { + let results = common::run_test_file(Path::new("tests/data/de/ordinal.txt"), normalize_de); + println!( + "ordinal: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_decimal() { + let results = common::run_test_file(Path::new("tests/data/de/decimal.txt"), normalize_de); + println!( + "decimal: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_money() { + let results = common::run_test_file(Path::new("tests/data/de/money.txt"), normalize_de); + println!( + "money: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_date() { + let results = common::run_test_file(Path::new("tests/data/de/date.txt"), normalize_de); + println!( + "date: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_time() { + let results = common::run_test_file(Path::new("tests/data/de/time.txt"), normalize_de); + println!( + "time: {}/{} passed ({} failures)", + results.passed, + 
results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_measure() { + let results = common::run_test_file(Path::new("tests/data/de/measure.txt"), normalize_de); + println!( + "measure: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_electronic() { + let results = common::run_test_file(Path::new("tests/data/de/electronic.txt"), normalize_de); + println!( + "electronic: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_telephone() { + let results = common::run_test_file(Path::new("tests/data/de/telephone.txt"), normalize_de); + println!( + "telephone: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_whitelist() { + let results = common::run_test_file(Path::new("tests/data/de/whitelist.txt"), normalize_de); + println!( + "whitelist: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_word() { + let results = common::run_test_file(Path::new("tests/data/de/word.txt"), normalize_de); + println!( + "word: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_fraction() { + let results = common::run_test_file(Path::new("tests/data/de/fraction.txt"), normalize_de); + println!( + "fraction: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} diff --git a/tests/es_tests.rs b/tests/es_tests.rs new file mode 100644 index 0000000..2f86228 --- /dev/null +++ b/tests/es_tests.rs @@ -0,0 +1,166 @@ +//! Spanish inverse text normalization tests. +//! +//! 
Test cases sourced from NVIDIA NeMo text processing: +//! https://github.com/NVIDIA/NeMo-text-processing + +mod common; + +use std::path::Path; +use text_processing_rs::normalize_with_lang; + +fn normalize_es(input: &str) -> String { + normalize_with_lang(input, "es") +} + +fn print_failures(results: &common::TestResults) { + for f in &results.failures { + println!( + " FAIL: '{}' => '{}' (expected '{}')", + f.input, f.got, f.expected + ); + } +} + +#[test] +fn test_cardinal() { + let results = common::run_test_file(Path::new("tests/data/es/cardinal.txt"), normalize_es); + println!( + "cardinal: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_ordinal() { + let results = common::run_test_file(Path::new("tests/data/es/ordinal.txt"), normalize_es); + println!( + "ordinal: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_decimal() { + let results = common::run_test_file(Path::new("tests/data/es/decimal.txt"), normalize_es); + println!( + "decimal: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_money() { + let results = common::run_test_file(Path::new("tests/data/es/money.txt"), normalize_es); + println!( + "money: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_date() { + let results = common::run_test_file(Path::new("tests/data/es/date.txt"), normalize_es); + println!( + "date: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_time() { + let results = common::run_test_file(Path::new("tests/data/es/time.txt"), normalize_es); + println!( + "time: {}/{} passed ({} failures)", + results.passed, + 
results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_measure() { + let results = common::run_test_file(Path::new("tests/data/es/measure.txt"), normalize_es); + println!( + "measure: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_electronic() { + let results = common::run_test_file(Path::new("tests/data/es/electronic.txt"), normalize_es); + println!( + "electronic: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_telephone() { + let results = common::run_test_file(Path::new("tests/data/es/telephone.txt"), normalize_es); + println!( + "telephone: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_whitelist() { + let results = common::run_test_file(Path::new("tests/data/es/whitelist.txt"), normalize_es); + println!( + "whitelist: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_word() { + let results = common::run_test_file(Path::new("tests/data/es/word.txt"), normalize_es); + println!( + "word: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_fraction() { + let results = common::run_test_file(Path::new("tests/data/es/fraction.txt"), normalize_es); + println!( + "fraction: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} diff --git a/tests/fr_tests.rs b/tests/fr_tests.rs new file mode 100644 index 0000000..0037190 --- /dev/null +++ b/tests/fr_tests.rs @@ -0,0 +1,154 @@ +//! French inverse text normalization tests. +//! +//! Test cases sourced from NVIDIA NeMo text processing: +//! 
https://github.com/NVIDIA/NeMo-text-processing + +mod common; + +use std::path::Path; +use text_processing_rs::normalize_with_lang; + +fn normalize_fr(input: &str) -> String { + normalize_with_lang(input, "fr") +} + +fn print_failures(results: &common::TestResults) { + for f in &results.failures { + println!( + " FAIL: '{}' => '{}' (expected '{}')", + f.input, f.got, f.expected + ); + } +} + +#[test] +fn test_cardinal() { + let results = common::run_test_file(Path::new("tests/data/fr/cardinal.txt"), normalize_fr); + println!( + "cardinal: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_money() { + let results = common::run_test_file(Path::new("tests/data/fr/money.txt"), normalize_fr); + println!( + "money: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_ordinal() { + let results = common::run_test_file(Path::new("tests/data/fr/ordinal.txt"), normalize_fr); + println!( + "ordinal: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_time() { + let results = common::run_test_file(Path::new("tests/data/fr/time.txt"), normalize_fr); + println!( + "time: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_date() { + let results = common::run_test_file(Path::new("tests/data/fr/date.txt"), normalize_fr); + println!( + "date: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_decimal() { + let results = common::run_test_file(Path::new("tests/data/fr/decimal.txt"), normalize_fr); + println!( + "decimal: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + 
print_failures(&results); +} + +#[test] +fn test_measure() { + let results = common::run_test_file(Path::new("tests/data/fr/measure.txt"), normalize_fr); + println!( + "measure: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_telephone() { + let results = common::run_test_file(Path::new("tests/data/fr/telephone.txt"), normalize_fr); + println!( + "telephone: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_electronic() { + let results = common::run_test_file(Path::new("tests/data/fr/electronic.txt"), normalize_fr); + println!( + "electronic: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_whitelist() { + let results = common::run_test_file(Path::new("tests/data/fr/whitelist.txt"), normalize_fr); + println!( + "whitelist: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_word() { + let results = common::run_test_file(Path::new("tests/data/fr/word.txt"), normalize_fr); + println!( + "word: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} diff --git a/tests/hi_tests.rs b/tests/hi_tests.rs new file mode 100644 index 0000000..001932b --- /dev/null +++ b/tests/hi_tests.rs @@ -0,0 +1,140 @@ +//! Hindi inverse text normalization tests. +//! +//! Test cases sourced from NVIDIA NeMo text processing: +//! https://github.com/NVIDIA/NeMo-text-processing + +mod common; + +use std::path::Path; +use text_processing_rs::normalize_with_lang; + +/// Decompose precomposed Devanagari nukta characters for consistent comparison. 
+/// Both input normalization (in lib.rs) and expected output may use different +/// Unicode representations of the same character. +fn decompose_nukta(input: &str) -> String { + let mut out = String::with_capacity(input.len() + 16); + for c in input.chars() { + match c { + '\u{0958}' => { + out.push('\u{0915}'); + out.push('\u{093C}'); + } + '\u{0959}' => { + out.push('\u{0916}'); + out.push('\u{093C}'); + } + '\u{095A}' => { + out.push('\u{0917}'); + out.push('\u{093C}'); + } + '\u{095B}' => { + out.push('\u{091C}'); + out.push('\u{093C}'); + } + '\u{095C}' => { + out.push('\u{0921}'); + out.push('\u{093C}'); + } + '\u{095D}' => { + out.push('\u{0922}'); + out.push('\u{093C}'); + } + '\u{095E}' => { + out.push('\u{092B}'); + out.push('\u{093C}'); + } + '\u{095F}' => { + out.push('\u{092F}'); + out.push('\u{093C}'); + } + _ => out.push(c), + } + } + out +} + +fn normalize_hi(input: &str) -> String { + normalize_with_lang(input, "hi") +} + +/// Compare with nukta normalization on both sides. 
+fn nukta_eq(got: &str, expected: &str) -> bool { + decompose_nukta(got) == decompose_nukta(expected) +} + +fn run_hi_test(name: &str, file: &str) { + let results = common::run_test_file_with_compare(Path::new(file), normalize_hi, nukta_eq); + println!( + "{}: {}/{} passed ({} failures)", + name, + results.passed, + results.total, + results.failures.len() + ); + for f in &results.failures { + println!( + " FAIL: '{}' => '{}' (expected '{}')", + f.input, f.got, f.expected + ); + } +} + +#[test] +fn test_cardinal() { + run_hi_test("cardinal", "tests/data/hi/cardinal.txt"); +} + +#[test] +fn test_ordinal() { + run_hi_test("ordinal", "tests/data/hi/ordinal.txt"); +} + +#[test] +fn test_decimal() { + run_hi_test("decimal", "tests/data/hi/decimal.txt"); +} + +#[test] +fn test_date() { + run_hi_test("date", "tests/data/hi/date.txt"); +} + +#[test] +fn test_time() { + run_hi_test("time", "tests/data/hi/time.txt"); +} + +#[test] +fn test_fraction() { + run_hi_test("fraction", "tests/data/hi/fraction.txt"); +} + +#[test] +fn test_money() { + run_hi_test("money", "tests/data/hi/money.txt"); +} + +#[test] +fn test_measure() { + run_hi_test("measure", "tests/data/hi/measure.txt"); +} + +#[test] +fn test_whitelist() { + run_hi_test("whitelist", "tests/data/hi/whitelist.txt"); +} + +#[test] +fn test_word() { + run_hi_test("word", "tests/data/hi/word.txt"); +} + +#[test] +fn test_address() { + run_hi_test("address", "tests/data/hi/address.txt"); +} + +#[test] +fn test_telephone() { + run_hi_test("telephone", "tests/data/hi/telephone.txt"); +} diff --git a/tests/ja_tests.rs b/tests/ja_tests.rs new file mode 100644 index 0000000..3339a98 --- /dev/null +++ b/tests/ja_tests.rs @@ -0,0 +1,94 @@ +//! Japanese inverse text normalization tests. +//! +//! Test cases sourced from NVIDIA NeMo text processing: +//! 
https://github.com/NVIDIA/NeMo-text-processing + +mod common; + +use std::path::Path; +use text_processing_rs::normalize_with_lang; + +fn normalize_ja(input: &str) -> String { + normalize_with_lang(input, "ja") +} + +fn print_failures(results: &common::TestResults) { + for f in &results.failures { + println!( + " FAIL: '{}' => '{}' (expected '{}')", + f.input, f.got, f.expected + ); + } +} + +#[test] +fn test_cardinal() { + let results = common::run_test_file(Path::new("tests/data/ja/cardinal.txt"), normalize_ja); + println!( + "cardinal: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_ordinal() { + let results = common::run_test_file(Path::new("tests/data/ja/ordinal.txt"), normalize_ja); + println!( + "ordinal: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_decimal() { + let results = common::run_test_file(Path::new("tests/data/ja/decimal.txt"), normalize_ja); + println!( + "decimal: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_date() { + let results = common::run_test_file(Path::new("tests/data/ja/date.txt"), normalize_ja); + println!( + "date: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_time() { + let results = common::run_test_file(Path::new("tests/data/ja/time.txt"), normalize_ja); + println!( + "time: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_fraction() { + let results = common::run_test_file(Path::new("tests/data/ja/fraction.txt"), normalize_ja); + println!( + "fraction: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + 
print_failures(&results); +} diff --git a/tests/zh_tests.rs b/tests/zh_tests.rs new file mode 100644 index 0000000..068228d --- /dev/null +++ b/tests/zh_tests.rs @@ -0,0 +1,130 @@ +//! Chinese inverse text normalization tests. +//! +//! Test cases sourced from NVIDIA NeMo text processing: +//! https://github.com/NVIDIA/NeMo-text-processing + +mod common; + +use std::path::Path; +use text_processing_rs::normalize_with_lang; + +fn normalize_zh(input: &str) -> String { + normalize_with_lang(input, "zh") +} + +fn print_failures(results: &common::TestResults) { + for f in &results.failures { + println!( + " FAIL: '{}' => '{}' (expected '{}')", + f.input, f.got, f.expected + ); + } +} + +#[test] +fn test_cardinal() { + let results = common::run_test_file(Path::new("tests/data/zh/cardinal.txt"), normalize_zh); + println!( + "cardinal: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_ordinal() { + let results = common::run_test_file(Path::new("tests/data/zh/ordinal.txt"), normalize_zh); + println!( + "ordinal: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_decimal() { + let results = common::run_test_file(Path::new("tests/data/zh/decimal.txt"), normalize_zh); + println!( + "decimal: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_date() { + let results = common::run_test_file(Path::new("tests/data/zh/date.txt"), normalize_zh); + println!( + "date: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_time() { + let results = common::run_test_file(Path::new("tests/data/zh/time.txt"), normalize_zh); + println!( + "time: {}/{} passed ({} failures)", + results.passed, + results.total, + 
results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_fraction() { + let results = common::run_test_file(Path::new("tests/data/zh/fraction.txt"), normalize_zh); + println!( + "fraction: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_money() { + let results = common::run_test_file(Path::new("tests/data/zh/money.txt"), normalize_zh); + println!( + "money: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_whitelist() { + let results = common::run_test_file(Path::new("tests/data/zh/whitelist.txt"), normalize_zh); + println!( + "whitelist: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +} + +#[test] +fn test_word() { + let results = common::run_test_file(Path::new("tests/data/zh/word.txt"), normalize_zh); + println!( + "word: {}/{} passed ({} failures)", + results.passed, + results.total, + results.failures.len() + ); + print_failures(&results); +}