Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 24 additions & 20 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ fn tn_normalize_for_lang(input: &str, lang: &str) -> String {
let input = input.trim();

match lang {
"en" => tn_normalize(input),
"fr" => tn_normalize_lang_fr(input),
"es" => tn_normalize_lang_es(input),
"de" => tn_normalize_lang_de(input),
Expand Down Expand Up @@ -381,6 +382,9 @@ fn tn_parse_span_lang(span: &str, lang: &str) -> Option<(String, u8)> {
}

match lang {
"en" => {
try_lang_taggers!(tts::en);
}
"fr" => {
try_lang_taggers!(tts::fr);
}
Expand Down Expand Up @@ -569,34 +573,34 @@ pub fn normalize_sentence_with_max_span(input: &str, max_span_tokens: usize) ->
pub fn tn_normalize(input: &str) -> String {
let input = input.trim();

if let Some(result) = tts::whitelist::parse(input) {
if let Some(result) = tts::en::whitelist::parse(input) {
return result;
}
if let Some(result) = tts::money::parse(input) {
if let Some(result) = tts::en::money::parse(input) {
return result;
}
if let Some(result) = tts::measure::parse(input) {
if let Some(result) = tts::en::measure::parse(input) {
return result;
}
if let Some(result) = tts::date::parse(input) {
if let Some(result) = tts::en::date::parse(input) {
return result;
}
if let Some(result) = tts::time::parse(input) {
if let Some(result) = tts::en::time::parse(input) {
return result;
}
if let Some(result) = tts::electronic::parse(input) {
if let Some(result) = tts::en::electronic::parse(input) {
return result;
}
if let Some(result) = tts::telephone::parse(input) {
if let Some(result) = tts::en::telephone::parse(input) {
return result;
}
if let Some(result) = tts::ordinal::parse(input) {
if let Some(result) = tts::en::ordinal::parse(input) {
return result;
}
if let Some(result) = tts::decimal::parse(input) {
if let Some(result) = tts::en::decimal::parse(input) {
return result;
}
if let Some(result) = tts::cardinal::parse(input) {
if let Some(result) = tts::en::cardinal::parse(input) {
return result;
}

Expand All @@ -611,34 +615,34 @@ fn tn_parse_span(span: &str) -> Option<(String, u8)> {
return None;
}

if let Some(result) = tts::whitelist::parse(span) {
if let Some(result) = tts::en::whitelist::parse(span) {
return Some((result, 100));
}
if let Some(result) = tts::money::parse(span) {
if let Some(result) = tts::en::money::parse(span) {
return Some((result, 95));
}
if let Some(result) = tts::measure::parse(span) {
if let Some(result) = tts::en::measure::parse(span) {
return Some((result, 90));
}
if let Some(result) = tts::date::parse(span) {
if let Some(result) = tts::en::date::parse(span) {
return Some((result, 88));
}
if let Some(result) = tts::time::parse(span) {
if let Some(result) = tts::en::time::parse(span) {
return Some((result, 85));
}
if let Some(result) = tts::electronic::parse(span) {
if let Some(result) = tts::en::electronic::parse(span) {
return Some((result, 82));
}
if let Some(result) = tts::telephone::parse(span) {
if let Some(result) = tts::en::telephone::parse(span) {
return Some((result, 78));
}
if let Some(result) = tts::ordinal::parse(span) {
if let Some(result) = tts::en::ordinal::parse(span) {
return Some((result, 75));
}
if let Some(result) = tts::decimal::parse(span) {
if let Some(result) = tts::en::decimal::parse(span) {
return Some((result, 73));
}
if let Some(result) = tts::cardinal::parse(span) {
if let Some(result) = tts::en::cardinal::parse(span) {
return Some((result, 70));
}

Expand Down
21 changes: 8 additions & 13 deletions src/tts/de/decimal.rs

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Stale doc comment claims period-separated decimals are supported after removing that functionality

The module-level doc comment at src/tts/de/decimal.rs:6 still says "3.14" → "drei komma eins vier", but this PR changed the parse function (lines 33-37) to reject any input that does not contain a comma. Period-only decimals like "3.14" now return None. The corresponding test test_period_decimal was correctly removed, but the doc comment was not updated to match.

(Refers to line 6)

Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.

Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@
//! Converts written decimal numbers to spoken German:
//! - "3,14" → "drei komma eins vier"
//! - "0,5" → "null komma fuenf"
//! - "3.14" → "drei komma eins vier"
//! - "-3,14" → "minus drei komma eins vier"
//!
//! German uses comma (,) as the decimal separator.
//! Period (.) is used as thousands separator in cardinal numbers.

use super::{number_to_words, spell_digits};

Expand All @@ -30,14 +33,11 @@ pub fn parse(input: &str) -> Option<String> {
// Check for quantity suffix: "1,5 milliarden"
let (number_part, suffix) = extract_suffix(trimmed);

// German uses comma as decimal separator, but also accept period
let sep = if number_part.contains(',') && !number_part.contains('.') {
','
} else if number_part.contains('.') {
'.'
} else {
// German uses comma as decimal separator (period is thousands separator)
if !number_part.contains(',') {
return None;
};
}
let sep = ',';

let parts: Vec<&str> = number_part.splitn(2, sep).collect();
if parts.len() != 2 {
Expand Down Expand Up @@ -106,11 +106,6 @@ mod tests {
assert_eq!(parse("0,5"), Some("null komma fuenf".to_string()));
}

#[test]
fn test_period_decimal() {
assert_eq!(parse("3.14"), Some("drei komma eins vier".to_string()));
}

#[test]
fn test_negative() {
assert_eq!(
Expand Down
10 changes: 4 additions & 6 deletions src/tts/de/money.rs
Original file line number Diff line number Diff line change
Expand Up @@ -169,12 +169,10 @@ fn parse_amount(amount_str: &str, currency: &Currency) -> Option<String> {
return None;
}

// Determine decimal separator: German uses comma
let sep = if amount_str.contains(',') { ',' } else { '.' };

if amount_str.contains(sep) && sep != '.' || amount_str.contains('.') {
let actual_sep = if amount_str.contains(',') { ',' } else { '.' };
let parts: Vec<&str> = amount_str.splitn(2, actual_sep).collect();
// Check for decimal separator (comma or period)
if amount_str.contains(',') || amount_str.contains('.') {
let sep = if amount_str.contains(',') { ',' } else { '.' };
let parts: Vec<&str> = amount_str.splitn(2, sep).collect();
if parts.len() == 2 {
let int_clean: String = parts[0].chars().filter(|c| c.is_ascii_digit()).collect();
let dollars: i64 = if int_clean.is_empty() {
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
212 changes: 212 additions & 0 deletions src/tts/en/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
//! Text Normalization taggers for English.
//!
//! Converts written-form text to spoken English:
//! - "200" → "two hundred"
//! - "$5.50" → "five dollars and fifty cents"
//! - "January 5, 2025" → "january fifth twenty twenty five"

pub mod cardinal;
pub mod date;
pub mod decimal;
pub mod electronic;
pub mod measure;
pub mod money;
pub mod ordinal;
pub mod telephone;
pub mod time;
pub mod whitelist;

/// English words for the integers 0 through 19, indexed by value
/// (for example `ONES[7]` is `"seven"`).
///
/// The table covers the single digits as well as ten and the teens, so any
/// value below twenty can be looked up directly without composing a tens word.
const ONES: [&str; 20] = [
"zero",
"one",
"two",
"three",
"four",
"five",
"six",
"seven",
"eight",
"nine",
"ten",
"eleven",
"twelve",
"thirteen",
"fourteen",
"fifteen",
"sixteen",
"seventeen",
"eighteen",
"nineteen",
];

/// English words for the multiples of ten from twenty through ninety.
///
/// The table starts at twenty, so a tens digit `d` in `2..=9` is found at
/// `TENS[d - 2]` (for example `TENS[0]` is `"twenty"`, `TENS[7]` is `"ninety"`).
const TENS: [&str; 8] = [
"twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety",
];

/// Convert an integer to English words.
///
/// The magnitude of `n` is split into groups of three decimal digits, from
/// quintillions down to thousands. Every non-zero group is rendered with
/// `chunk_to_words` followed by its scale name, and the groups are joined with
/// single spaces. Negative values are prefixed with `"minus"`. No "and" and no
/// hyphens are inserted.
///
/// The full `i64` range is supported, including `i64::MIN`.
///
/// Examples:
/// - `0` → `"zero"`
/// - `21` → `"twenty one"`
/// - `123` → `"one hundred twenty three"`
/// - `1000` → `"one thousand"`
/// - `-42` → `"minus forty two"`
pub fn number_to_words(n: i64) -> String {
    // Scale groups, ordered from largest to smallest. Quintillion is the
    // largest scale needed because |i64::MIN| is roughly 9.2 quintillion.
    const SCALES: [(u64, &str); 6] = [
        (1_000_000_000_000_000_000, "quintillion"),
        (1_000_000_000_000_000, "quadrillion"),
        (1_000_000_000_000, "trillion"),
        (1_000_000_000, "billion"),
        (1_000_000, "million"),
        (1_000, "thousand"),
    ];

    if n == 0 {
        return "zero".to_string();
    }

    // `unsigned_abs` returns the magnitude as `u64`, which also represents
    // |i64::MIN| (a value that does not fit in `i64`), so positive and
    // negative inputs can share one code path.
    let mut remaining = n.unsigned_abs();
    let mut parts: Vec<String> = Vec::new();

    if n < 0 {
        parts.push("minus".to_string());
    }

    for &(scale_value, scale_name) in &SCALES {
        if remaining >= scale_value {
            let chunk = remaining / scale_value;
            remaining %= scale_value;
            // `chunk` is at most 9 for the quintillion group and below 1000
            // for every later group, so the narrowing cast cannot truncate.
            parts.push(format!("{} {}", chunk_to_words(chunk as u32), scale_name));
        }
    }

    // Whatever is left is the final group in 1..=999.
    if remaining > 0 {
        parts.push(chunk_to_words(remaining as u32));
    }

    parts.join(" ")
}

/// Render a three-digit group (1..=999) as English words.
///
/// The hundreds digit, if any, is emitted as `"<digit> hundred"`. The
/// remaining two digits come straight from `ONES` when below twenty, and
/// otherwise as a `TENS` word optionally followed by a `ONES` word.
/// Words are separated by single spaces.
///
/// Callers must pass a value in `1..=999`; this is checked in debug builds.
fn chunk_to_words(n: u32) -> String {
    debug_assert!(n > 0 && n < 1000);

    let hundreds_digit = (n / 100) as usize;
    let below_hundred = (n % 100) as usize;
    let mut words: Vec<&str> = Vec::with_capacity(4);

    if hundreds_digit != 0 {
        words.extend_from_slice(&[ONES[hundreds_digit], "hundred"]);
    }

    match below_hundred {
        // Exact multiple of one hundred: nothing more to say.
        0 => {}
        // Single digits, ten and the teens have dedicated words.
        1..=19 => words.push(ONES[below_hundred]),
        // Twenty and above: tens word, then the unit if it is non-zero.
        _ => {
            words.push(TENS[below_hundred / 10 - 2]);
            let unit = below_hundred % 10;
            if unit != 0 {
                words.push(ONES[unit]);
            }
        }
    }

    words.join(" ")
}

/// Read a string out digit by digit.
///
/// Every ASCII decimal digit in `s` is replaced by its English word and the
/// words are joined with single spaces. Characters that are not decimal
/// digits are skipped, so an input without digits yields an empty string.
///
/// "14" → "one four"
pub fn spell_digits(s: &str) -> String {
    let mut spoken: Vec<&str> = Vec::new();

    for ch in s.chars() {
        if let Some(digit) = ch.to_digit(10) {
            spoken.push(ONES[digit as usize]);
        }
    }

    spoken.join(" ")
}

/// Unit tests for the shared English number helpers.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_number_to_words_basic() {
        assert_eq!(number_to_words(0), "zero");
        assert_eq!(number_to_words(1), "one");
        assert_eq!(number_to_words(10), "ten");
        assert_eq!(number_to_words(11), "eleven");
        assert_eq!(number_to_words(19), "nineteen");
        assert_eq!(number_to_words(20), "twenty");
        assert_eq!(number_to_words(21), "twenty one");
        assert_eq!(number_to_words(99), "ninety nine");
    }

    #[test]
    fn test_number_to_words_hundreds() {
        assert_eq!(number_to_words(100), "one hundred");
        assert_eq!(number_to_words(101), "one hundred one");
        assert_eq!(number_to_words(123), "one hundred twenty three");
        assert_eq!(number_to_words(999), "nine hundred ninety nine");
    }

    #[test]
    fn test_number_to_words_thousands() {
        assert_eq!(number_to_words(1000), "one thousand");
        assert_eq!(number_to_words(1001), "one thousand one");
        assert_eq!(
            number_to_words(1234),
            "one thousand two hundred thirty four"
        );
        assert_eq!(number_to_words(10000), "ten thousand");
        assert_eq!(number_to_words(100000), "one hundred thousand");
    }

    #[test]
    fn test_number_to_words_millions() {
        assert_eq!(number_to_words(1000000), "one million");
        assert_eq!(number_to_words(2000003), "two million three");
    }

    #[test]
    fn test_number_to_words_large_scales() {
        assert_eq!(number_to_words(1_000_000_000), "one billion");
        assert_eq!(number_to_words(1_000_000_000_000), "one trillion");
        assert_eq!(
            number_to_words(5_000_000_000_000_000),
            "five quadrillion"
        );
    }

    #[test]
    fn test_number_to_words_negative() {
        assert_eq!(number_to_words(-1), "minus one");
        assert_eq!(number_to_words(-42), "minus forty two");
        assert_eq!(number_to_words(-1000), "minus one thousand");
    }

    // The extremes exercise the quintillion group and, for `i64::MIN`, the
    // magnitude that does not fit in `i64`.
    #[test]
    fn test_number_to_words_extremes() {
        assert_eq!(
            number_to_words(i64::MAX),
            "nine quintillion two hundred twenty three quadrillion \
             three hundred seventy two trillion thirty six billion \
             eight hundred fifty four million seven hundred seventy five thousand \
             eight hundred seven"
        );
        assert_eq!(
            number_to_words(i64::MIN),
            "minus nine quintillion two hundred twenty three quadrillion \
             three hundred seventy two trillion thirty six billion \
             eight hundred fifty four million seven hundred seventy five thousand \
             eight hundred eight"
        );
    }

    #[test]
    fn test_spell_digits() {
        assert_eq!(spell_digits("14"), "one four");
        assert_eq!(spell_digits("0"), "zero");
        assert_eq!(spell_digits("987"), "nine eight seven");
    }

    // Non-digit characters are dropped rather than rejected.
    #[test]
    fn test_spell_digits_skips_non_digits() {
        assert_eq!(spell_digits(""), "");
        assert_eq!(spell_digits("abc"), "");
        assert_eq!(spell_digits("1,5"), "one five");
    }
}
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
10 changes: 4 additions & 6 deletions src/tts/es/money.rs
Original file line number Diff line number Diff line change
Expand Up @@ -172,12 +172,10 @@ fn parse_amount(amount_str: &str, currency: &Currency) -> Option<String> {
return None;
}

// Determine decimal separator: Spanish uses comma, but accept period too
let sep = if amount_str.contains(',') { ',' } else { '.' };

if amount_str.contains(sep) && sep != '.' || amount_str.contains('.') {
let actual_sep = if amount_str.contains(',') { ',' } else { '.' };
let parts: Vec<&str> = amount_str.splitn(2, actual_sep).collect();
// Check for decimal separator (comma or period)
if amount_str.contains(',') || amount_str.contains('.') {
let sep = if amount_str.contains(',') { ',' } else { '.' };
let parts: Vec<&str> = amount_str.splitn(2, sep).collect();
if parts.len() == 2 {
let int_clean: String = parts[0].chars().filter(|c| c.is_ascii_digit()).collect();
let dollars: i64 = if int_clean.is_empty() {
Expand Down
Loading
Loading