Datum-Normalisierung: DD.MM.YYYY → deutsche Ordinalform

03.06.2026 → "Dritter Sechster Zwanzigsechsundzwanzig"
31.12.2027 → "Einunddreißigster Zwölfter Zwanzigsiebenundzwanzig"

- DATE_RE erkennt DD.MM.YYYY (mit/ohne führende Null)
- ordinal_de(): nominativer Ordinal (1→Erster, 12→Zwölfter, 31→Einunddreißigster)
- year_for_date_de(): Datums-Jahrform (2026→Zwanzigsechsundzwanzig)
- normalize_dates() wird VOR normalize_times() aufgerufen (verhindert,
  dass TIME_RE "03.06" als Uhrzeit interpretiert)
- Neues CLI-Flag --no-normalize-dates
- Parameter normalize_date_values in preprocess_tts_text, synthesize_non_streaming,
  synthesize_streaming

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Dieter Schlüter 2026-06-03 23:45:03 +02:00
commit ff3e370bbd

View file

@ -130,6 +130,11 @@ ACRONYM_COMPOUND_RE = re.compile(r'\b([A-ZÄÖÜ]{2,}(?:[A-ZÄÖÜ0-9]*[A-ZÄÖ
# - 14.58 Uhr # - 14.58 Uhr
TIME_RE = re.compile(r'\b([01]?\d|2[0-3])([:.])([0-5]\d)(?:\s*Uhr)?\b', re.IGNORECASE) TIME_RE = re.compile(r'\b([01]?\d|2[0-3])([:.])([0-5]\d)(?:\s*Uhr)?\b', re.IGNORECASE)
# Date in DD.MM.YYYY form. Must be applied BEFORE TIME_RE, which would
# otherwise match the leading "03.06" of a date as a time of day.
DATE_RE = re.compile(
    r'\b(0?[1-9]|[12]\d|3[01])'   # group 1: day 1-31, leading zero optional
    r'\.(0?[1-9]|1[012])'         # group 2: month 1-12, leading zero optional
    r'\.((?:19|20|21)\d{2})\b'    # group 3: four-digit year 1900-2199
)
# Vierstellige Jahreszahlen # Vierstellige Jahreszahlen
YEAR_RE = re.compile(r'\b(19\d{2}|20\d{2}|21\d{2})\b') YEAR_RE = re.compile(r'\b(19\d{2}|20\d{2}|21\d{2})\b')
@ -200,9 +205,22 @@ DEFAULT_PRONUNCIATION_DE: dict[str, str] = {
"MacBook": "Mäk Buk", "MacBook": "Mäk Buk",
"ChatGPT": "Tschet Dschie Pie Tie", "ChatGPT": "Tschet Dschie Pie Tie",
"OpenAI": "Open A I", "OpenAI": "Open A I",
"Anthropic": "Enthropik",
"Claude": "Kloode",
# KI-Begriffe # KI-Begriffe
"GPT": "Dschie Pie Tie", "GPT": "Dschie Pie Tie",
"LLM": "El El Em", "LLM": "El El Em",
# Sonstige
"UN": "Uh En",
"ARD": "Ah Er Dee",
"ZDF": "Tset De Eff",
"RTL": "Er Te El",
"AFD": "Ah Eff Dee",
"AfD": "Ah Eff Dee",
"CDU": "Tse De Uh",
"SPD": "Es Pe Dee",
"FDP": "Eff De Pee",
"BSW": "Be Es Wee",
} }
@ -440,6 +458,47 @@ def year_to_words_en(year: int) -> str:
return f"{number_to_words_en(first_two)} {number_to_words_en(last_two)}" return f"{number_to_words_en(first_two)} {number_to_words_en(last_two)}"
# Nominative masculine ordinals for 1-19. These end in "-ter" (several with an
# irregular stem), unlike the ordinals from 20 upwards, which end in "-ster".
_ORDINAL_IRREGULAR_DE = {
    1: "Erster", 2: "Zweiter", 3: "Dritter", 4: "Vierter",
    5: "Fünfter", 6: "Sechster", 7: "Siebter", 8: "Achter",
    9: "Neunter", 10: "Zehnter", 11: "Elfter", 12: "Zwölfter",
    13: "Dreizehnter", 14: "Vierzehnter", 15: "Fünfzehnter",
    16: "Sechzehnter", 17: "Siebzehnter", 18: "Achtzehnter",
    19: "Neunzehnter",
}


def ordinal_de(n: int) -> str:
    """Return the capitalised nominative masculine German ordinal for *n*.

    Examples: 1 -> 'Erster', 12 -> 'Zwölfter', 20 -> 'Zwanzigster'.

    Values 1-19 come from the lookup table. For larger numbers the ending is
    decided by the last two digits: if they are 1-19 the table form is appended
    to the cardinal of the remaining hundreds (103 -> "...dritter"); otherwise
    "ster" is appended to the cardinal of the whole number.

    NOTE(review): assumes number_to_words_de() returns the cardinal as a single
    compound word; that helper is defined elsewhere in the file - confirm.
    """
    if n in _ORDINAL_IRREGULAR_DE:
        return _ORDINAL_IRREGULAR_DE[n]

    tail = n % 100
    if n > 100 and tail in _ORDINAL_IRREGULAR_DE:
        # Appending "ster" to the full cardinal would yield forms such as
        # "Hundertdreister"; reuse the 1-19 ordinal for the last two digits.
        head = number_to_words_de(n - tail)
        # capitalize() also lowercases the table entry's leading capital.
        return (head + _ORDINAL_IRREGULAR_DE[tail]).capitalize()

    return number_to_words_de(n).capitalize() + "ster"
def year_for_date_de(year: int) -> str:
    """Spell a year the way it is spoken inside a German date.

    2000 gives "Zweitausend"; 2001-2009 give "Zweitausend" followed by the
    cardinal of the last digit; 2010-2099 give "Zwanzig" followed by the
    cardinal of the last two digits (2026 -> 'Zwanzigsechsundzwanzig'). Every
    other year is delegated to year_to_words_de() and capitalised.
    """
    # Distance from the year 2000; for 2010-2099 this equals year % 100.
    offset = year - 2000
    if offset == 0:
        return "Zweitausend"
    if 1 <= offset <= 9:
        return "Zweitausend" + number_to_words_de(offset)
    if 10 <= offset <= 99:
        return "Zwanzig" + number_to_words_de(offset)
    return year_to_words_de(year).capitalize()
def normalize_dates(text: str, lang: str) -> str:
    """Replace DD.MM.YYYY dates in German text with their spoken form.

    Each DATE_RE match becomes "<day ordinal> <month ordinal> <year>", built
    with ordinal_de() and year_for_date_de(). Text in any language other than
    "de" is returned unchanged.
    """
    if lang == "de":
        def spoken(match: re.Match) -> str:
            # DATE_RE captures exactly three groups: day, month, year.
            day, month, year = (int(part) for part in match.groups())
            words = (ordinal_de(day), ordinal_de(month), year_for_date_de(year))
            return " ".join(words)

        return DATE_RE.sub(spoken, text)
    return text
def spell_out_acronym(token: str, mode: str = "period_space") -> str: def spell_out_acronym(token: str, mode: str = "period_space") -> str:
chars = list(token) chars = list(token)
@ -523,6 +582,7 @@ def preprocess_tts_text(
lang: str, lang: str,
spell_uppercase_acronyms: bool = True, spell_uppercase_acronyms: bool = True,
acronym_mode: Optional[str] = None, # None = auto: 'german' bei de, sonst 'period_space' acronym_mode: Optional[str] = None, # None = auto: 'german' bei de, sonst 'period_space'
normalize_date_values: bool = True,
normalize_time_values: bool = True, normalize_time_values: bool = True,
normalize_year_values: bool = True, normalize_year_values: bool = True,
normalize_units_values: bool = True, normalize_units_values: bool = True,
@ -540,6 +600,10 @@ def preprocess_tts_text(
if normalize_units_values: if normalize_units_values:
text = normalize_units(text, lang) text = normalize_units(text, lang)
# Datum VOR Uhrzeit normalisieren — sonst matcht TIME_RE "03.06" fälschlicherweise
if normalize_date_values:
text = normalize_dates(text, lang)
if normalize_time_values: if normalize_time_values:
text = normalize_times(text, lang) text = normalize_times(text, lang)
@ -887,6 +951,7 @@ def synthesize_non_streaming(
show_progress: bool = True, show_progress: bool = True,
spell_uppercase_acronyms: bool = True, spell_uppercase_acronyms: bool = True,
acronym_mode: Optional[str] = None, acronym_mode: Optional[str] = None,
normalize_date_values: bool = True,
normalize_time_values: bool = True, normalize_time_values: bool = True,
normalize_year_values: bool = True, normalize_year_values: bool = True,
normalize_units_values: bool = True, normalize_units_values: bool = True,
@ -921,6 +986,7 @@ def synthesize_non_streaming(
preprocess_kw = dict( preprocess_kw = dict(
spell_uppercase_acronyms=spell_uppercase_acronyms, spell_uppercase_acronyms=spell_uppercase_acronyms,
acronym_mode=acronym_mode, acronym_mode=acronym_mode,
normalize_date_values=normalize_date_values,
normalize_time_values=normalize_time_values, normalize_time_values=normalize_time_values,
normalize_year_values=normalize_year_values, normalize_year_values=normalize_year_values,
normalize_units_values=normalize_units_values, normalize_units_values=normalize_units_values,
@ -1011,6 +1077,7 @@ def synthesize_streaming(
show_progress: bool = True, show_progress: bool = True,
spell_uppercase_acronyms: bool = True, spell_uppercase_acronyms: bool = True,
acronym_mode: str = "period_space", acronym_mode: str = "period_space",
normalize_date_values: bool = True,
normalize_time_values: bool = True, normalize_time_values: bool = True,
normalize_year_values: bool = True, normalize_year_values: bool = True,
normalize_units_values: bool = True, normalize_units_values: bool = True,
@ -1050,6 +1117,7 @@ def synthesize_streaming(
lang=lang, lang=lang,
spell_uppercase_acronyms=spell_uppercase_acronyms, spell_uppercase_acronyms=spell_uppercase_acronyms,
acronym_mode=acronym_mode, acronym_mode=acronym_mode,
normalize_date_values=normalize_date_values,
normalize_time_values=normalize_time_values, normalize_time_values=normalize_time_values,
normalize_year_values=normalize_year_values, normalize_year_values=normalize_year_values,
normalize_units_values=normalize_units_values, normalize_units_values=normalize_units_values,
@ -1169,6 +1237,7 @@ def build_argparser() -> argparse.ArgumentParser:
help="Ausgabeformat für buchstabierte Akronyme. Default: 'german' bei --lang de, sonst 'period_space'." help="Ausgabeformat für buchstabierte Akronyme. Default: 'german' bei --lang de, sonst 'period_space'."
) )
p.add_argument("--pronunciation-dict", type=str, default=None, help="Pfad zu einer JSON-Datei mit Aussprache-Substitutionen (Eigenname → Lautschrift).") p.add_argument("--pronunciation-dict", type=str, default=None, help="Pfad zu einer JSON-Datei mit Aussprache-Substitutionen (Eigenname → Lautschrift).")
p.add_argument("--no-normalize-dates", action="store_true", help="Datumsangaben (03.06.2026) nicht in sprechbaren Text umwandeln.")
p.add_argument("--no-normalize-times", action="store_true", help="Uhrzeiten nicht in sprechbaren Text umwandeln.") p.add_argument("--no-normalize-times", action="store_true", help="Uhrzeiten nicht in sprechbaren Text umwandeln.")
p.add_argument("--no-normalize-years", action="store_true", help="Jahreszahlen nicht in sprechbaren Text umwandeln.") p.add_argument("--no-normalize-years", action="store_true", help="Jahreszahlen nicht in sprechbaren Text umwandeln.")
p.add_argument("--no-normalize-units", action="store_true", help="Einheiten nicht in sprechbaren Text umwandeln.") p.add_argument("--no-normalize-units", action="store_true", help="Einheiten nicht in sprechbaren Text umwandeln.")
@ -1241,6 +1310,7 @@ def main() -> int:
show_progress=not args.no_progress, show_progress=not args.no_progress,
spell_uppercase_acronyms=not args.no_spell_acronyms, spell_uppercase_acronyms=not args.no_spell_acronyms,
acronym_mode=acronym_mode, acronym_mode=acronym_mode,
normalize_date_values=not args.no_normalize_dates,
normalize_time_values=not args.no_normalize_times, normalize_time_values=not args.no_normalize_times,
normalize_year_values=not args.no_normalize_years, normalize_year_values=not args.no_normalize_years,
normalize_units_values=not args.no_normalize_units, normalize_units_values=not args.no_normalize_units,
@ -1263,6 +1333,7 @@ def main() -> int:
show_progress=not args.no_progress, show_progress=not args.no_progress,
spell_uppercase_acronyms=not args.no_spell_acronyms, spell_uppercase_acronyms=not args.no_spell_acronyms,
acronym_mode=acronym_mode, acronym_mode=acronym_mode,
normalize_date_values=not args.no_normalize_dates,
normalize_time_values=not args.no_normalize_times, normalize_time_values=not args.no_normalize_times,
normalize_year_values=not args.no_normalize_years, normalize_year_values=not args.no_normalize_years,
normalize_units_values=not args.no_normalize_units, normalize_units_values=not args.no_normalize_units,