Datum-Normalisierung: DD.MM.YYYY → deutsche Ordinalform

03.06.2026 → "Dritter Sechster Zwanzigsechsundzwanzig"
31.12.2027 → "Einunddreißigster Zwölfter Zwanzigsiebenundzwanzig"

- DATE_RE erkennt DD.MM.YYYY (mit/ohne führende Null)
- ordinal_de(): nominativer Ordinal (1→Erster, 12→Zwölfter, 31→Einunddreißigster)
- year_for_date_de(): Datums-Jahrform (2026→Zwanzigsechsundzwanzig)
- normalize_dates() wird VOR normalize_times() aufgerufen (verhindert, dass TIME_RE "03.06" als Uhrzeit frisst)
- Neues CLI-Flag --no-normalize-dates
- Parameter normalize_date_values in preprocess_tts_text, synthesize_non_streaming, synthesize_streaming

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-03 23:45:03 +02:00 · 2026-06-03 23:45:03 +02:00 · ff3e370bbd
commit ff3e370bbd
parent ce46d4aa59
1 changed files with 71 additions and 0 deletions
--- a/chatterbox_cli_v4.py
+++ b/chatterbox_cli_v4.py
@ -130,6 +130,11 @@ ACRONYM_COMPOUND_RE = re.compile(r'\b([A-ZÄÖÜ]{2,}(?:[A-ZÄÖÜ0-9]*[A-ZÄÖ
 # - 14.58 Uhr
 TIME_RE = re.compile(r'\b([01]?\d|2[0-3])([:.])([0-5]\d)(?:\s*Uhr)?\b', re.IGNORECASE)

+# Date in DD.MM.YYYY form — must be processed BEFORE TIME_RE (otherwise TIME_RE matches "03.06" as a time of day).
+# Day (1-31) and month (1-12) may carry a leading zero; the year must be four digits starting with 19, 20 or 21.
+DATE_RE = re.compile(
+    r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[012])\.((?:19|20|21)\d{2})\b'
+)
+
 # Vierstellige Jahreszahlen
 YEAR_RE = re.compile(r'\b(19\d{2}|20\d{2}|21\d{2})\b')

@ -200,9 +205,22 @@ DEFAULT_PRONUNCIATION_DE: dict[str, str] = {
    "MacBook":    "Mäk Buk",
    "ChatGPT":    "Tschet Dschie Pie Tie",
    "OpenAI":     "Open A I",
+    "Anthropic":  "Enthropik",
+    "Claude":     "Kloode",
    # KI-Begriffe
    "GPT":        "Dschie Pie Tie",
    "LLM":        "El El Em",
+    # Sonstige
+    "UN":         "Uh En",
+    "ARD":        "Ah Er Dee",
+    "ZDF":        "Tset De Eff",
+    "RTL":        "Er Te El",
+    "AFD":        "Ah Eff Dee",
+    "AfD":        "Ah Eff Dee",
+    "CDU":        "Tse De Uh",
+    "SPD":        "Es Pe Dee",
+    "FDP":        "Eff De Pee",
+    "BSW":        "Be Es Wee",
 }


@ -440,6 +458,47 @@ def year_to_words_en(year: int) -> str:
    return f"{number_to_words_en(first_two)} {number_to_words_en(last_two)}"


+# Lookup table of German nominative masculine ordinals for 1-19.
+# ordinal_de() consults this table first and only builds "<cardinal>ster" for values not listed here.
+_ORDINAL_IRREGULAR_DE = {
+    1: "Erster",     2: "Zweiter",    3: "Dritter",     4: "Vierter",
+    5: "Fünfter",    6: "Sechster",   7: "Siebter",     8: "Achter",
+    9: "Neunter",   10: "Zehnter",   11: "Elfter",     12: "Zwölfter",
+    13: "Dreizehnter", 14: "Vierzehnter", 15: "Fünfzehnter",
+    16: "Sechzehnter", 17: "Siebzehnter", 18: "Achtzehnter",
+    19: "Neunzehnter",
+}
+
+
+def ordinal_de(n: int) -> str:
+    """Return the German nominative masculine ordinal for ``n``.
+
+    Values listed in ``_ORDINAL_IRREGULAR_DE`` come straight from that table
+    (1 -> 'Erster'). Any other value is built from the capitalized cardinal
+    word with the suffix 'ster' appended (20 -> 'Zwanzigster').
+    """
+    table_entry = _ORDINAL_IRREGULAR_DE.get(n)
+    if table_entry is None:
+        # Not in the table: derive the ordinal from the cardinal number word.
+        return number_to_words_de(n).capitalize() + "ster"
+    return table_entry
+
+
+def year_for_date_de(year: int) -> str:
+    """Return the spoken form of a year as used inside a date.
+
+    Years 2010-2099 become 'Zwanzig' + the last two digits in words
+    (2026 -> 'Zwanzigsechsundzwanzig'); 2000 becomes 'Zweitausend';
+    2001-2009 become 'Zweitausend' + the last digit in words. Every other
+    year is delegated to ``year_to_words_de`` and capitalized.
+    """
+    century, remainder = divmod(year, 100)
+    if century != 20:
+        # Outside 2000-2099: use the general year reading.
+        return year_to_words_de(year).capitalize()
+    if remainder == 0:
+        return "Zweitausend"
+    prefix = "Zweitausend" if remainder < 10 else "Zwanzig"
+    return prefix + number_to_words_de(remainder)
+
+
+def normalize_dates(text: str, lang: str) -> str:
+    """Replace every DD.MM.YYYY date matched by ``DATE_RE`` with spoken words.
+
+    Only German (``lang == "de"``) is handled; for any other language the
+    text is returned unchanged. Each match becomes
+    '<day ordinal> <month ordinal> <year words>'.
+    """
+
+    def _spoken(match: re.Match) -> str:
+        # DATE_RE captures day, month and year as groups 1-3, in that order.
+        day, month, year = (int(part) for part in match.groups())
+        return " ".join((ordinal_de(day), ordinal_de(month), year_for_date_de(year)))
+
+    return DATE_RE.sub(_spoken, text) if lang == "de" else text
+
+
 def spell_out_acronym(token: str, mode: str = "period_space") -> str:
    chars = list(token)

@ -523,6 +582,7 @@ def preprocess_tts_text(
    lang: str,
    spell_uppercase_acronyms: bool = True,
    acronym_mode: Optional[str] = None,  # None = auto: 'german' bei de, sonst 'period_space'
+    normalize_date_values: bool = True,
    normalize_time_values: bool = True,
    normalize_year_values: bool = True,
    normalize_units_values: bool = True,
@ -540,6 +600,10 @@ def preprocess_tts_text(
    if normalize_units_values:
        text = normalize_units(text, lang)

+    # Datum VOR Uhrzeit normalisieren — sonst matcht TIME_RE "03.06" fälschlicherweise
+    if normalize_date_values:
+        text = normalize_dates(text, lang)
+
    if normalize_time_values:
        text = normalize_times(text, lang)

@ -887,6 +951,7 @@ def synthesize_non_streaming(
    show_progress: bool = True,
    spell_uppercase_acronyms: bool = True,
    acronym_mode: Optional[str] = None,
+    normalize_date_values: bool = True,
    normalize_time_values: bool = True,
    normalize_year_values: bool = True,
    normalize_units_values: bool = True,
@ -921,6 +986,7 @@ def synthesize_non_streaming(
    preprocess_kw = dict(
        spell_uppercase_acronyms=spell_uppercase_acronyms,
        acronym_mode=acronym_mode,
+        normalize_date_values=normalize_date_values,
        normalize_time_values=normalize_time_values,
        normalize_year_values=normalize_year_values,
        normalize_units_values=normalize_units_values,
@ -1011,6 +1077,7 @@ def synthesize_streaming(
    show_progress: bool = True,
    spell_uppercase_acronyms: bool = True,
    acronym_mode: str = "period_space",
+    normalize_date_values: bool = True,
    normalize_time_values: bool = True,
    normalize_year_values: bool = True,
    normalize_units_values: bool = True,
@ -1050,6 +1117,7 @@ def synthesize_streaming(
        lang=lang,
        spell_uppercase_acronyms=spell_uppercase_acronyms,
        acronym_mode=acronym_mode,
+        normalize_date_values=normalize_date_values,
        normalize_time_values=normalize_time_values,
        normalize_year_values=normalize_year_values,
        normalize_units_values=normalize_units_values,
@ -1169,6 +1237,7 @@ def build_argparser() -> argparse.ArgumentParser:
        help="Ausgabeformat für buchstabierte Akronyme. Default: 'german' bei --lang de, sonst 'period_space'."
    )
    p.add_argument("--pronunciation-dict", type=str, default=None, help="Pfad zu einer JSON-Datei mit Aussprache-Substitutionen (Eigenname → Lautschrift).")
+    p.add_argument("--no-normalize-dates", action="store_true", help="Datumsangaben (03.06.2026) nicht in sprechbaren Text umwandeln.")
    p.add_argument("--no-normalize-times", action="store_true", help="Uhrzeiten nicht in sprechbaren Text umwandeln.")
    p.add_argument("--no-normalize-years", action="store_true", help="Jahreszahlen nicht in sprechbaren Text umwandeln.")
    p.add_argument("--no-normalize-units", action="store_true", help="Einheiten nicht in sprechbaren Text umwandeln.")
@ -1241,6 +1310,7 @@ def main() -> int:
                show_progress=not args.no_progress,
                spell_uppercase_acronyms=not args.no_spell_acronyms,
                acronym_mode=acronym_mode,
+                normalize_date_values=not args.no_normalize_dates,
                normalize_time_values=not args.no_normalize_times,
                normalize_year_values=not args.no_normalize_years,
                normalize_units_values=not args.no_normalize_units,
@ -1263,6 +1333,7 @@ def main() -> int:
                show_progress=not args.no_progress,
                spell_uppercase_acronyms=not args.no_spell_acronyms,
                acronym_mode=acronym_mode,
+                normalize_date_values=not args.no_normalize_dates,
                normalize_time_values=not args.no_normalize_times,
                normalize_year_values=not args.no_normalize_years,
                normalize_units_values=not args.no_normalize_units,