# Date written as DD.MM.YYYY (years 1900-2199).
# Must be applied BEFORE TIME_RE: otherwise TIME_RE reads the leading "03.06"
# of "03.06.2026" as a clock time.
DATE_RE = re.compile(
    r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[012])\.((?:19|20|21)\d{2})\b'
)


# Masculine nominative ordinals for 1-19. These take the "-ter" ending (or are
# irregular, e.g. "Erster", "Dritter", "Siebter"); from 20 upwards the ordinal
# is the cardinal plus "ster".
_ORDINAL_IRREGULAR_DE = {
    1: "Erster", 2: "Zweiter", 3: "Dritter", 4: "Vierter",
    5: "Fünfter", 6: "Sechster", 7: "Siebter", 8: "Achter",
    9: "Neunter", 10: "Zehnter", 11: "Elfter", 12: "Zwölfter",
    13: "Dreizehnter", 14: "Vierzehnter", 15: "Fünfzehnter",
    16: "Sechzehnter", 17: "Siebzehnter", 18: "Achtzehnter",
    19: "Neunzehnter",
}


def ordinal_de(n: int) -> str:
    """Return the capitalized masculine nominative German ordinal for *n*.

    Examples: 1 -> 'Erster', 20 -> 'Zwanzigster'.

    Args:
        n: The number to convert; expected to be >= 1.

    Returns:
        The ordinal word with an upper-case first letter.
    """
    direct = _ORDINAL_IRREGULAR_DE.get(n)
    if direct is not None:
        return direct

    # The ordinal ending depends on the last two digits only: 101 is
    # "...hundert" + "erster", not "...hunderteins" + "ster".
    # NOTE(review): assumes number_to_words_de() spells e.g. 101 as the words
    # for 100 followed by the words for 1 -- confirm against that helper.
    tail = n % 100
    if n >= 100 and tail in _ORDINAL_IRREGULAR_DE:
        head = number_to_words_de(n - tail).capitalize()
        return head + _ORDINAL_IRREGULAR_DE[tail].lower()

    return number_to_words_de(n).capitalize() + "ster"


def year_for_date_de(year: int) -> str:
    """Return the spoken form of *year* as used inside a German date.

    Example: 2026 -> 'Zwanzigsechsundzwanzig'.

    Args:
        year: Four-digit year.

    Returns:
        The capitalized year wording. 2010-2099 are read as "Zwanzig" plus the
        last two digits, 2000-2009 as "Zweitausend" plus the last digit, and
        every other year is delegated to year_to_words_de().
    """
    if 2010 <= year <= 2099:
        return "Zwanzig" + number_to_words_de(year % 100)
    if year == 2000:
        return "Zweitausend"
    if 2001 <= year <= 2009:
        return "Zweitausend" + number_to_words_de(year - 2000)
    return year_to_words_de(year).capitalize()


def normalize_dates(text: str, lang: str) -> str:
    """Replace DD.MM.YYYY dates in *text* with speakable German words.

    Args:
        text: Input text.
        lang: Language code; only "de" is handled, any other value returns
            *text* unchanged.

    Returns:
        The text with every DATE_RE match rewritten as
        "<day ordinal> <month ordinal> <year words>".
    """
    if lang != "de":
        return text

    def repl(m: re.Match) -> str:
        day = int(m.group(1))
        month = int(m.group(2))
        year = int(m.group(3))
        return f"{ordinal_de(day)} {ordinal_de(month)} {year_for_date_de(year)}"

    return DATE_RE.sub(repl, text)
True, acronym_mode: str = "period_space", + normalize_date_values: bool = True, normalize_time_values: bool = True, normalize_year_values: bool = True, normalize_units_values: bool = True, @@ -1050,6 +1117,7 @@ def synthesize_streaming( lang=lang, spell_uppercase_acronyms=spell_uppercase_acronyms, acronym_mode=acronym_mode, + normalize_date_values=normalize_date_values, normalize_time_values=normalize_time_values, normalize_year_values=normalize_year_values, normalize_units_values=normalize_units_values, @@ -1169,6 +1237,7 @@ def build_argparser() -> argparse.ArgumentParser: help="Ausgabeformat für buchstabierte Akronyme. Default: 'german' bei --lang de, sonst 'period_space'." ) p.add_argument("--pronunciation-dict", type=str, default=None, help="Pfad zu einer JSON-Datei mit Aussprache-Substitutionen (Eigenname → Lautschrift).") + p.add_argument("--no-normalize-dates", action="store_true", help="Datumsangaben (03.06.2026) nicht in sprechbaren Text umwandeln.") p.add_argument("--no-normalize-times", action="store_true", help="Uhrzeiten nicht in sprechbaren Text umwandeln.") p.add_argument("--no-normalize-years", action="store_true", help="Jahreszahlen nicht in sprechbaren Text umwandeln.") p.add_argument("--no-normalize-units", action="store_true", help="Einheiten nicht in sprechbaren Text umwandeln.") @@ -1241,6 +1310,7 @@ def main() -> int: show_progress=not args.no_progress, spell_uppercase_acronyms=not args.no_spell_acronyms, acronym_mode=acronym_mode, + normalize_date_values=not args.no_normalize_dates, normalize_time_values=not args.no_normalize_times, normalize_year_values=not args.no_normalize_years, normalize_units_values=not args.no_normalize_units, @@ -1263,6 +1333,7 @@ def main() -> int: show_progress=not args.no_progress, spell_uppercase_acronyms=not args.no_spell_acronyms, acronym_mode=acronym_mode, + normalize_date_values=not args.no_normalize_dates, normalize_time_values=not args.no_normalize_times, normalize_year_values=not args.no_normalize_years, 
normalize_units_values=not args.no_normalize_units,