Datum-Normalisierung: DD.MM.YYYY → deutsche Ordinalform
03.06.2026 → "Dritter Sechster Zwanzigsechsundzwanzig"
31.12.2027 → "Einunddreißigster Zwölfter Zwanzigsiebenundzwanzig"

- DATE_RE erkennt DD.MM.YYYY (mit/ohne führende Null)
- ordinal_de(): nominativer Ordinal (1→Erster, 12→Zwölfter, 31→Einunddreißigster)
- year_for_date_de(): Datums-Jahrform (2026→Zwanzigsechsundzwanzig)
- normalize_dates() wird VOR normalize_times() aufgerufen (verhindert, dass TIME_RE "03.06" als Uhrzeit frisst)
- Neues CLI-Flag --no-normalize-dates
- Parameter normalize_date_values in preprocess_tts_text, synthesize_non_streaming, synthesize_streaming

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
ce46d4aa59
commit
ff3e370bbd
1 changed files with 71 additions and 0 deletions
|
|
@ -130,6 +130,11 @@ ACRONYM_COMPOUND_RE = re.compile(r'\b([A-ZÄÖÜ]{2,}(?:[A-ZÄÖÜ0-9]*[A-ZÄÖ
|
|||
# - 14.58 Uhr
|
||||
# Clock time: hour 0–23 (leading zero optional), ':' or '.' as separator,
# minute 00–59; a trailing "Uhr" (any letter case) is consumed when present.
TIME_RE = re.compile(
    r"\b([01]?\d|2[0-3])([:.])([0-5]\d)(?:\s*Uhr)?\b",
    flags=re.IGNORECASE,
)
|
||||
|
||||
# Date in DD.MM.YYYY form: day 1–31 and month 1–12 (each with or without a
# leading zero), year 1900–2199. Must be applied BEFORE TIME_RE, which would
# otherwise consume the leading "03.06" of "03.06.2026" as a clock time.
# Groups: 1 = day, 2 = month, 3 = four-digit year.
DATE_RE = re.compile(
    r'\b(0?[1-9]|[12]\d|3[01])\.(0?[1-9]|1[012])\.((?:19|20|21)\d{2})\b'
)
|
||||
|
||||
# Stand-alone four-digit years in the range 1900–2199.
YEAR_RE = re.compile(r'\b(19\d{2}|20\d{2}|21\d{2})\b')
|
||||
|
||||
|
|
@ -200,9 +205,22 @@ DEFAULT_PRONUNCIATION_DE: dict[str, str] = {
|
|||
"MacBook": "Mäk Buk",
|
||||
"ChatGPT": "Tschet Dschie Pie Tie",
|
||||
"OpenAI": "Open A I",
|
||||
"Anthropic": "Enthropik",
|
||||
"Claude": "Kloode",
|
||||
# KI-Begriffe
|
||||
"GPT": "Dschie Pie Tie",
|
||||
"LLM": "El El Em",
|
||||
# Sonstige
|
||||
"UN": "Uh En",
|
||||
"ARD": "Ah Er Dee",
|
||||
"ZDF": "Tset De Eff",
|
||||
"RTL": "Er Te El",
|
||||
"AFD": "Ah Eff Dee",
|
||||
"AfD": "Ah Eff Dee",
|
||||
"CDU": "Tse De Uh",
|
||||
"SPD": "Es Pe Dee",
|
||||
"FDP": "Eff De Pee",
|
||||
"BSW": "Be Es Wee",
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -440,6 +458,47 @@ def year_to_words_en(year: int) -> str:
|
|||
return f"{number_to_words_en(first_two)} {number_to_words_en(last_two)}"
|
||||
|
||||
|
||||
_ORDINAL_IRREGULAR_DE = {
|
||||
1: "Erster", 2: "Zweiter", 3: "Dritter", 4: "Vierter",
|
||||
5: "Fünfter", 6: "Sechster", 7: "Siebter", 8: "Achter",
|
||||
9: "Neunter", 10: "Zehnter", 11: "Elfter", 12: "Zwölfter",
|
||||
13: "Dreizehnter", 14: "Vierzehnter", 15: "Fünfzehnter",
|
||||
16: "Sechzehnter", 17: "Siebzehnter", 18: "Achtzehnter",
|
||||
19: "Neunzehnter",
|
||||
}
|
||||
|
||||
|
||||
def ordinal_de(n: int) -> str:
    """Return the nominative masculine German ordinal word for ``n``.

    Values present in ``_ORDINAL_IRREGULAR_DE`` (1–19) are returned from that
    table, e.g. 1 -> 'Erster'. Any other value is rendered as the capitalized
    result of ``number_to_words_de(n)`` followed by "ster", e.g.
    20 -> 'Zwanzigster'.

    NOTE(review): the "ster" rule presumably only yields correct German when
    the last spoken component of the cardinal is 20 or more; the date
    normalization only passes days 1–31 and months 1–12 — confirm before
    reusing this for larger numbers.
    """
    table_entry = _ORDINAL_IRREGULAR_DE.get(n)
    if table_entry is not None:
        return table_entry
    cardinal = number_to_words_de(n)
    return cardinal.capitalize() + "ster"
|
||||
|
||||
|
||||
def year_for_date_de(year: int) -> str:
    """Return the spoken form of ``year`` as used when reading out a date.

    - 2000 -> 'Zweitausend'
    - 2001–2009 -> 'Zweitausend' + ``number_to_words_de`` of the last digit
    - 2010–2099 -> 'Zwanzig' + ``number_to_words_de`` of the last two digits,
      e.g. 2026 -> 'Zwanzigsechsundzwanzig'
    - any other year -> capitalized result of ``year_to_words_de(year)``
    """
    if year == 2000:
        return "Zweitausend"
    if 2001 <= year <= 2009:
        return "Zweitausend" + number_to_words_de(year - 2000)
    if 2010 <= year <= 2099:
        # Read as two halves: "Zwanzig" followed by the two-digit remainder.
        return "Zwanzig" + number_to_words_de(year % 100)
    return year_to_words_de(year).capitalize()
|
||||
|
||||
|
||||
def normalize_dates(text: str, lang: str) -> str:
    """Replace DD.MM.YYYY dates in ``text`` with spoken German ordinal words.

    Only active for ``lang == "de"``; for any other language ``text`` is
    returned unchanged. Each ``DATE_RE`` match becomes
    "<day ordinal> <month ordinal> <year words>", e.g.
    03.06.2026 -> 'Dritter Sechster Zwanzigsechsundzwanzig'.
    """
    if lang != "de":
        return text

    def _spoken_date(match: re.Match) -> str:
        # DATE_RE groups: 1 = day, 2 = month, 3 = four-digit year.
        day, month, year = (int(match.group(i)) for i in (1, 2, 3))
        return " ".join(
            (ordinal_de(day), ordinal_de(month), year_for_date_de(year))
        )

    return DATE_RE.sub(_spoken_date, text)
|
||||
|
||||
|
||||
def spell_out_acronym(token: str, mode: str = "period_space") -> str:
|
||||
chars = list(token)
|
||||
|
||||
|
|
@ -523,6 +582,7 @@ def preprocess_tts_text(
|
|||
lang: str,
|
||||
spell_uppercase_acronyms: bool = True,
|
||||
acronym_mode: Optional[str] = None, # None = auto: 'german' bei de, sonst 'period_space'
|
||||
normalize_date_values: bool = True,
|
||||
normalize_time_values: bool = True,
|
||||
normalize_year_values: bool = True,
|
||||
normalize_units_values: bool = True,
|
||||
|
|
@ -540,6 +600,10 @@ def preprocess_tts_text(
|
|||
if normalize_units_values:
|
||||
text = normalize_units(text, lang)
|
||||
|
||||
# Datum VOR Uhrzeit normalisieren — sonst matcht TIME_RE "03.06" fälschlicherweise
|
||||
if normalize_date_values:
|
||||
text = normalize_dates(text, lang)
|
||||
|
||||
if normalize_time_values:
|
||||
text = normalize_times(text, lang)
|
||||
|
||||
|
|
@ -887,6 +951,7 @@ def synthesize_non_streaming(
|
|||
show_progress: bool = True,
|
||||
spell_uppercase_acronyms: bool = True,
|
||||
acronym_mode: Optional[str] = None,
|
||||
normalize_date_values: bool = True,
|
||||
normalize_time_values: bool = True,
|
||||
normalize_year_values: bool = True,
|
||||
normalize_units_values: bool = True,
|
||||
|
|
@ -921,6 +986,7 @@ def synthesize_non_streaming(
|
|||
preprocess_kw = dict(
|
||||
spell_uppercase_acronyms=spell_uppercase_acronyms,
|
||||
acronym_mode=acronym_mode,
|
||||
normalize_date_values=normalize_date_values,
|
||||
normalize_time_values=normalize_time_values,
|
||||
normalize_year_values=normalize_year_values,
|
||||
normalize_units_values=normalize_units_values,
|
||||
|
|
@ -1011,6 +1077,7 @@ def synthesize_streaming(
|
|||
show_progress: bool = True,
|
||||
spell_uppercase_acronyms: bool = True,
|
||||
acronym_mode: str = "period_space",
|
||||
normalize_date_values: bool = True,
|
||||
normalize_time_values: bool = True,
|
||||
normalize_year_values: bool = True,
|
||||
normalize_units_values: bool = True,
|
||||
|
|
@ -1050,6 +1117,7 @@ def synthesize_streaming(
|
|||
lang=lang,
|
||||
spell_uppercase_acronyms=spell_uppercase_acronyms,
|
||||
acronym_mode=acronym_mode,
|
||||
normalize_date_values=normalize_date_values,
|
||||
normalize_time_values=normalize_time_values,
|
||||
normalize_year_values=normalize_year_values,
|
||||
normalize_units_values=normalize_units_values,
|
||||
|
|
@ -1169,6 +1237,7 @@ def build_argparser() -> argparse.ArgumentParser:
|
|||
help="Ausgabeformat für buchstabierte Akronyme. Default: 'german' bei --lang de, sonst 'period_space'."
|
||||
)
|
||||
p.add_argument("--pronunciation-dict", type=str, default=None, help="Pfad zu einer JSON-Datei mit Aussprache-Substitutionen (Eigenname → Lautschrift).")
|
||||
p.add_argument("--no-normalize-dates", action="store_true", help="Datumsangaben (03.06.2026) nicht in sprechbaren Text umwandeln.")
|
||||
p.add_argument("--no-normalize-times", action="store_true", help="Uhrzeiten nicht in sprechbaren Text umwandeln.")
|
||||
p.add_argument("--no-normalize-years", action="store_true", help="Jahreszahlen nicht in sprechbaren Text umwandeln.")
|
||||
p.add_argument("--no-normalize-units", action="store_true", help="Einheiten nicht in sprechbaren Text umwandeln.")
|
||||
|
|
@ -1241,6 +1310,7 @@ def main() -> int:
|
|||
show_progress=not args.no_progress,
|
||||
spell_uppercase_acronyms=not args.no_spell_acronyms,
|
||||
acronym_mode=acronym_mode,
|
||||
normalize_date_values=not args.no_normalize_dates,
|
||||
normalize_time_values=not args.no_normalize_times,
|
||||
normalize_year_values=not args.no_normalize_years,
|
||||
normalize_units_values=not args.no_normalize_units,
|
||||
|
|
@ -1263,6 +1333,7 @@ def main() -> int:
|
|||
show_progress=not args.no_progress,
|
||||
spell_uppercase_acronyms=not args.no_spell_acronyms,
|
||||
acronym_mode=acronym_mode,
|
||||
normalize_date_values=not args.no_normalize_dates,
|
||||
normalize_time_values=not args.no_normalize_times,
|
||||
normalize_year_values=not args.no_normalize_years,
|
||||
normalize_units_values=not args.no_normalize_units,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue