Source code for bacommon.loctext

# Released under the MIT License. See LICENSE for details.
#
"""Localized-text evaluation (structured plural/select + substitutions).

The value delivered for a locale is either a plain ``str`` or a
:class:`StringSelector`. :func:`evaluate` resolves either against a locale
and caller-supplied arguments to a final ``str``:

* A plain ``str`` has its ``{name}`` placeholders replaced by
  ``str(args['name'])``.
* A :class:`StringSelector` picks one *leaf* string at render time -- by
  CLDR plural category of an integer arg (``kind=PLURAL``) or by the string
  value of an arg (``kind=SELECT``) -- then substitutes ``{name}`` and (for
  plurals) ``#`` (the count) in the chosen leaf.

We use our own small structured form rather than parsing a message-format
string: the runtime decisions are few and closed, the data is
self-describing (it can only express what we support), and a struct is far
safer to port than a hand-rolled string parser. This is the prototype of
the native ``Lstr2`` runtime -- keeping it in ``bacommon`` gives pure-Python
consumers (the client's embedded Python, server-side resolution in
bamaster/basn) the same evaluator, and the :class:`StringSelector`
``IOAttrs`` keys are the on-the-wire schema the C++ port implements.

Scope: cardinal plural selection for non-negative integers, string
``select``, ``{name}`` substitution, and ``#`` for the plural count.
``=N`` keys match an exact count before the category rule. Not (yet):
ordinals/selectordinal, decimal operands, plural offsets, nested selectors.
"""

from __future__ import annotations  # Docs-generation hack.

import re
from enum import Enum
from dataclasses import dataclass
from typing import TYPE_CHECKING, Annotated

from efro.dataclassio import ioprepped, IOAttrs
from bacommon.locale import Locale

if TYPE_CHECKING:
    from collections.abc import Callable, Mapping

#: Matches one ``{name}`` substitution placeholder and captures the name.
#: A name is lowercase snake_case: a lowercase ASCII letter followed by any
#: run of lowercase ASCII letters, digits, or underscores. Other brace
#: contents (``{Name}``, ``{1x}``, ``{}``) do not match this pattern.
_SUB_RE = re.compile(r'\{([a-z][a-z0-9_]*)\}')


class PluralCategory(Enum):
    """One of the six CLDR plural categories.

    Each member's value is the lowercase CLDR keyword for the category.
    Members are declared in the canonical CLDR order
    (zero, one, two, few, many, other).
    """

    #: The ``zero`` category.
    ZERO = 'zero'
    #: The ``one`` category.
    ONE = 'one'
    #: The ``two`` category.
    TWO = 'two'
    #: The ``few`` category.
    FEW = 'few'
    #: The ``many`` category.
    MANY = 'many'
    #: The ``other`` category.
    OTHER = 'other'
# --- CLDR cardinal plural rules (integer-only) -----------------------------
#
# Scope: cardinal selection for NON-NEGATIVE INTEGER counts ("3 boxes",
# "1 kill") -- the only case the string system needs. For an integer every
# CLDR fraction operand (v/w/f/t) is zero, so each rule collapses to a
# function of n alone (using n % 10 / n % 100). Decimals, ordinals
# (``selectordinal``), and compact/scientific notation are intentionally
# unsupported.
#
# These compact functions are cross-checked against ICU's authoritative
# ``PluralRules`` for every shipped locale by the producer-side
# ``test_loctext_cldr`` test (where ``pyicu`` lives) -- so this table is
# provably CLDR-correct for integers without carrying any CLDR data or a
# rule-expression parser at runtime, and it ports trivially to the native
# ``Lstr2``.
#
# One deliberate gap, bounded above 1,000,000: a handful of West-European
# locales (es/it/pt/fr/vec) put EXACT millions in a distinct ``many`` form
# (e.g. Spanish "un millón DE puntos"). We don't model that -- counts at or
# above 1,000,000 in those locales get ``other`` where CLDR says ``many``.
# Adding it later is a one-line ``n % 1_000_000 == 0`` clause plus widening
# the cross-check range; it's omitted now since game counts stay well below
# a million and it would force an extra translated form for those locales.


def _rule_other_only(n: int) -> PluralCategory:
    """Return ``other`` for every count (zh, ja, ko, th, vi, id, ms).

    These locales make no count distinction.
    """
    del n  # Unused; kept so every rule shares one signature.
    return PluralCategory.OTHER


def _rule_one_other(n: int) -> PluralCategory:
    """Return ``one`` for exactly 1 and ``other`` for anything else.

    The common Germanic/Romance rule (en, de, nl, da, sv, el, hu, tr, ta,
    kk, eo, ...). Also covers es/it/vec below the deferred
    millions-``many`` threshold (see the note at the top of this section).
    """
    if n == 1:
        return PluralCategory.ONE
    return PluralCategory.OTHER


def _rule_zero_one_other(n: int) -> PluralCategory:
    """Return ``one`` for 0 and 1, ``other`` otherwise (fr, pt, fa, hi).

    (fr/pt also have the deferred millions-``many``; below that threshold
    they match this rule.)
    """
    if n == 0 or n == 1:
        return PluralCategory.ONE
    return PluralCategory.OTHER


def _rule_russian(n: int) -> PluralCategory:
    """East-Slavic rule (ru, uk, be): one/few/many."""
    last_digit = n % 10
    last_two = n % 100
    # 1, 21, 31, ... but not 11, 111, ...
    if last_digit == 1 and last_two != 11:
        return PluralCategory.ONE
    # 2-4, 22-24, ... but not 12-14, 112-114, ...
    in_teens = 12 <= last_two <= 14
    if 2 <= last_digit <= 4 and not in_teens:
        return PluralCategory.FEW
    return PluralCategory.MANY


def _rule_polish(n: int) -> PluralCategory:
    """Polish rule (pl): one/few/many."""
    # Unlike East-Slavic, only the exact count 1 is ``one``.
    if n == 1:
        return PluralCategory.ONE
    last_digit = n % 10
    last_two = n % 100
    in_teens = 12 <= last_two <= 14
    if 2 <= last_digit <= 4 and not in_teens:
        return PluralCategory.FEW
    return PluralCategory.MANY


def _rule_czech(n: int) -> PluralCategory:
    """West-Slavic rule (cs, sk): one/few/other for integers.

    (CLDR's ``many`` for cs/sk only occurs for decimals, so integers never
    select it.)
    """
    if n == 1:
        return PluralCategory.ONE
    if 2 <= n <= 4:
        return PluralCategory.FEW
    return PluralCategory.OTHER


def _rule_croatian(n: int) -> PluralCategory:
    """South-Slavic rule (hr, sr): one/few/other for integers.

    Like East-Slavic but ``many`` (decimal-only here) folds into ``other``.
    """
    last_digit = n % 10
    last_two = n % 100
    if last_digit == 1 and last_two != 11:
        return PluralCategory.ONE
    in_teens = 12 <= last_two <= 14
    if 2 <= last_digit <= 4 and not in_teens:
        return PluralCategory.FEW
    return PluralCategory.OTHER


def _rule_romanian(n: int) -> PluralCategory:
    """Romanian rule (ro): one/few/other."""
    if n == 1:
        return PluralCategory.ONE
    if n == 0:
        return PluralCategory.FEW
    if 1 <= n % 100 <= 19:
        return PluralCategory.FEW
    return PluralCategory.OTHER


def _rule_arabic(n: int) -> PluralCategory:
    """Arabic rule (ar): zero/one/two/few/many/other."""
    # The three smallest counts each have a dedicated category.
    if n == 0:
        return PluralCategory.ZERO
    if n == 1:
        return PluralCategory.ONE
    if n == 2:
        return PluralCategory.TWO
    # Everything else is decided by the last two digits.
    last_two = n % 100
    if 3 <= last_two <= 10:
        return PluralCategory.FEW
    if 11 <= last_two <= 99:
        return PluralCategory.MANY
    return PluralCategory.OTHER


def _rule_filipino(n: int) -> PluralCategory:
    """Filipino/Tagalog rule (fil): ``other`` only for integers ending 4/6/9."""
    if n % 10 in (4, 6, 9):
        return PluralCategory.OTHER
    return PluralCategory.ONE


# Total ``Locale -> rule`` map over every resolved (canonical) locale we
# ship; the completeness + correctness of this table is enforced by the
# producer-side ``test_loctext_cldr`` test. Obsolete aliases resolve to
# their canonical form before lookup (see ``plural_category``). Entries are
# grouped by rule family for readability.
_RULE_FOR_LOCALE: dict[Locale, Callable[[int], PluralCategory]] = {
    # Always ``other`` (no count distinction).
    Locale.CHINESE_TRADITIONAL: _rule_other_only,
    Locale.CHINESE_SIMPLIFIED: _rule_other_only,
    Locale.JAPANESE: _rule_other_only,
    Locale.KOREAN: _rule_other_only,
    Locale.THAI: _rule_other_only,
    Locale.VIETNAMESE: _rule_other_only,
    Locale.INDONESIAN: _rule_other_only,
    Locale.MALAY: _rule_other_only,
    # ``one`` for n == 1, else ``other``.
    Locale.ENGLISH: _rule_one_other,
    Locale.GERMAN: _rule_one_other,
    Locale.DUTCH: _rule_one_other,
    Locale.DANISH: _rule_one_other,
    Locale.SWEDISH: _rule_one_other,
    Locale.GREEK: _rule_one_other,
    Locale.HUNGARIAN: _rule_one_other,
    Locale.TURKISH: _rule_one_other,
    Locale.ESPERANTO: _rule_one_other,
    Locale.TAMIL: _rule_one_other,
    Locale.KAZAKH: _rule_one_other,
    Locale.SPANISH_SPAIN: _rule_one_other,
    Locale.SPANISH_LATIN_AMERICA: _rule_one_other,
    Locale.ITALIAN: _rule_one_other,
    Locale.VENETIAN: _rule_one_other,
    # European Portuguese is one-for-1 (unlike Brazilian, which is 0,1).
    Locale.PORTUGUESE_PORTUGAL: _rule_one_other,
    # Novelty/English-derived locales with no CLDR entry -> English's rule.
    Locale.PIRATE_SPEAK: _rule_one_other,
    Locale.GIBBERISH: _rule_one_other,
    # ``one`` for 0 and 1, else ``other``.
    Locale.FRENCH: _rule_zero_one_other,
    Locale.PORTUGUESE_BRAZIL: _rule_zero_one_other,
    Locale.PERSIAN: _rule_zero_one_other,
    Locale.HINDI: _rule_zero_one_other,
    # East-Slavic one/few/many.
    Locale.RUSSIAN: _rule_russian,
    Locale.UKRAINIAN: _rule_russian,
    Locale.BELARUSSIAN: _rule_russian,
    # Polish one/few/many.
    Locale.POLISH: _rule_polish,
    # West-Slavic one/few/other.
    Locale.CZECH: _rule_czech,
    Locale.SLOVAK: _rule_czech,
    # South-Slavic one/few/other.
    Locale.CROATIAN: _rule_croatian,
    Locale.SERBIAN: _rule_croatian,
    # Romanian one/few/other.
    Locale.ROMANIAN: _rule_romanian,
    # Arabic zero/one/two/few/many/other.
    Locale.ARABIC: _rule_arabic,
    # Filipino.
    Locale.FILIPINO: _rule_filipino,
}
def plural_category(locale: Locale, n: int) -> PluralCategory:
    """Return the CLDR cardinal plural category for an integer in a locale.

    The rule is looked up under ``locale.resolved.locale`` so that obsolete
    aliases (e.g. the bare ``SPANISH``) follow their modern locale's rule.
    Every shipped locale is mapped explicitly; a locale missing from the
    table falls back to the ``one``-for-1 rule.

    ``n`` is treated as a non-negative integer count: the rule receives
    ``abs(n)``. Only integer counts are in scope.
    """
    canonical = locale.resolved.locale
    try:
        rule = _RULE_FOR_LOCALE[canonical]
    except KeyError:
        # Genuinely unknown locale: use the English-style rule.
        rule = _rule_one_other
    return rule(abs(n))
#: Canonical CLDR category order, for stable presentation. _CATEGORY_ORDER = [ PluralCategory.ZERO, PluralCategory.ONE, PluralCategory.TWO, PluralCategory.FEW, PluralCategory.MANY, PluralCategory.OTHER, ]
def required_plural_categories(locale: Locale) -> list[PluralCategory]:
    """List the plural categories a ``plural`` message must cover.

    The result is every category the locale's rule produces for the
    integers 0..200, plus the mandatory ``other`` fallback (ICU requires
    it), returned in canonical CLDR order.

    This is used to tell a translation model exactly which plural forms to
    emit for the target locale. It is derived from the same rules
    :func:`plural_category` evaluates with, so producer and client agree.

    The 0..200 sample spans every modulo cycle the rules use (``% 10`` and
    ``% 100``); under the module's integer-only scope this is the exact set
    of categories the locale can select.
    """
    # ``other`` is always required, even if the rule never selects it.
    reachable = {PluralCategory.OTHER}
    reachable.update(plural_category(locale, count) for count in range(201))
    return [category for category in _CATEGORY_ORDER if category in reachable]
# --- Structured selectors + evaluation -------------------------------------
class LocTextError(Exception):
    """Raised for a malformed localized value or a missing/bad argument.

    Covers both problems in the data (for example a selector with no usable
    form) and problems in the caller-supplied arguments (a placeholder or
    selector argument that is absent or of the wrong type).
    """
class SelectorKind(Enum):
    """The strategy a :class:`StringSelector` uses to pick one of its forms.

    Member values are short single-letter codes.
    """

    #: Pick the form by the CLDR plural category of an integer argument.
    PLURAL = 'p'

    #: Pick the form by the string value of an argument (e.g. gender).
    SELECT = 's'
@ioprepped
@dataclass
class StringSelector:
    """A localized string whose final form is chosen at render time.

    The structured alternative to a plain ``str`` value: ``forms`` maps a
    form key to its leaf text (which may carry ``{name}`` substitutions
    and, for plurals, ``#`` for the count).

    For ``PLURAL`` the keys are CLDR category names
    (``one``/``few``/…/``other``) or ``=N`` exact-count matches; for
    ``SELECT`` they are the possible string values of ``arg``. ``other``
    is the fallback.

    The ``IOAttrs`` keys are the on-the-wire schema shared with the native
    ``Lstr2`` port.
    """

    # NOTE(review): the docstring refers to ``IOAttrs`` storage keys, but
    # none are declared on the fields as shown here -- confirm against the
    # serialized schema before changing or renaming any field.

    #: How the form is chosen (plural category vs. string value).
    kind: SelectorKind
    #: Name of the caller-supplied argument that drives the choice.
    arg: str
    #: Form key -> leaf text.
    forms: dict[str, str]
def evaluate(
    value: 'str | StringSelector', locale: Locale, **args: object
) -> str:
    """Resolve a localized ``value`` to its final string.

    ``value`` is either a plain ``str``, which only has its ``{name}``
    placeholders substituted, or a :class:`StringSelector`, which first
    picks a form (plural/select) and then substitutes into it.

    ``args`` supplies the named arguments as keywords. Because they are
    collected via ``**args``, the names ``value`` and ``locale`` are taken
    by this function's own parameters and cannot be used as argument names.
    Plural selection uses ``locale``'s CLDR rule (see
    :func:`plural_category`).

    Raises :class:`LocTextError` on a missing/ill-typed argument or a
    selector with no matching form and no ``other``.
    """
    # Plain strings need substitution only; ``#`` has no meaning for them.
    if not isinstance(value, StringSelector):
        return _substitute(value, args, pound=None)
    return _eval_selector(value, locale, args)
def _substitute(
    text: str, args: 'Mapping[str, object]', pound: int | None
) -> str:
    """Expand ``{name}`` placeholders (and ``#`` when ``pound`` is set).

    Args:
        text: The leaf template.
        args: Named argument values; each is rendered with ``str()``.
        pound: The plural count to substitute for every ``#``, or ``None``
            to leave ``#`` characters alone.

    Returns:
        ``text`` with all recognized placeholders expanded.

    Raises:
        LocTextError: A ``{name}`` placeholder has no entry in ``args``.
    """
    # ``#`` first, on the template, so a substituted value that happens to
    # contain ``#`` is left untouched.
    if pound is not None:
        text = text.replace('#', str(pound))

    def _repl(match: 're.Match[str]') -> str:
        name = match.group(1)
        if name not in args:
            raise LocTextError(f'Missing argument {name!r}.')
        return str(args[name])

    return _SUB_RE.sub(_repl, text)


def _eval_selector(
    sel: StringSelector, locale: Locale, args: 'Mapping[str, object]'
) -> str:
    """Pick ``sel``'s form for the args, then substitute its leaf text.

    Args:
        sel: The selector to resolve.
        locale: Locale whose plural rule is used for ``PLURAL`` selectors.
        args: Named argument values; must contain ``sel.arg``.

    Returns:
        The chosen leaf with ``{name}`` (and, for plurals, ``#``) expanded.

    Raises:
        LocTextError: ``sel.arg`` is missing; a plural argument is not a
            number or cannot be converted to an integer (including
            non-finite floats); or no form matches and there is no
            ``other`` form.
    """
    if sel.arg not in args:
        raise LocTextError(f'Missing argument {sel.arg!r}.')
    raw = args[sel.arg]

    if sel.kind is SelectorKind.PLURAL:
        # ``bool`` is an ``int`` subclass but never a meaningful count.
        if isinstance(raw, bool) or not isinstance(raw, (int, float, str)):
            raise LocTextError(
                f'Plural argument {sel.arg!r} must be a number; got {raw!r}.'
            )
        try:
            # Note: ``int()`` truncates a float toward zero.
            value = int(raw)
        except (ValueError, OverflowError) as exc:
            # ValueError: non-numeric string or NaN. OverflowError:
            # float infinity. Both are reported as LocTextError so callers
            # only have one exception type to handle.
            raise LocTextError(
                f'Plural argument {sel.arg!r} must be an integer;'
                f' got {raw!r}.'
            ) from exc

        # Exact ``=N`` matches win over the category rule.
        form = sel.forms.get(f'={value}')
        if form is None:
            category = plural_category(locale, value)
            # NOTE(review): ``or`` means an empty-string category form is
            # treated like a missing one and falls through to ``other``
            # (unlike the ``=N`` lookup above) -- confirm this is intended.
            form = sel.forms.get(category.value) or sel.forms.get('other')
        if form is None:
            raise LocTextError(
                f'Plural for {sel.arg!r} has no matching form and no'
                " 'other'."
            )
        return _substitute(form, args, pound=value)

    # SELECT: key by the argument's string value.
    key = str(raw)
    # NOTE(review): as above, an empty-string form falls through to
    # ``other`` because of the truthiness test.
    form = sel.forms.get(key) or sel.forms.get('other')
    if form is None:
        raise LocTextError(
            f'Select for {sel.arg!r} has no matching key {key!r} and no'
            " 'other'."
        )
    return _substitute(form, args, pound=None)


# Docs-generation hack; import some stuff that we likely only forward-declared
# in our actual source code so that docs tools can find it.
from typing import (Coroutine, Any, Literal, Callable, Generator, Awaitable, Sequence, Self) import asyncio from concurrent.futures import Future from pathlib import Path from enum import Enum