Source code for bacommon.langstr._core

# Released under the MIT License. See LICENSE for details.
#
"""Core of the language-string runtime model.

See ``docs/initiatives/language-string-context.md`` (ballistica-internal)
for the full design. The main pieces:

* :class:`Lstr` -- a deferred, language-agnostic complex string (an apverid
  + a string name + keyword substitution values, each a flat ``str``/``int``
  or a nested :class:`Lstr`).
* :class:`LanguageStringEncodeContext` -- turns a batch of :class:`Lstr` into
  minimal, language-free encoded chunks plus the ``{pkg_int: apverid}`` map.
* :class:`LanguageStringDecodeContext` -- single-locale; turns an encoded
  chunk back into a flat string via :func:`bacommon.loctext.evaluate`.
* :class:`LanguageStringNameDecodeContext` -- single-locale; resolves an
  in-memory :class:`Lstr` directly by name, with no integer indices.

Error posture is deliberately asymmetric: encoding is the authoring side
(you control the data) so it raises :class:`LangStrError` loudly; decoding
is the consumer side (you receive data) so it is fail-visible -- it returns
an ``LSTR_ERROR:…`` sentinel and logs, never crashing the caller.
"""

from __future__ import annotations  # Docs-generation hack.

import logging
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Annotated

from efro.dataclassio import ioprepped, IOAttrs
from bacommon.loctext import evaluate, LocTextError

if TYPE_CHECKING:
    from bacommon.locale import Locale
    from bacommon.loctext import StringSelector

logger = logging.getLogger(__name__)


class LangStrError(Exception):
    """Raised for a malformed language-string or a bad encode-context call.

    This is the loud, authoring-side error: encoding raises it instead of
    producing a degraded result.
    """
class _DecodeFail(Exception):
    """Internal signal for a structural problem found while decoding.

    The public ``decode`` methods of the decode contexts catch this and
    convert it into the fail-visible ``LSTR_ERROR:…`` sentinel string, so it
    is not meant to reach callers of those methods.
    """
@ioprepped
@dataclass
class Lstr:
    """A deferred, language-agnostic complex string.

    An instance names one string (``name``) inside one exact
    asset-package-version (``apverid``) and carries the keyword
    substitutions (``subs``) needed to render it later in some locale.

    Each ``subs`` value is either flat (``str`` / ``int``) or another
    :class:`Lstr`; a string that takes no arguments simply has empty
    ``subs``. Because every value carries its own ``apverid``, an encode
    context can discover the full set of referenced packages by walking
    the values themselves, and ``name`` is only mapped to an integer index
    at encode time.

    The class is ``@ioprepped`` so it can be sent directly on the wire (the
    name-based docui-v2 form). Flat ``str``/``int`` subs serialize
    directly; nested-:class:`Lstr` subs are exercised by the in-memory
    encode / name-decode paths but are not yet directly JSON-serializable
    here (they graduate with the integer-indexed :data:`EncodedLstr` form).
    """

    #: Exact asset-package-version id the string belongs to.
    apverid: str

    #: Logical name of the string within that package.
    name: str

    #: Substitution keyword -> value (flat ``str``/``int`` or nested
    #: :class:`Lstr`). A fresh empty dict per instance by default.
    subs: dict = field(default_factory=dict)
#: A substitution value: a flat string/number, or a nested language-string. type LstrSub = str | int | Lstr #: An encoded chunk: ``[pkg_int, str_int, sub0, sub1, …]`` where each sub is #: a flat ``str``/``int`` or a nested chunk (a list). Plain JSON -- the #: flat-vs-nested distinction is "str/int vs list". type EncodedLstr = list[str | int | 'EncodedLstr']
@dataclass(frozen=True)
class StringDef:
    """Language-free definition of a single string.

    ``params`` lists the string's substitutions as ordered
    ``(keyword, kind)`` pairs. ``kind`` is ``'text'`` for a text
    substitution (``str | Lstr``) or ``'count'`` for the plural pivot
    (``int``). A string without arguments uses the empty tuple. The
    canonical (sorted-keyword) ordering of these pairs is what fixes the
    positional substitution order.
    """

    #: Path (logical name) of the string.
    path: str

    #: Ordered ``(keyword, kind)`` pairs; ``()`` for a no-arg string.
    params: tuple[tuple[str, str], ...] = ()
@dataclass(frozen=True)
class PackageDef:
    """Language-free definition of the strings in one asset-package-version.

    This is the common source that both the encode/decode
    :class:`PackageStructure` and the type-safe wrapper codegen derive
    from. In the real system it comes from an apverid's resolved listing;
    in tests it is hand-built.
    """

    #: Exact asset-package-version id being described.
    apverid: str

    #: Definitions of every string in the package.
    strings: tuple[StringDef, ...]
class PackageStructure:
    """Language-free structure of one asset-package-version.

    Maps string names <-> integer indices (assigned in canonical
    sorted-name order so both ends agree without shipping the mapping) and
    holds each string's ordered substitution-keyword list. Carries no
    translations -- encoding needs only this.
    """

    @classmethod
    def from_def(cls, pkgdef: PackageDef) -> 'PackageStructure':
        """Build the encode/decode structure from a package definition.

        Only the keyword of each ``(keyword, kind)`` param pair is kept;
        the kind is not needed for encoding or decoding.
        """
        return cls(
            pkgdef.apverid,
            {
                sdef.path: tuple(name for name, _kind in sdef.params)
                for sdef in pkgdef.strings
            },
        )

    def __init__(
        self, apverid: str, strings: dict[str, tuple[str, ...]]
    ) -> None:
        """Create a structure for ``apverid``.

        ``strings`` maps each logical name to its ordered substitution
        keywords (``()`` for a no-arg string).
        """
        self.apverid = apverid
        # Indices are positions in the sorted name list, so any two ends
        # holding the same name set derive the same numbering.
        self._names: tuple[str, ...] = tuple(sorted(strings))
        self._index: dict[str, int] = {
            name: i for i, name in enumerate(self._names)
        }
        # Copied so later mutation of the caller's dict cannot affect us.
        self._params: dict[str, tuple[str, ...]] = dict(strings)

    def index_of(self, name: str) -> int:
        """Return the integer index for a string name.

        Raises :class:`KeyError` if ``name`` is not in this package.
        """
        return self._index[name]

    def name_of(self, index: int) -> str:
        """Return the string name for an integer index.

        Raises :class:`IndexError` if ``index`` is negative or past the
        end. Negative values are rejected explicitly: plain tuple indexing
        would wrap around and silently return an unrelated name.
        """
        if index < 0:
            raise IndexError(f'string index {index} out of range')
        return self._names[index]

    def params_of(self, name: str) -> tuple[str, ...]:
        """Return the ordered substitution keywords for a string name.

        Raises :class:`KeyError` if ``name`` is not in this package.
        """
        return self._params[name]
class LanguageStringEncodeContext:
    """Encodes :class:`Lstr` values into minimal language-free chunks.

    The context is built from the batch of values that will be sent. It
    walks them (including nested values, which carry their own apverid) to
    find every apverid referenced and gives each one a stable integer
    index. :meth:`encode` then produces ``[pkg_int, str_int, …subs]``
    chunks, and :attr:`package_index_map` is the only mapping a consumer
    needs alongside them (string indices resolve from the content-pinned
    apverid itself).
    """

    def __init__(
        self,
        lstrs: list[Lstr],
        structures: dict[str, PackageStructure],
    ) -> None:
        self._structures = structures

        referenced: set[str] = set()
        for root in lstrs:
            self._collect(root, referenced)

        # Number packages in sorted order so the same apverid set always
        # yields the same indices.
        self._pkg_index = dict(
            (apverid, idx) for idx, apverid in enumerate(sorted(referenced))
        )

    def _collect(self, lstr: Lstr, acc: set[str]) -> None:
        """Add the apverid of ``lstr`` and of all nested values to ``acc``."""
        acc.add(lstr.apverid)
        nested = (sub for sub in lstr.subs.values() if isinstance(sub, Lstr))
        for child in nested:
            self._collect(child, acc)

    @property
    def package_index_map(self) -> dict[int, str]:
        """The ``{pkg_int: apverid}`` map a decoder needs."""
        return dict(zip(self._pkg_index.values(), self._pkg_index.keys()))

    def encode(self, lstr: Lstr) -> EncodedLstr:
        """Encode one value (recursively) into a minimal chunk.

        Raises :class:`LangStrError` when the value's apverid is not part
        of this context, when its string name is unknown to the package
        structure, or when a substitution the string requires is missing
        from ``lstr.subs``.
        """
        pkg_int = self._pkg_index.get(lstr.apverid)
        struct = self._structures.get(lstr.apverid)
        if pkg_int is None or struct is None:
            raise LangStrError(
                f'apverid {lstr.apverid!r} is not in this encode context'
            )

        try:
            str_int = struct.index_of(lstr.name)
            params = struct.params_of(lstr.name)
        except KeyError as exc:
            raise LangStrError(
                f'unknown string {lstr.name!r} in {lstr.apverid}'
            ) from exc

        # Substitutions are emitted positionally, in the structure's
        # keyword order; the keywords themselves are not sent.
        chunk: list[str | int | EncodedLstr] = [pkg_int, str_int]
        for param in params:
            if param not in lstr.subs:
                raise LangStrError(
                    f'missing substitution {param!r} for {lstr.name!r}'
                )
            value = lstr.subs[param]
            if isinstance(value, Lstr):
                chunk.append(self.encode(value))
            else:
                chunk.append(value)
        return chunk
class LanguageStringDecodeContext:
    """Decodes chunks into flat strings for one target locale.

    Holds the ``{pkg_int: apverid}`` map (from the encoder), the package
    structures, and the per-apverid string values **for a single locale**.
    :meth:`decode` resolves a chunk (recursively rendering nested values)
    via :func:`bacommon.loctext.evaluate`.
    """

    def __init__(
        self,
        package_index_map: dict[int, str],
        structures: dict[str, PackageStructure],
        language: dict[str, dict[str, str | StringSelector]],
        locale: Locale,
    ) -> None:
        #: ``language`` maps apverid -> {string-name: value} for ``locale``.
        self._pkg_map = package_index_map
        self._structures = structures
        self._language = language
        self._locale = locale

    def decode(self, encoded: EncodedLstr) -> str:
        """Resolve a chunk to a flat string in this context's locale.

        Fail-visible: any structural problem yields an ``LSTR_ERROR:…``
        sentinel (and a logged warning) rather than crashing the caller.
        """
        try:
            return self._decode(encoded)
        except _DecodeFail as exc:
            logger.warning('langstr decode: %s', exc)
            return f'LSTR_ERROR:{exc}'

    def _decode(self, encoded: EncodedLstr) -> str:
        """Resolve a chunk, raising ``_DecodeFail`` on structural problems."""
        # Chunks are received data, so check the container type before
        # touching it: ``len()`` / indexing on a non-sequence (None, a bare
        # int, a dict, ...) would otherwise raise an exception that
        # ``decode`` does not catch.
        if not isinstance(encoded, (list, tuple)) or len(encoded) < 2:
            raise _DecodeFail(f'malformed chunk {encoded!r}')

        pkg_int = encoded[0]
        str_int = encoded[1]
        if not isinstance(pkg_int, int) or not isinstance(str_int, int):
            raise _DecodeFail(f'non-int index in {encoded!r}')

        apverid = self._pkg_map.get(pkg_int)
        if (
            apverid is None
            or apverid not in self._structures
            or apverid not in self._language
        ):
            raise _DecodeFail(f'unknown package index {pkg_int}')

        struct = self._structures[apverid]
        try:
            # A negative index must be treated as unknown; sequence
            # indexing would otherwise wrap around and resolve it to some
            # unrelated string.
            if str_int < 0:
                raise IndexError(str_int)
            name = struct.name_of(str_int)
            params = struct.params_of(name)
        except (IndexError, KeyError):
            raise _DecodeFail(
                f'unknown string index {str_int} in {apverid}'
            ) from None

        values = self._language[apverid]
        if name not in values:
            raise _DecodeFail(f'no value for {name!r} in {apverid}')

        # Substitutions are positional and must match the structure's
        # keyword list one-to-one.
        subs = encoded[2:]
        if len(subs) != len(params):
            raise _DecodeFail(
                f'arity mismatch for {name!r}: {len(subs)} != {len(params)}'
            )

        kwargs: dict[str, str | int] = {}
        for param, sub in zip(params, subs):
            # A nested chunk (list) renders recursively to a flat string.
            # It goes through the fail-visible ``decode`` wrapper, so a
            # broken nested chunk is logged and its ``LSTR_ERROR:…``
            # sentinel becomes the substitution value; the outer chunk is
            # still evaluated.
            kwargs[param] = self.decode(sub) if isinstance(sub, list) else sub

        try:
            return evaluate(values[name], self._locale, **kwargs)
        except LocTextError as exc:
            raise _DecodeFail(f'eval failed for {name!r}: {exc}') from exc
class LanguageStringNameDecodeContext:
    """Decodes :class:`Lstr` values directly, by name, for one locale.

    This is the name-based sibling of :class:`LanguageStringDecodeContext`.
    It takes an in-memory :class:`Lstr` (with its ``apverid``, string
    ``name`` and keyword ``subs``) and resolves it straight against the
    per-apverid, per-locale values. No integer indices, package-index map
    or :class:`PackageStructure` are involved, since the subs are
    self-describing keyword->value pairs.

    This is the client's primary path: resolve the referenced packages,
    gather their per-locale values, then decode each :class:`Lstr` in the
    client's locale.

    Fail-visible like :class:`LanguageStringDecodeContext` -- any
    structural problem yields an ``LSTR_ERROR:…`` sentinel (and a logged
    warning) rather than crashing the caller.
    """

    def __init__(
        self,
        language: dict[str, dict[str, str | StringSelector]],
        locale: Locale,
    ) -> None:
        #: ``language`` maps apverid -> {string-name: value} for ``locale``.
        self._locale = locale
        self._language = language

    def decode(self, lstr: Lstr) -> str:
        """Resolve an :class:`Lstr` to a flat string in this context's locale.

        Fail-visible: any structural problem yields an ``LSTR_ERROR:…``
        sentinel (and a logged warning) rather than crashing the caller.
        """
        try:
            rendered = self._decode(lstr)
        except _DecodeFail as exc:
            logger.warning('langstr name-decode: %s', exc)
            return f'LSTR_ERROR:{exc}'
        return rendered

    def _decode(self, lstr: Lstr) -> str:
        """Resolve ``lstr``, raising ``_DecodeFail`` on structural problems."""
        values = self._language.get(lstr.apverid)
        if values is None:
            raise _DecodeFail(f'no values for package {lstr.apverid!r}')

        value = values.get(lstr.name)
        if value is None:
            raise _DecodeFail(f'no value for {lstr.name!r} in {lstr.apverid}')

        # Nested values are rendered first (via ``_decode``, so a failure
        # inside one propagates and fails the whole string); flat values
        # pass through unchanged.
        kwargs: dict[str, str | int] = {
            key: (self._decode(sub) if isinstance(sub, Lstr) else sub)
            for key, sub in lstr.subs.items()
        }

        try:
            rendered = evaluate(value, self._locale, **kwargs)
        except LocTextError as exc:
            raise _DecodeFail(f'eval failed for {lstr.name!r}: {exc}') from exc
        return rendered
# Docs-generation hack; import some stuff that we likely only forward-declared # in our actual source code so that docs tools can find it. from typing import (Coroutine, Any, Literal, Callable, Generator, Awaitable, Sequence, Self) import asyncio from concurrent.futures import Future from pathlib import Path from enum import Enum