diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index ce9da44..2527a1c 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -29,8 +29,8 @@ """ import re import sys -from collections.abc import Iterable, Iterator, Mapping, Set -from typing import Any, TypeVar +from collections.abc import Callable, Iterable, Iterator, Mapping, Set +from typing import Any, TypeVar, overload if sys.version_info >= (3, 11): from typing import Self @@ -62,8 +62,14 @@ class SetManager(Set): ''' + _on_change: Callable[[], None] | None + def __init__(self, elements: Iterable[str]) -> None: self.elements = set(elements) + # Optional invalidation hook, wired by an owning Constants so that + # in-place add()/remove() can clear its cached suffixes_prefixes_titles + # union. None when the manager is used standalone. + self._on_change = None def __call__(self) -> Set[str]: return self.elements @@ -93,6 +99,8 @@ def add_with_encoding(self, s: str, encoding: str | None = None) -> None: if isinstance(s, bytes): s = s.decode(encoding) self.elements.add(lc(s)) + if self._on_change: + self._on_change() def add(self, *strings: str) -> Self: """ @@ -112,7 +120,8 @@ def remove(self, *strings: str) -> Self: for s in strings: if (lower := lc(s)) in self.elements: self.elements.remove(lower) - + if self._on_change: + self._on_change() return self @@ -146,6 +155,42 @@ def __getattr__(self, attr: str) -> re.Pattern[str]: return self.get(attr, EMPTY_REGEX) +class _CachedUnionMember: + """Descriptor for the four ``SetManager`` attributes whose union ``Constants`` + caches in ``_pst`` (``prefixes``, ``suffix_acronyms``, ``suffix_not_acronyms``, + ``titles``). + + Assigning a new manager — or mutating one in place via ``add()`` / ``remove()`` + — invalidates that cache. Keeping the behavior on a descriptor scopes it to + exactly these attributes, beside their declarations, rather than spreading it + across a catch-all ``__setattr__`` and a separate attribute-name list. + """ + + _attr: str + + def __set_name__(self, owner: type, name: str) -> None: + self._attr = '_' + name + + @overload + def __get__(self, obj: None, objtype: type | None = None) -> '_CachedUnionMember': ... + @overload + def __get__(self, obj: 'Constants', objtype: type | None = None) -> SetManager: ... + + def __get__(self, obj: 'Constants | None', objtype: type | None = None) -> 'SetManager | _CachedUnionMember': + if obj is None: + return self + return getattr(obj, self._attr) + + def __set__(self, obj: 'Constants', value: SetManager) -> None: + previous = getattr(obj, self._attr, None) + if isinstance(previous, SetManager): + previous._on_change = None # detach the replaced manager so it no longer invalidates + if isinstance(value, SetManager): + value._on_change = obj._invalidate_pst + obj._invalidate_pst() + setattr(obj, self._attr, value) + + class Constants: """ An instance of this class hold all of the configuration constants for the parser. @@ -170,15 +215,14 @@ class Constants: :py:attr:`regexes` wrapped with :py:class:`TupleManager`. """ - prefixes: SetManager - suffix_acronyms: SetManager - suffix_not_acronyms: SetManager - titles: SetManager + prefixes = _CachedUnionMember() + suffix_acronyms = _CachedUnionMember() + suffix_not_acronyms = _CachedUnionMember() + titles = _CachedUnionMember() first_name_titles: SetManager conjunctions: SetManager capitalization_exceptions: TupleManager[str] regexes: RegexTupleManager - _pst: Set[str] | None string_format = "{title} {first} {middle} {last} {suffix} ({nickname})" @@ -262,11 +306,13 @@ def __init__(self, self.conjunctions = SetManager(conjunctions) self.capitalization_exceptions = TupleManager(capitalization_exceptions) self.regexes = RegexTupleManager(regexes) + + def _invalidate_pst(self) -> None: self._pst = None @property def suffixes_prefixes_titles(self) -> Set[str]: - if not self._pst: + if self._pst is None: self._pst = self.prefixes | self.suffix_acronyms | self.suffix_not_acronyms | self.titles return self._pst diff --git a/tests/base.py b/tests/base.py index 25032e2..c60a450 100644 --- a/tests/base.py +++ b/tests/base.py @@ -38,3 +38,9 @@ def assertFalse(self, expr: object, msg: object = None) -> None: def assertIn(self, member: object, container: object, msg: object = None) -> None: assert member in container, msg # type: ignore[operator] + + def assertNotIn(self, member: object, container: object, msg: object = None) -> None: + assert member not in container, msg # type: ignore[operator] + + def assertIs(self, first: object, second: object, msg: object = None) -> None: + assert first is second, msg diff --git a/tests/conftest.py b/tests/conftest.py index 27f19d8..7891b82 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -75,6 +75,3 @@ def empty_attribute_default(request: pytest.FixtureRequest) -> Iterator[str | No setattr(CONSTANTS, attr, value) for attr, value in collection_snapshot.items(): setattr(CONSTANTS, attr, value) - # Invalidate the lazily-built suffixes/prefixes/titles cache so it is - # recomputed from the restored collections rather than a mutated one. - CONSTANTS._pst = None diff --git a/tests/test_constants.py b/tests/test_constants.py index 9f619c3..c33002b 100644 --- a/tests/test_constants.py +++ b/tests/test_constants.py @@ -1,5 +1,5 @@ from nameparser import HumanName -from nameparser.config import Constants +from nameparser.config import Constants, SetManager from tests.base import HumanNameTestBase @@ -104,3 +104,119 @@ def test_add_constant_with_explicit_encoding(self) -> None: c = Constants() c.titles.add_with_encoding(b'b\351ck', encoding='latin_1') self.assertIn('béck', c.titles) + + def test_suffixes_prefixes_titles_reflects_add_title(self) -> None: + """suffixes_prefixes_titles must include titles added after construction.""" + c = Constants() + _ = c.suffixes_prefixes_titles # prime the cache so invalidation is exercised + c.titles.add('emerita') + self.assertIn('emerita', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_add_prefix(self) -> None: + """suffixes_prefixes_titles must include prefixes added after construction.""" + c = Constants() + _ = c.suffixes_prefixes_titles # prime the cache so invalidation is exercised + c.prefixes.add('xpfx') + self.assertIn('xpfx', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_remove_title(self) -> None: + """suffixes_prefixes_titles must not include a word that was only in titles and is then removed.""" + c = Constants() + c.titles.add('emerita') + self.assertIn('emerita', c.suffixes_prefixes_titles) + c.titles.remove('emerita') + self.assertNotIn('emerita', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_remove_prefix(self) -> None: + """suffixes_prefixes_titles must not include a word that was only in prefixes and is then removed.""" + c = Constants() + c.prefixes.add('xpfx') + self.assertIn('xpfx', c.suffixes_prefixes_titles) + c.prefixes.remove('xpfx') + self.assertNotIn('xpfx', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_add_suffix_acronym(self) -> None: + """suffixes_prefixes_titles must include suffix acronyms added after construction.""" + c = Constants() + _ = c.suffixes_prefixes_titles # prime the cache so invalidation is exercised + c.suffix_acronyms.add('xsfx') + self.assertIn('xsfx', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_add_suffix_not_acronym(self) -> None: + """suffixes_prefixes_titles must include non-acronym suffixes added after construction.""" + c = Constants() + _ = c.suffixes_prefixes_titles # prime the cache so invalidation is exercised + c.suffix_not_acronyms.add('xsfx') + self.assertIn('xsfx', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_add_after_initial_read(self) -> None: + """suffixes_prefixes_titles must reflect mutations even after the cache has been primed.""" + c = Constants() + _ = c.suffixes_prefixes_titles # prime the cache + c.titles.add('emerita') + self.assertIn('emerita', c.suffixes_prefixes_titles) + + def test_is_rootname_consistent_with_is_title(self) -> None: + """is_rootname must return False for words recognised by is_title.""" + hn = HumanName("", constants=None) + _ = hn.C.suffixes_prefixes_titles # prime the cache so a stale entry would be observable + hn.C.titles.add('emerita') + self.assertFalse(hn.is_rootname('emerita')) + + def test_is_rootname_consistent_with_is_prefix(self) -> None: + """is_rootname must return False for words recognised by is_prefix.""" + hn = HumanName("", constants=None) + _ = hn.C.suffixes_prefixes_titles # prime the cache so a stale entry would be observable + hn.C.prefixes.add('xpfx') + self.assertFalse(hn.is_rootname('xpfx')) + + def test_suffixes_prefixes_titles_reflects_remove_suffix_acronym(self) -> None: + """suffixes_prefixes_titles must reflect a suffix acronym removed after the cache is primed.""" + c = Constants() + c.suffix_acronyms.add('xsfx') + self.assertIn('xsfx', c.suffixes_prefixes_titles) # primes the cache + c.suffix_acronyms.remove('xsfx') + self.assertNotIn('xsfx', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_remove_suffix_not_acronym(self) -> None: + """suffixes_prefixes_titles must reflect a non-acronym suffix removed after the cache is primed.""" + c = Constants() + c.suffix_not_acronyms.add('xsfx') + self.assertIn('xsfx', c.suffixes_prefixes_titles) # primes the cache + c.suffix_not_acronyms.remove('xsfx') + self.assertNotIn('xsfx', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_add_with_encoding(self) -> None: + """add_with_encoding must invalidate the cache like add()/remove() do.""" + c = Constants() + _ = c.suffixes_prefixes_titles # prime the cache + c.titles.add_with_encoding(b'b\351ck', encoding='latin_1') + self.assertIn('béck', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_replaced_manager(self) -> None: + """Replacing a whole SetManager must invalidate the cache and wire the new manager. + + Covers the config-teardown path where a fresh SetManager is assigned + directly (e.g. ``setattr(CONSTANTS, 'titles', SetManager(...))``). + """ + c = Constants() + _ = c.suffixes_prefixes_titles # prime the cache + c.titles = SetManager(['brandnewtitle']) + # The replacement is reflected immediately... + self.assertIn('brandnewtitle', c.suffixes_prefixes_titles) + # ...and the new manager's own mutations invalidate the cache too, + # proving the on_change callback was re-wired to the replacement. + _ = c.suffixes_prefixes_titles + c.titles.add('secondtitle') + self.assertIn('secondtitle', c.suffixes_prefixes_titles) + + def test_replaced_manager_no_longer_invalidates_cache(self) -> None: + """A SetManager detached by reassignment must not invalidate the new cache.""" + c = Constants() + replaced = c.titles + c.titles = SetManager(['brandnewtitle']) + primed = c.suffixes_prefixes_titles + # Mutating the orphaned manager must leave the live cache untouched. + replaced.add('ghost') + self.assertIs(c.suffixes_prefixes_titles, primed) + self.assertNotIn('ghost', c.suffixes_prefixes_titles)