From e75e928cbb8c993f4f185e038038e54a39bab0b7 Mon Sep 17 00:00:00 2001 From: Vincent Gao Date: Thu, 25 Jun 2026 14:04:08 +0200 Subject: [PATCH 1/5] fix: suffixes_prefixes_titles always reflects current set state The `suffixes_prefixes_titles` property on `Constants` cached its result in `_pst` after the first access. Any subsequent `add()` or `remove()` call on `titles`, `prefixes`, `suffix_acronyms`, or `suffix_not_acronyms` was silently ignored by the cache, so `is_rootname()` kept returning the stale result. Concretely, a word added to `C.titles` after the property was first accessed would still be treated as a rootname (first/middle/last) by `join_on_conjunctions`, even though `is_title()` correctly returned `True` for it. This contradicts the documented behaviour of per-instance config customisation described in AGENTS.md. Fix: drop the `_pst` cache entirely and compute the union fresh on every access. The four-set union is cheap and the simplest correct approach. Add five tests that assert the property and `is_rootname` stay consistent with the live sets after `add()`/`remove()` calls. --- nameparser/config/__init__.py | 6 +----- tests/test_constants.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index ce9da44..624ef6b 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -179,7 +179,6 @@ class Constants: capitalization_exceptions: TupleManager[str] regexes: RegexTupleManager - _pst: Set[str] | None string_format = "{title} {first} {middle} {last} {suffix} ({nickname})" """ @@ -262,13 +261,10 @@ def __init__(self, self.conjunctions = SetManager(conjunctions) self.capitalization_exceptions = TupleManager(capitalization_exceptions) self.regexes = RegexTupleManager(regexes) - self._pst = None @property def suffixes_prefixes_titles(self) -> Set[str]: - if not self._pst: - self._pst = self.prefixes | self.suffix_acronyms | self.suffix_not_acronyms | self.titles - return self._pst + return self.prefixes | self.suffix_acronyms | self.suffix_not_acronyms | self.titles def __repr__(self) -> str: return "" diff --git a/tests/test_constants.py b/tests/test_constants.py index 9f619c3..ca13dc6 100644 --- a/tests/test_constants.py +++ b/tests/test_constants.py @@ -104,3 +104,35 @@ def test_add_constant_with_explicit_encoding(self) -> None: c = Constants() c.titles.add_with_encoding(b'b\351ck', encoding='latin_1') self.assertIn('béck', c.titles) + + def test_suffixes_prefixes_titles_reflects_add_title(self) -> None: + """suffixes_prefixes_titles must include titles added after construction.""" + c = Constants() + c.titles.add('emerita') + self.assertIn('emerita', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_add_prefix(self) -> None: + """suffixes_prefixes_titles must include prefixes added after construction.""" + c = Constants() + c.prefixes.add('xpfx') + self.assertIn('xpfx', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_remove_title(self) -> None: + """suffixes_prefixes_titles must not include a word that was only in titles and is then removed.""" + c = Constants() + c.titles.add('emerita') + self.assertIn('emerita', c.suffixes_prefixes_titles) + c.titles.remove('emerita') + self.assertFalse('emerita' in c.suffixes_prefixes_titles) + + def test_is_rootname_consistent_with_is_title(self) -> None: + """is_rootname must return False for words recognised by is_title.""" + hn = HumanName("", constants=None) + hn.C.titles.add('emerita') + self.assertFalse(hn.is_rootname('emerita')) + + def test_is_rootname_consistent_with_is_prefix(self) -> None: + """is_rootname must return False for words recognised by is_prefix.""" + hn = HumanName("", constants=None) + hn.C.prefixes.add('xpfx') + self.assertFalse(hn.is_rootname('xpfx')) From f69ecee0fa0fccd91eaf65088693ad3a06a82369 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Sat, 27 Jun 2026 12:52:32 -0700 Subject: [PATCH 2/5] perf: restore _pst cache with proper invalidation via callbacks and __setattr__ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The original cache was dropped to fix staleness, but recomputing the set union on every call is ~1000x slower. This restores the cache with a correct invalidation strategy: SetManager fires an on_change callback after add/remove, and Constants.__setattr__ clears _pst and re-wires the callback whenever one of the four contributing attrs is replaced (covering both user mutations and conftest teardown restores). The conftest manual `CONSTANTS._pst = None` reset is removed — it is now handled automatically by __setattr__ during collection restore. Co-Authored-By: Claude Sonnet 4.6 --- nameparser/config/__init__.py | 29 ++++++++++++++++++++++++----- tests/conftest.py | 3 --- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index 624ef6b..e1060cf 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -29,7 +29,7 @@ """ import re import sys -from collections.abc import Iterable, Iterator, Mapping, Set +from collections.abc import Callable, Iterable, Iterator, Mapping, Set from typing import Any, TypeVar if sys.version_info >= (3, 11): @@ -62,8 +62,9 @@ class SetManager(Set): ''' - def __init__(self, elements: Iterable[str]) -> None: + def __init__(self, elements: Iterable[str], on_change: Callable[[], None] | None = None) -> None: self.elements = set(elements) + self._on_change = on_change def __call__(self) -> Set[str]: return self.elements @@ -93,6 +94,8 @@ def add_with_encoding(self, s: str, encoding: str | None = None) -> None: if isinstance(s, bytes): s = s.decode(encoding) self.elements.add(lc(s)) + if self._on_change: + self._on_change() def add(self, *strings: str) -> Self: """ @@ -112,7 +115,8 @@ def remove(self, *strings: str) -> Self: for s in strings: if (lower := lc(s)) in self.elements: self.elements.remove(lower) - + if self._on_change: + self._on_change() return self @@ -146,6 +150,9 @@ def __getattr__(self, attr: str) -> re.Pattern[str]: return self.get(attr, EMPTY_REGEX) +_PST_ATTRS = frozenset(('prefixes', 'suffix_acronyms', 'suffix_not_acronyms', 'titles')) + + class Constants: """ An instance of this class hold all of the configuration constants for the parser. @@ -178,7 +185,7 @@ class Constants: conjunctions: SetManager capitalization_exceptions: TupleManager[str] regexes: RegexTupleManager - + _pst: Set[str] | None string_format = "{title} {first} {middle} {last} {suffix} ({nickname})" """ @@ -262,9 +269,21 @@ def __init__(self, self.capitalization_exceptions = TupleManager(capitalization_exceptions) self.regexes = RegexTupleManager(regexes) + def __setattr__(self, name: str, value: object) -> None: + if name in _PST_ATTRS: + object.__setattr__(self, '_pst', None) + if isinstance(value, SetManager): + value._on_change = self._invalidate_pst + object.__setattr__(self, name, value) + + def _invalidate_pst(self) -> None: + self._pst = None + @property def suffixes_prefixes_titles(self) -> Set[str]: - return self.prefixes | self.suffix_acronyms | self.suffix_not_acronyms | self.titles + if not self._pst: + self._pst = self.prefixes | self.suffix_acronyms | self.suffix_not_acronyms | self.titles + return self._pst def __repr__(self) -> str: return "" diff --git a/tests/conftest.py b/tests/conftest.py index 27f19d8..7891b82 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -75,6 +75,3 @@ def empty_attribute_default(request: pytest.FixtureRequest) -> Iterator[str | No setattr(CONSTANTS, attr, value) for attr, value in collection_snapshot.items(): setattr(CONSTANTS, attr, value) - # Invalidate the lazily-built suffixes/prefixes/titles cache so it is - # recomputed from the restored collections rather than a mutated one. - CONSTANTS._pst = None From 5fc4b6b15ddbe0dd69cd10831c8b8ac50da19471 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Sat, 27 Jun 2026 12:56:58 -0700 Subject: [PATCH 3/5] test: expand suffixes_prefixes_titles coverage and add assertNotIn helper Add missing test cases flagged during PR review: - prime-then-mutate test that would catch a regression to cached staleness - add tests for suffix_acronyms and suffix_not_acronyms (previously untested) - remove test for prefixes (mirroring the existing titles remove test) - assertFalse(x in ...) -> assertNotIn for the two remove tests Also adds assertNotIn to HumanNameTestBase to support the new assertions. Co-Authored-By: Claude Sonnet 4.6 --- tests/base.py | 3 +++ tests/test_constants.py | 29 ++++++++++++++++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/tests/base.py b/tests/base.py index 25032e2..1f20857 100644 --- a/tests/base.py +++ b/tests/base.py @@ -38,3 +38,6 @@ def assertFalse(self, expr: object, msg: object = None) -> None: def assertIn(self, member: object, container: object, msg: object = None) -> None: assert member in container, msg # type: ignore[operator] + + def assertNotIn(self, member: object, container: object, msg: object = None) -> None: + assert member not in container, msg # type: ignore[operator] diff --git a/tests/test_constants.py b/tests/test_constants.py index ca13dc6..5df2fe2 100644 --- a/tests/test_constants.py +++ b/tests/test_constants.py @@ -123,7 +123,34 @@ def test_suffixes_prefixes_titles_reflects_remove_title(self) -> None: c.titles.add('emerita') self.assertIn('emerita', c.suffixes_prefixes_titles) c.titles.remove('emerita') - self.assertFalse('emerita' in c.suffixes_prefixes_titles) + self.assertNotIn('emerita', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_remove_prefix(self) -> None: + """suffixes_prefixes_titles must not include a word that was only in prefixes and is then removed.""" + c = Constants() + c.prefixes.add('xpfx') + self.assertIn('xpfx', c.suffixes_prefixes_titles) + c.prefixes.remove('xpfx') + self.assertNotIn('xpfx', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_add_suffix_acronym(self) -> None: + """suffixes_prefixes_titles must include suffix acronyms added after construction.""" + c = Constants() + c.suffix_acronyms.add('xsfx') + self.assertIn('xsfx', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_add_suffix_not_acronym(self) -> None: + """suffixes_prefixes_titles must include non-acronym suffixes added after construction.""" + c = Constants() + c.suffix_not_acronyms.add('xsfx') + self.assertIn('xsfx', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_add_after_initial_read(self) -> None: + """suffixes_prefixes_titles must reflect mutations even after the cache has been primed.""" + c = Constants() + _ = c.suffixes_prefixes_titles # prime the cache + c.titles.add('emerita') + self.assertIn('emerita', c.suffixes_prefixes_titles) def test_is_rootname_consistent_with_is_title(self) -> None: """is_rootname must return False for words recognised by is_title.""" From 7f0aae840c2cf66c24338e338e86a1541ce15601 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Sat, 27 Jun 2026 13:15:06 -0700 Subject: [PATCH 4/5] test: prime cache in invalidation tests; use is-None cache guard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Six of the suffixes_prefixes_titles tests passed against the unfixed code because they read the property only once on a cold cache, so they asserted "the union includes its source sets" rather than "the cache stays consistent after a mutation." Prime the cache before mutating so they actually exercise invalidation; verified they now fail (14 across the dual-run fixture) against master's pre-fix config. Change the property guard from `if not self._pst` to `if self._pst is None` so a legitimately empty union caches instead of recomputing on every access — and so "cached" means "computed," which is the state the primed tests rely on. Co-Authored-By: Claude Opus 4.8 --- nameparser/config/__init__.py | 2 +- tests/test_constants.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index e1060cf..0778070 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -281,7 +281,7 @@ def _invalidate_pst(self) -> None: @property def suffixes_prefixes_titles(self) -> Set[str]: - if not self._pst: + if self._pst is None: self._pst = self.prefixes | self.suffix_acronyms | self.suffix_not_acronyms | self.titles return self._pst diff --git a/tests/test_constants.py b/tests/test_constants.py index 5df2fe2..ba9d967 100644 --- a/tests/test_constants.py +++ b/tests/test_constants.py @@ -108,12 +108,14 @@ def test_add_constant_with_explicit_encoding(self) -> None: def test_suffixes_prefixes_titles_reflects_add_title(self) -> None: """suffixes_prefixes_titles must include titles added after construction.""" c = Constants() + _ = c.suffixes_prefixes_titles # prime the cache so invalidation is exercised c.titles.add('emerita') self.assertIn('emerita', c.suffixes_prefixes_titles) def test_suffixes_prefixes_titles_reflects_add_prefix(self) -> None: """suffixes_prefixes_titles must include prefixes added after construction.""" c = Constants() + _ = c.suffixes_prefixes_titles # prime the cache so invalidation is exercised c.prefixes.add('xpfx') self.assertIn('xpfx', c.suffixes_prefixes_titles) @@ -136,12 +138,14 @@ def test_suffixes_prefixes_titles_reflects_remove_prefix(self) -> None: def test_suffixes_prefixes_titles_reflects_add_suffix_acronym(self) -> None: """suffixes_prefixes_titles must include suffix acronyms added after construction.""" c = Constants() + _ = c.suffixes_prefixes_titles # prime the cache so invalidation is exercised c.suffix_acronyms.add('xsfx') self.assertIn('xsfx', c.suffixes_prefixes_titles) def test_suffixes_prefixes_titles_reflects_add_suffix_not_acronym(self) -> None: """suffixes_prefixes_titles must include non-acronym suffixes added after construction.""" c = Constants() + _ = c.suffixes_prefixes_titles # prime the cache so invalidation is exercised c.suffix_not_acronyms.add('xsfx') self.assertIn('xsfx', c.suffixes_prefixes_titles) @@ -155,11 +159,13 @@ def test_suffixes_prefixes_titles_reflects_add_after_initial_read(self) -> None: def test_is_rootname_consistent_with_is_title(self) -> None: """is_rootname must return False for words recognised by is_title.""" hn = HumanName("", constants=None) + _ = hn.C.suffixes_prefixes_titles # prime the cache so a stale entry would be observable hn.C.titles.add('emerita') self.assertFalse(hn.is_rootname('emerita')) def test_is_rootname_consistent_with_is_prefix(self) -> None: """is_rootname must return False for words recognised by is_prefix.""" hn = HumanName("", constants=None) + _ = hn.C.suffixes_prefixes_titles # prime the cache so a stale entry would be observable hn.C.prefixes.add('xpfx') self.assertFalse(hn.is_rootname('xpfx')) From 388c65bb423968892a7de6c8c35f56e5454e59b6 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Sat, 27 Jun 2026 13:20:34 -0700 Subject: [PATCH 5/5] refactor: scope _pst invalidation to a descriptor; fix stale-callback leak MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the catch-all Constants.__setattr__ + _PST_ATTRS frozenset with a _CachedUnionMember descriptor on the four cached attributes (prefixes, suffix_acronyms, suffix_not_acronyms, titles). The invalidation/rewiring behavior now lives beside the attributes it governs, runs only for those four (not on every attribute assignment), and removes the hand-synced attribute-name list. Fixes a stale-callback leak: reassigning a manager now detaches the replaced one (_on_change = None) so an orphaned reference can no longer invalidate the live cache. Drop the unused on_change constructor parameter from SetManager — wiring is always done by the owning Constants after construction (e.g. the config-teardown setattr path), so the constructor channel was dead and misleading. Behavior, mypy, and ruff are unchanged; parse throughput is identical to the cached baseline (~18us/name vs ~263us/name without the cache). Tests: add coverage for the replaced-manager rewiring path, the orphan-no-longer-invalidates guarantee, add_with_encoding invalidation, and suffix-set removal after priming; add an assertIs shim to the test base. Co-Authored-By: Claude Opus 4.8 --- nameparser/config/__init__.py | 61 ++++++++++++++++++++++++++--------- tests/base.py | 3 ++ tests/test_constants.py | 53 +++++++++++++++++++++++++++++- 3 files changed, 101 insertions(+), 16 deletions(-) diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index 0778070..2527a1c 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -30,7 +30,7 @@ import re import sys from collections.abc import Callable, Iterable, Iterator, Mapping, Set -from typing import Any, TypeVar +from typing import Any, TypeVar, overload if sys.version_info >= (3, 11): from typing import Self @@ -62,9 +62,14 @@ class SetManager(Set): ''' - def __init__(self, elements: Iterable[str], on_change: Callable[[], None] | None = None) -> None: + _on_change: Callable[[], None] | None + + def __init__(self, elements: Iterable[str]) -> None: self.elements = set(elements) - self._on_change = on_change + # Optional invalidation hook, wired by an owning Constants so that + # in-place add()/remove() can clear its cached suffixes_prefixes_titles + # union. None when the manager is used standalone. + self._on_change = None def __call__(self) -> Set[str]: return self.elements @@ -150,7 +155,40 @@ def __getattr__(self, attr: str) -> re.Pattern[str]: return self.get(attr, EMPTY_REGEX) -_PST_ATTRS = frozenset(('prefixes', 'suffix_acronyms', 'suffix_not_acronyms', 'titles')) +class _CachedUnionMember: + """Descriptor for the four ``SetManager`` attributes whose union ``Constants`` + caches in ``_pst`` (``prefixes``, ``suffix_acronyms``, ``suffix_not_acronyms``, + ``titles``). + + Assigning a new manager — or mutating one in place via ``add()`` / ``remove()`` + — invalidates that cache. Keeping the behavior on a descriptor scopes it to + exactly these attributes, beside their declarations, rather than spreading it + across a catch-all ``__setattr__`` and a separate attribute-name list. + """ + + _attr: str + + def __set_name__(self, owner: type, name: str) -> None: + self._attr = '_' + name + + @overload + def __get__(self, obj: None, objtype: type | None = None) -> '_CachedUnionMember': ... + @overload + def __get__(self, obj: 'Constants', objtype: type | None = None) -> SetManager: ... + + def __get__(self, obj: 'Constants | None', objtype: type | None = None) -> 'SetManager | _CachedUnionMember': + if obj is None: + return self + return getattr(obj, self._attr) + + def __set__(self, obj: 'Constants', value: SetManager) -> None: + previous = getattr(obj, self._attr, None) + if isinstance(previous, SetManager): + previous._on_change = None # detach the replaced manager so it no longer invalidates + if isinstance(value, SetManager): + value._on_change = obj._invalidate_pst + obj._invalidate_pst() + setattr(obj, self._attr, value) class Constants: @@ -177,10 +215,10 @@ class Constants: :py:attr:`regexes` wrapped with :py:class:`TupleManager`. """ - prefixes: SetManager - suffix_acronyms: SetManager - suffix_not_acronyms: SetManager - titles: SetManager + prefixes = _CachedUnionMember() + suffix_acronyms = _CachedUnionMember() + suffix_not_acronyms = _CachedUnionMember() + titles = _CachedUnionMember() first_name_titles: SetManager conjunctions: SetManager capitalization_exceptions: TupleManager[str] @@ -269,13 +307,6 @@ def __init__(self, self.capitalization_exceptions = TupleManager(capitalization_exceptions) self.regexes = RegexTupleManager(regexes) - def __setattr__(self, name: str, value: object) -> None: - if name in _PST_ATTRS: - object.__setattr__(self, '_pst', None) - if isinstance(value, SetManager): - value._on_change = self._invalidate_pst - object.__setattr__(self, name, value) - def _invalidate_pst(self) -> None: self._pst = None diff --git a/tests/base.py b/tests/base.py index 1f20857..c60a450 100644 --- a/tests/base.py +++ b/tests/base.py @@ -41,3 +41,6 @@ def assertIn(self, member: object, container: object, msg: object = None) -> Non def assertNotIn(self, member: object, container: object, msg: object = None) -> None: assert member not in container, msg # type: ignore[operator] + + def assertIs(self, first: object, second: object, msg: object = None) -> None: + assert first is second, msg diff --git a/tests/test_constants.py b/tests/test_constants.py index ba9d967..c33002b 100644 --- a/tests/test_constants.py +++ b/tests/test_constants.py @@ -1,5 +1,5 @@ from nameparser import HumanName -from nameparser.config import Constants +from nameparser.config import Constants, SetManager from tests.base import HumanNameTestBase @@ -169,3 +169,54 @@ def test_is_rootname_consistent_with_is_prefix(self) -> None: _ = hn.C.suffixes_prefixes_titles # prime the cache so a stale entry would be observable hn.C.prefixes.add('xpfx') self.assertFalse(hn.is_rootname('xpfx')) + + def test_suffixes_prefixes_titles_reflects_remove_suffix_acronym(self) -> None: + """suffixes_prefixes_titles must reflect a suffix acronym removed after the cache is primed.""" + c = Constants() + c.suffix_acronyms.add('xsfx') + self.assertIn('xsfx', c.suffixes_prefixes_titles) # primes the cache + c.suffix_acronyms.remove('xsfx') + self.assertNotIn('xsfx', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_remove_suffix_not_acronym(self) -> None: + """suffixes_prefixes_titles must reflect a non-acronym suffix removed after the cache is primed.""" + c = Constants() + c.suffix_not_acronyms.add('xsfx') + self.assertIn('xsfx', c.suffixes_prefixes_titles) # primes the cache + c.suffix_not_acronyms.remove('xsfx') + self.assertNotIn('xsfx', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_add_with_encoding(self) -> None: + """add_with_encoding must invalidate the cache like add()/remove() do.""" + c = Constants() + _ = c.suffixes_prefixes_titles # prime the cache + c.titles.add_with_encoding(b'b\351ck', encoding='latin_1') + self.assertIn('béck', c.suffixes_prefixes_titles) + + def test_suffixes_prefixes_titles_reflects_replaced_manager(self) -> None: + """Replacing a whole SetManager must invalidate the cache and wire the new manager. + + Covers the config-teardown path where a fresh SetManager is assigned + directly (e.g. ``setattr(CONSTANTS, 'titles', SetManager(...))``). + """ + c = Constants() + _ = c.suffixes_prefixes_titles # prime the cache + c.titles = SetManager(['brandnewtitle']) + # The replacement is reflected immediately... + self.assertIn('brandnewtitle', c.suffixes_prefixes_titles) + # ...and the new manager's own mutations invalidate the cache too, + # proving the on_change callback was re-wired to the replacement. + _ = c.suffixes_prefixes_titles + c.titles.add('secondtitle') + self.assertIn('secondtitle', c.suffixes_prefixes_titles) + + def test_replaced_manager_no_longer_invalidates_cache(self) -> None: + """A SetManager detached by reassignment must not invalidate the new cache.""" + c = Constants() + replaced = c.titles + c.titles = SetManager(['brandnewtitle']) + primed = c.suffixes_prefixes_titles + # Mutating the orphaned manager must leave the live cache untouched. + replaced.add('ghost') + self.assertIs(c.suffixes_prefixes_titles, primed) + self.assertNotIn('ghost', c.suffixes_prefixes_titles)