From ce3d138dd28228916e44aa97cb73da071f5b0f6f Mon Sep 17 00:00:00 2001
From: David O'Keeffe <david.okeeffe@databricks.com>
Date: Tue, 19 May 2026 10:11:19 +1000
Subject: [PATCH 1/2] fix(hermes): route via content-filter proxy to survive
 PAT rotation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Hermes is the only long-running interactive CLI in CoDA — Claude/Codex/Gemini
are re-spawned per turn so they re-read their token from disk on every call,
and OpenCode already routes through the local content-filter proxy. Hermes
loaded ~/.hermes/config.yaml once at startup, cached the api_key in memory,
and silently 403'd on its next turn after the in-process PAT rotator swapped
tokens (#issue).

setup_hermes.py now points base_url at http://127.0.0.1:4000 instead of the
gateway/serving endpoint. The proxy reads ~/.databrickscfg per request (with a
short cache) and injects a fresh Bearer, so Hermes' stale in-memory api_key
is overridden transparently — same trick OpenCode uses.

Also adds PAT_ROTATION_INTERVAL / PAT_TOKEN_LIFETIME env knobs to pat_rotator
so e2e tests can compress the rotation cycle without a code change. Prod
defaults (900s/600s) unchanged.

Test coverage closed too: test_cli_token_rotation.py was missing a
TestUpdateHermes class and the omnibus test was literally named "four". Added
the class and bumped to "five".

Verified end-to-end on daveok: minted PAT, opened hermes chat, sent prompt,
waited 90s for two rotations (old tokens ELIMINATED per app logs), sent a
second prompt in the same long-running Hermes process — second turn succeeded
where it would have 403'd against the unfixed code.

Co-authored-by: Isaac
---
 app.yaml                         |  2 +
 pat_rotator.py                   |  6 ++-
 setup_hermes.py                  | 22 +++++++---
 tests/test_cli_token_rotation.py | 71 +++++++++++++++++++++++++++++++-
 4 files changed, 93 insertions(+), 8 deletions(-)

diff --git a/app.yaml b/app.yaml
index 1a2fbc0..6348fe2 100644
--- a/app.yaml
+++ b/app.yaml
@@ -21,3 +21,5 @@ env:
     value: 0
   - name: MAX_CONCURRENT_SESSIONS
     value: "5"
+  # Override PAT_ROTATION_INTERVAL / PAT_TOKEN_LIFETIME here for e2e testing.
+  # Defaults (10 min / 15 min) live in pat_rotator.py.
diff --git a/pat_rotator.py b/pat_rotator.py
index cc8734f..5b3484d 100644
--- a/pat_rotator.py
+++ b/pat_rotator.py
@@ -18,8 +18,10 @@
 
 logger = logging.getLogger(__name__)
 
-DEFAULT_TOKEN_LIFETIME = 900        # 15 minutes
-DEFAULT_ROTATION_INTERVAL = 600     # 10 minutes
+# Env overrides exist so e2e tests can compress the cycle to seconds without
+# a code change. Production defaults: 15-min tokens rotated every 10 min.
+DEFAULT_TOKEN_LIFETIME = int(os.environ.get("PAT_TOKEN_LIFETIME", "900"))
+DEFAULT_ROTATION_INTERVAL = int(os.environ.get("PAT_ROTATION_INTERVAL", "600"))
 
 
 class PATRotator:
diff --git a/setup_hermes.py b/setup_hermes.py
index 07bb030..37f4309 100644
--- a/setup_hermes.py
+++ b/setup_hermes.py
@@ -107,14 +107,26 @@ def _run(cmd, **kwargs):
     print("Warning: AI Gateway resolved but DATABRICKS_TOKEN missing, falling back to DATABRICKS_HOST")
     gateway_host = ""
 
+# Route Hermes through the local content-filter proxy (127.0.0.1:4000) so
+# PAT rotation is transparent: the proxy reads ~/.databrickscfg on every
+# request and injects a fresh Bearer token, overriding whatever literal
+# api_key is cached in Hermes's in-memory config. Without this, the
+# long-running `hermes chat` process holds the startup token and gets
+# revoked-token 403s once the rotator swaps PATs (~10 min cadence).
+# OpenCode uses the same trick.
+#
+# upstream_base is recorded for the diagnostic banner below; the proxy
+# itself decides the actual upstream from PROXY_UPSTREAM_BASE.
 if gateway_host:
-    base_url = f"{gateway_host}/mlflow/v1"
+    upstream_base = f"{gateway_host}/mlflow/v1"
     auth_token = gateway_token
-    print(f"Using Databricks AI Gateway: {gateway_host}")
+    print(f"Hermes will route via content-filter proxy -> AI Gateway: {gateway_host}")
 else:
-    base_url = f"{host}/serving-endpoints"
+    upstream_base = f"{host}/serving-endpoints"
     auth_token = token
-    print(f"Using Databricks Host: {host}")
+    print(f"Hermes will route via content-filter proxy -> {host}/serving-endpoints")
+
+base_url = "http://127.0.0.1:4000"
 
 # 4. Write ~/.hermes/config.yaml
 config_path = hermes_home / "config.yaml"
@@ -226,7 +238,7 @@ def _run(cmd, **kwargs):
 print("  hermes model                   # Select default model")
 print("  hermes setup                   # Reconfigure wizard")
 print("  hermes mcp add <name> <url>    # Add MCP server")
-print(f"\nEndpoint:       {base_url}")
+print(f"\nEndpoint:       {base_url}  (forwards to {upstream_base})")
 print(f"Primary model:  {hermes_model}")
 print(f"Fallback model: {hermes_fallback_model} (auto-activates on 429/529/503)")
 print(f"Install:        minimal  (add extras: uv pip install \"hermes-agent[mcp,messaging,...]\")")
diff --git a/tests/test_cli_token_rotation.py b/tests/test_cli_token_rotation.py
index 7393299..cfa3655 100644
--- a/tests/test_cli_token_rotation.py
+++ b/tests/test_cli_token_rotation.py
@@ -88,8 +88,69 @@ def test_skips_missing_file(self, isolated_home):
         update_cli_tokens("new-token")
 
 
+class TestUpdateHermes:
+    def test_updates_both_api_key_lines(self, isolated_home):
+        """Hermes config has two api_key lines (primary + fallback). Both must rotate."""
+        from cli_auth import update_cli_tokens
+        hermes_dir = isolated_home / ".hermes"
+        hermes_dir.mkdir()
+        config = (
+            "model:\n"
+            "  default: databricks-claude-opus-4-7\n"
+            "  provider: custom\n"
+            "  base_url: http://127.0.0.1:4000\n"
+            "  api_key: old-token\n"
+            "\n"
+            "fallback_providers:\n"
+            "- provider: custom\n"
+            "  model: databricks-claude-opus-4-6\n"
+            "  base_url: http://127.0.0.1:4000\n"
+            "  api_key: old-token\n"
+        )
+        (hermes_dir / "config.yaml").write_text(config)
+
+        update_cli_tokens("new-token")
+
+        content = (hermes_dir / "config.yaml").read_text()
+        assert content.count("api_key: new-token") == 2, (
+            "Both primary and fallback api_key lines must be rotated. "
+            f"Content was:\n{content}"
+        )
+        assert "old-token" not in content
+        # Unrelated lines preserved
+        assert "default: databricks-claude-opus-4-7" in content
+        assert "model: databricks-claude-opus-4-6" in content
+
+    def test_preserves_other_indentation(self, isolated_home):
+        """Regex must match only `  api_key:` with two-space indent, not arbitrary text."""
+        from cli_auth import update_cli_tokens
+        hermes_dir = isolated_home / ".hermes"
+        hermes_dir.mkdir()
+        # Decoy: a comment that mentions api_key, plus a 4-space-indented api_key
+        # that should NOT be touched.
+        config = (
+            "# api_key: this-is-a-comment-not-a-value\n"
+            "model:\n"
+            "  api_key: old-token\n"
+            "deep:\n"
+            "    api_key: should-not-change\n"
+        )
+        (hermes_dir / "config.yaml").write_text(config)
+
+        update_cli_tokens("new-token")
+
+        content = (hermes_dir / "config.yaml").read_text()
+        assert "  api_key: new-token" in content
+        assert "# api_key: this-is-a-comment-not-a-value" in content
+        assert "    api_key: should-not-change" in content
+
+    def test_skips_missing_file(self, isolated_home):
+        from cli_auth import update_cli_tokens
+        update_cli_tokens("new-token")  # must not raise
+
+
 class TestAllCLIsUpdated:
-    def test_all_four_updated_in_one_call(self, isolated_home):
+    def test_all_five_updated_in_one_call(self, isolated_home):
         from cli_auth import update_cli_tokens
 
         # Set up all config files
@@ -111,6 +172,12 @@ def test_all_four_updated_in_one_call(self, isolated_home):
         gemini_dir.mkdir()
         (gemini_dir / ".env").write_text("GEMINI_API_KEY=old\n")
 
+        hermes_dir = isolated_home / ".hermes"
+        hermes_dir.mkdir()
+        (hermes_dir / "config.yaml").write_text(
+            "model:\n  api_key: old\nfallback_providers:\n- provider: custom\n  api_key: old\n"
+        )
+
         # One call updates all
         update_cli_tokens("rotated-token")
 
@@ -118,3 +185,5 @@ def test_all_four_updated_in_one_call(self, isolated_home):
         assert "OPENAI_API_KEY=rotated-token" in (codex_dir / ".env").read_text()
         assert json.loads((oc_dir / "auth.json").read_text())["databricks"]["api_key"] == "rotated-token"
         assert "GEMINI_API_KEY=rotated-token" in (gemini_dir / ".env").read_text()
+        hermes_content = (hermes_dir / "config.yaml").read_text()
+        assert hermes_content.count("api_key: rotated-token") == 2

From 4d3ff1caec9168d825e8e210b070971fb9b8d879 Mon Sep 17 00:00:00 2001
From: David O'Keeffe <david.okeeffe@databricks.com>
Date: Tue, 19 May 2026 10:15:33 +1000
Subject: [PATCH 2/2] fix(pat-lifecycle): close stale-token windows in proxy
 cache + rotator skip

Two related bugs surfaced while end-to-end testing the Hermes-routing fix
(GH-52):

1. **Proxy cache could serve revoked tokens for up to 30s after rotation.**
   content_filter_proxy._get_fresh_token cached the parsed ~/.databrickscfg
   value purely on read time, so a rotation that rewrote the file mid-cache
   was invisible until TTL expired. In prod (10-min rotation) that's a ~5%
   failure window per cycle. Now stats the cfg file's mtime and invalidates
   the cache the instant mtime advances; the TTL stays as a backstop for
   weird filesystem behaviour.

2. **Rotator deadlocked if an idle skip outran the token's lifetime.** The
   _rotation_loop skipped when session_count == 0, without checking whether
   the in-process token was approaching expiry. If it expired during the
   skip, the next attempt to mint a replacement 403'd (we'd authenticate
   with a dead token) and the rotator was stuck forever. Now still skips
   when idle but only while the token has > one rotation_interval of life
   left; otherwise rotates anyway to keep our own auth alive.

Tests added for both: tests/test_content_filter_proxy.py (4 cases) plus
TestRotationOnNearExpiry in test_pat_rotator.py (2 cases). All 37 tests in
the PAT-lifecycle group pass.

Closes #52

Co-authored-by: Isaac
---
 content_filter_proxy.py            | 24 ++++++++--
 pat_rotator.py                     | 23 ++++++++-
 tests/test_content_filter_proxy.py | 77 ++++++++++++++++++++++++++++++
 tests/test_pat_rotator.py          | 47 ++++++++++++++++++
 4 files changed, 165 insertions(+), 6 deletions(-)
 create mode 100644 tests/test_content_filter_proxy.py

diff --git a/content_filter_proxy.py b/content_filter_proxy.py
index c20d8d2..39f3391 100644
--- a/content_filter_proxy.py
+++ b/content_filter_proxy.py
@@ -38,8 +38,12 @@
 # OpenCode (and this proxy) are separate processes with frozen env snapshots,
 # so we read the file on-demand instead of trusting os.environ.
 
-_TOKEN_CACHE: dict = {"token": None, "read_at": 0.0}
-_TOKEN_CACHE_TTL = 30  # seconds — short enough to pick up rotations quickly
+_TOKEN_CACHE: dict = {"token": None, "read_at": 0.0, "mtime": 0.0}
+# Hard ceiling on cache age. With mtime invalidation below, the cache normally
+# refreshes the instant the rotator rewrites the file, so this is just a
+# defence against an mtime that stops advancing (e.g. clock skew, watched fs
+# tools that touch the file without updating contents).
+_TOKEN_CACHE_TTL = 30
 
 _HOME = os.environ.get("HOME", "/app/python/source_code")
 if not _HOME or _HOME == "/":
@@ -50,10 +54,21 @@
 def _get_fresh_token() -> str | None:
     """Read current token from ~/.databrickscfg (updated by PAT rotator).
 
-    Returns cached value if read within the last _TOKEN_CACHE_TTL seconds.
+    Cache invalidates on file mtime change so a rotation produces a near-zero
+    window of stale tokens. The TTL is a backstop; mtime is authoritative.
     """
     now = time.time()
-    if _TOKEN_CACHE["token"] and (now - _TOKEN_CACHE["read_at"]) < _TOKEN_CACHE_TTL:
+    try:
+        mtime = os.stat(_DATABRICKSCFG_PATH).st_mtime
+    except OSError:
+        mtime = 0.0
+
+    cache_hot = (
+        _TOKEN_CACHE["token"]
+        and mtime <= _TOKEN_CACHE["mtime"]
+        and (now - _TOKEN_CACHE["read_at"]) < _TOKEN_CACHE_TTL
+    )
+    if cache_hot:
         return _TOKEN_CACHE["token"]
 
     try:
@@ -63,6 +78,7 @@ def _get_fresh_token() -> str | None:
         if token:
             _TOKEN_CACHE["token"] = token
             _TOKEN_CACHE["read_at"] = now
+            _TOKEN_CACHE["mtime"] = mtime
             return token
     except Exception as e:
         log.warning(f"Could not read fresh token from {_DATABRICKSCFG_PATH}: {e}")
diff --git a/pat_rotator.py b/pat_rotator.py
index 5b3484d..8b3e42e 100644
--- a/pat_rotator.py
+++ b/pat_rotator.py
@@ -82,16 +82,35 @@ def stop(self):
         self._stop_event.set()
 
     def _rotation_loop(self):
-        """Background loop: sleep, rotate if sessions exist, repeat."""
+        """Background loop: sleep, then rotate if sessions exist OR if the
+        in-process token is about to expire. Always-rotating-near-expiry
+        prevents the rotator from deadlocking when an idle skip outruns the
+        token's lifetime — at that point our own auth would be dead and we
+        could never mint a replacement.
+        """
+        # Force a refresh once we're inside one rotation interval of expiry.
+        # That window is the maximum time we can afford to skip a rotation and
+        # still be sure the next attempt can authenticate.
+        expiry_grace = max(self._rotation_interval, 60)
         while not self._stop_event.is_set():
             self._stop_event.wait(timeout=self._rotation_interval)
             if self._stop_event.is_set():
                 break
             try:
                 session_count = self._session_count_fn()
-                if session_count == 0:
+                token_age = (
+                    time.time() - self._last_rotation_time
+                    if self._last_rotation_time else float("inf")
+                )
+                token_near_expiry = token_age > (self._token_lifetime - expiry_grace)
+                if session_count == 0 and not token_near_expiry:
                     logger.info("PAT rotation: no active sessions — skipping rotation")
                     continue
+                if session_count == 0 and token_near_expiry:
+                    logger.info(
+                        "PAT rotation: no active sessions, but token approaching "
+                        f"expiry (age={int(token_age)}s, lifetime={self._token_lifetime}s) — rotating anyway"
+                    )
                 self._rotate_once()
             except Exception as e:
                 logger.error(f"PAT rotation failed unexpectedly: {e}")
diff --git a/tests/test_content_filter_proxy.py b/tests/test_content_filter_proxy.py
new file mode 100644
index 0000000..8cc0d55
--- /dev/null
+++ b/tests/test_content_filter_proxy.py
@@ -0,0 +1,77 @@
+"""Tests for content_filter_proxy._get_fresh_token cache invalidation.
+
+The proxy reads ~/.databrickscfg on every forwarded request, with a cache to
+avoid filesystem hits in tight request bursts. The cache must invalidate the
+moment the rotator rewrites the file, otherwise the proxy serves revoked
+tokens to upstream for up to TTL seconds after each rotation.
+"""
+
+import time
+from unittest import mock
+
+import pytest
+
+
+@pytest.fixture
+def tmp_cfg(tmp_path, monkeypatch):
+    """Point the proxy at a temp .databrickscfg, with a clean cache."""
+    cfg = tmp_path / ".databrickscfg"
+    import content_filter_proxy as cfp
+    monkeypatch.setattr(cfp, "_DATABRICKSCFG_PATH", str(cfg))
+    monkeypatch.setattr(cfp, "_TOKEN_CACHE", {"token": None, "read_at": 0.0, "mtime": 0.0})
+    return cfg
+
+
+def _write_cfg(path, token):
+    path.write_text(f"[DEFAULT]\nhost = https://example.databricks.com\ntoken = {token}\n")
+
+
+class TestFreshTokenCacheInvalidation:
+    def test_cache_invalidates_on_mtime_change(self, tmp_cfg):
+        from content_filter_proxy import _get_fresh_token
+        _write_cfg(tmp_cfg, "dapi-old")
+        assert _get_fresh_token() == "dapi-old"
+
+        # Simulate rotator rewriting the file. utime to a guaranteed-newer mtime
+        # so the test isn't sensitive to filesystem mtime granularity.
+        _write_cfg(tmp_cfg, "dapi-new")
+        import os
+        st = os.stat(tmp_cfg)
+        os.utime(tmp_cfg, (st.st_atime, st.st_mtime + 10))
+
+        assert _get_fresh_token() == "dapi-new", "must re-read after mtime change"
+
+    def test_cache_hits_when_mtime_unchanged(self, tmp_cfg):
+        from content_filter_proxy import _get_fresh_token
+        _write_cfg(tmp_cfg, "dapi-stable")
+        assert _get_fresh_token() == "dapi-stable"
+
+        # Mutate the file contents WITHOUT advancing mtime (force mtime backwards).
+        # If the cache ignored mtime, it'd happily keep serving "dapi-stable";
+        # if it consulted mtime, it'd still serve "dapi-stable" because mtime
+        # didn't advance. Either way we expect the cached value back, which
+        # asserts the cache is doing its de-dup job within the TTL.
+        import os
+        st = os.stat(tmp_cfg)
+        _write_cfg(tmp_cfg, "dapi-tampered")
+        os.utime(tmp_cfg, (st.st_atime, st.st_mtime))  # restore old mtime
+
+        assert _get_fresh_token() == "dapi-stable"
+
+    def test_falls_back_to_cache_on_stat_error(self, tmp_cfg, monkeypatch):
+        from content_filter_proxy import _get_fresh_token
+        _write_cfg(tmp_cfg, "dapi-cached")
+        assert _get_fresh_token() == "dapi-cached"
+
+        # Now make os.stat fail. The cache should still return the last known token.
+        def boom(_):
+            raise OSError("stat broken")
+        monkeypatch.setattr("content_filter_proxy.os.stat", boom)
+        assert _get_fresh_token() == "dapi-cached"
+
+    def test_returns_none_when_file_missing_and_cache_empty(self, tmp_path, monkeypatch):
+        import content_filter_proxy as cfp
+        missing = tmp_path / "does-not-exist"
+        monkeypatch.setattr(cfp, "_DATABRICKSCFG_PATH", str(missing))
+        monkeypatch.setattr(cfp, "_TOKEN_CACHE", {"token": None, "read_at": 0.0, "mtime": 0.0})
+        assert cfp._get_fresh_token() is None
diff --git a/tests/test_pat_rotator.py b/tests/test_pat_rotator.py
index 40b412b..9dc797c 100644
--- a/tests/test_pat_rotator.py
+++ b/tests/test_pat_rotator.py
@@ -398,3 +398,50 @@ def test_log_pat_rotated_label(self, mock_post, caplog, tmp_path):
 
         combined = " ".join(caplog.messages)
         assert "PAT rotation complete" in combined
+
+
+class TestRotationOnNearExpiry:
+    """When sessions are idle but the in-process token is near expiry, the loop
+    must rotate anyway. Otherwise our own auth dies during idle periods and the
+    rotator deadlocks on subsequent token/create attempts."""
+
+    def test_skip_when_token_fresh_and_no_sessions(self, caplog):
+        import time as _time
+        rotator = _make_rotator(
+            session_count_fn=lambda: 0,
+            rotation_interval=1,
+            token_lifetime=3600,
+        )
+        rotator._current_token = "dapi-fresh"
+        rotator._current_token_id = "tid-fresh"
+        rotator._last_rotation_time = _time.time()  # just minted
+
+        with mock.patch.object(rotator, "_rotate_once") as mr:
+            t = threading.Thread(target=rotator._rotation_loop, daemon=True)
+            t.start()
+            _time.sleep(1.5)
+            rotator.stop()
+            t.join(timeout=2)
+
+        assert mr.call_count == 0
+
+    def test_rotate_when_token_near_expiry_even_with_no_sessions(self):
+        import time as _time
+        rotator = _make_rotator(
+            session_count_fn=lambda: 0,
+            rotation_interval=1,
+            token_lifetime=10,
+        )
+        rotator._current_token = "dapi-near-expiry"
+        rotator._current_token_id = "tid-near-expiry"
+        # Token is 9s old of a 10s lifetime — within the rotation interval of expiry.
+        rotator._last_rotation_time = _time.time() - 9
+
+        with mock.patch.object(rotator, "_rotate_once", return_value=True) as mr:
+            t = threading.Thread(target=rotator._rotation_loop, daemon=True)
+            t.start()
+            _time.sleep(1.5)
+            rotator.stop()
+            t.join(timeout=2)
+
+        assert mr.call_count >= 1