From 61e0994966bc18fa922d9b7af133ac2d65bf1394 Mon Sep 17 00:00:00 2001 From: "bluecloud-gilfoyle[bot]" <262642412+bluecloud-gilfoyle[bot]@users.noreply.github.com> Date: Thu, 7 May 2026 21:38:32 +0000 Subject: [PATCH 1/8] feat: cache docs and expose PyPI package docs lookup --- README.md | 40 +++++- src/mcp_server_python_docs/app_context.py | 4 + src/mcp_server_python_docs/models.py | 43 ++++++ src/mcp_server_python_docs/server.py | 45 ++++++- .../services/content.py | 28 +++- .../services/package_docs.py | 124 ++++++++++++++++++ .../services/persistent_cache.py | 81 ++++++++++++ tests/test_package_docs.py | 69 ++++++++++ tests/test_persistent_docs_cache.py | 75 +++++++++++ tests/test_services.py | 4 +- 10 files changed, 505 insertions(+), 8 deletions(-) create mode 100644 src/mcp_server_python_docs/services/package_docs.py create mode 100644 src/mcp_server_python_docs/services/persistent_cache.py create mode 100644 tests/test_package_docs.py create mode 100644 tests/test_persistent_docs_cache.py diff --git a/README.md b/README.md index cceb308..c09f9a2 100644 --- a/README.md +++ b/README.md @@ -164,12 +164,13 @@ Contributor commands and validation steps live in ## Tools -The server currently exposes four MCP tools: +The server currently exposes five MCP tools: | Tool | Description | |------|-------------| | `search_docs` | Search Python stdlib docs by query. Supports symbol lookup (`asyncio.TaskGroup`), module search (`json`), and free-text search. Returns ranked hits with BM25 scoring and snippet excerpts. | -| `get_docs` | Retrieve a specific documentation page or section by slug and optional anchor. Returns markdown content with budget-enforced truncation and pagination. | +| `get_docs` | Retrieve a specific documentation page or section by slug and optional anchor. Returns markdown content with budget-enforced truncation and pagination. Retrieved results are cached on disk by Python docs version and request identity. 
| +| `lookup_package_docs` | Look up official PyPI package metadata and return package-declared documentation/homepage/source URLs. This is a controlled PyPI metadata lookup, not generic web search. | | `list_versions` | List all indexed Python versions with metadata. | | `detect_python_version` | Detect the user's local Python version and report whether it matches an indexed documentation version. | @@ -184,10 +185,43 @@ Use this server when you need: Use a generic fetcher or broader docs MCP when you need: -- third-party package docs outside the Python stdlib +- arbitrary third-party package content beyond package-declared PyPI metadata - arbitrary web pages - mixed-source research across many frameworks +## Retrieved docs cache + +`get_docs` responses are cached across MCP client/server restarts in the +platform cache directory: + +```text +/mcp-python-docs/retrieved-docs-cache.sqlite3 +``` + +The cache stores completed `get_docs` results, including page/section content, +for the resolved Python docs version plus request identity (`slug`, optional +`anchor`, `max_chars`, and `start_index`). Cache misses fall back to the normal +local index retrieval path and then write the retrieved result. + +Cache entries are also scoped to a fingerprint of the local `index.db` file +(path, size, and modification timestamp). If you rebuild or replace the local +docs index, older entries are ignored automatically instead of being returned +for the new index generation. Deleting `retrieved-docs-cache.sqlite3` is safe; +it only removes cached retrieval results, not the docs index. + +## PyPI package docs lookup + +`lookup_package_docs` queries the official PyPI JSON API documented at +`https://docs.pypi.org/api/json/` (`GET /pypi//json`) and returns only +sources declared in that package's PyPI metadata: the PyPI project URL, +`docs_url`, `home_page`, and allowlisted `project_urls` labels such as +Documentation, Homepage, Source, Repository, Issues, Changelog, and Release +Notes. 
+ +The tool makes the trust boundary explicit with +`trust_boundary="pypi-declared-metadata"`. It does not crawl pages, perform web +search, or silently fall back to unofficial community mirrors. + ## Diagnostics Check the local environment: diff --git a/src/mcp_server_python_docs/app_context.py b/src/mcp_server_python_docs/app_context.py index 49e3d87..e8cc477 100644 --- a/src/mcp_server_python_docs/app_context.py +++ b/src/mcp_server_python_docs/app_context.py @@ -11,6 +11,8 @@ from pathlib import Path from mcp_server_python_docs.services.content import ContentService +from mcp_server_python_docs.services.package_docs import PackageDocsService +from mcp_server_python_docs.services.persistent_cache import PersistentDocsCache from mcp_server_python_docs.services.search import SearchService from mcp_server_python_docs.services.version import VersionService @@ -24,6 +26,8 @@ class AppContext: search_service: SearchService content_service: ContentService version_service: VersionService + package_docs_service: PackageDocsService = field(default_factory=PackageDocsService) + persistent_docs_cache: PersistentDocsCache | None = None synonyms: dict[str, list[str]] = field(default_factory=dict) detected_python_version: str | None = None detected_python_source: str | None = None diff --git a/src/mcp_server_python_docs/models.py b/src/mcp_server_python_docs/models.py index a688167..c08e86e 100644 --- a/src/mcp_server_python_docs/models.py +++ b/src/mcp_server_python_docs/models.py @@ -160,3 +160,46 @@ class DetectPythonVersionResult(BaseModel): is_default: bool = Field( description="Whether this detected version is being used as the default for get_docs" ) + + +# --- lookup_package_docs models --- + + +class PackageDocsInput(BaseModel): + """Input parameters for lookup_package_docs tool.""" + + package: str = Field( + min_length=1, + max_length=214, + description="PyPI package/project name (e.g. 
'requests').", + ) + + +class PackageDocsSource(BaseModel): + """A package-declared documentation or project source URL.""" + + label: str = Field(description="Label from PyPI metadata or a normalized core metadata field") + url: str = Field(description="HTTP(S) URL declared by the package on PyPI") + kind: str = Field(description="Source category, such as docs, homepage, source, or pypi") + declared_by: str = Field(description="Where this source declaration came from") + + +class PackageDocsResult(BaseModel): + """Output from lookup_package_docs tool.""" + + package: str = Field(description="Canonical package name returned by PyPI when available") + version: str = Field(description="Latest version reported by PyPI metadata") + summary: str = Field(default="", description="Package summary from PyPI metadata") + metadata_source: str = Field(description="Official PyPI JSON API URL used for lookup") + trust_boundary: str = Field( + default="pypi-declared-metadata", + description="Indicates results are limited to PyPI/project-declared metadata", + ) + sources: list[PackageDocsSource] = Field( + default_factory=list, + description="Package-declared PyPI, documentation, homepage, and source URLs", + ) + note: str | None = Field( + default=None, + description="Controlled-scope note, for example skipped labels or not-found details", + ) diff --git a/src/mcp_server_python_docs/server.py b/src/mcp_server_python_docs/server.py index a6d8f03..d9e9af5 100644 --- a/src/mcp_server_python_docs/server.py +++ b/src/mcp_server_python_docs/server.py @@ -29,9 +29,12 @@ DetectPythonVersionResult, GetDocsResult, ListVersionsResult, + PackageDocsResult, SearchDocsResult, ) from mcp_server_python_docs.services.content import ContentService +from mcp_server_python_docs.services.package_docs import PackageDocsService +from mcp_server_python_docs.services.persistent_cache import PersistentDocsCache from mcp_server_python_docs.services.search import SearchService from 
mcp_server_python_docs.services.version import VersionService from mcp_server_python_docs.storage.db import get_readonly_connection @@ -86,15 +89,21 @@ async def app_lifespan(server: FastMCP) -> AsyncIterator[AppContext]: # Open read-only connection (STOR-06, STOR-07) db = get_readonly_connection(index_path) + persistent_docs_cache: PersistentDocsCache | None = None try: # Check FTS5 (STOR-08) _assert_fts5(db) # Construct service instances (Phase 5 — service layer wiring) + persistent_docs_cache = PersistentDocsCache( + cache_path=cache_dir / "retrieved-docs-cache.sqlite3", + index_path=index_path, + ) search_svc = SearchService(db, synonyms) - content_svc = ContentService(db) + content_svc = ContentService(db, persistent_cache=persistent_docs_cache) version_svc = VersionService(db) + package_docs_svc = PackageDocsService() # Detect user's Python version and match to indexed versions detected_ver, detected_src = detect_python_version() @@ -119,6 +128,8 @@ async def app_lifespan(server: FastMCP) -> AsyncIterator[AppContext]: search_service=search_svc, content_service=content_svc, version_service=version_svc, + package_docs_service=package_docs_svc, + persistent_docs_cache=persistent_docs_cache, detected_python_version=matched, detected_python_source=detected_src, ) @@ -133,6 +144,11 @@ async def app_lifespan(server: FastMCP) -> AsyncIterator[AppContext]: pass raise finally: + if persistent_docs_cache is not None: + try: + persistent_docs_cache.close() + except Exception: + pass db.close() @@ -143,6 +159,12 @@ async def app_lifespan(server: FastMCP) -> AsyncIterator[AppContext]: idempotentHint=True, openWorldHint=False, ) +_PYPI_TOOL_ANNOTATIONS = ToolAnnotations( + readOnlyHint=True, + destructiveHint=False, + idempotentHint=True, + openWorldHint=True, +) SearchQueryParam = Annotated[ str, @@ -184,6 +206,10 @@ async def app_lifespan(server: FastMCP) -> AsyncIterator[AppContext]: int, Field(ge=0, description="Start position for pagination"), ] +PackageParam = 
Annotated[ + str, + Field(min_length=1, max_length=214, description="PyPI package/project name"), +] def create_server() -> FastMCP: @@ -239,6 +265,23 @@ def get_docs( logger.exception("Unexpected error in get_docs") raise ToolError(f"Internal error: {type(e).__name__}") + @mcp.tool(annotations=_PYPI_TOOL_ANNOTATIONS) + def lookup_package_docs( + package: PackageParam, + ctx: Context = None, # type: ignore[assignment] + ) -> PackageDocsResult: + """Look up package-declared docs/homepage/source URLs via official PyPI metadata. + + This is not generic web search: it only queries PyPI's JSON API and + returns official PyPI metadata plus package-declared project URLs. + """ + app_ctx: AppContext = ctx.request_context.lifespan_context + try: + return app_ctx.package_docs_service.lookup(package) + except Exception as e: + logger.exception("Unexpected error in lookup_package_docs") + raise ToolError(f"Internal error: {type(e).__name__}") + @mcp.tool(annotations=_TOOL_ANNOTATIONS) def list_versions( ctx: Context = None, # type: ignore[assignment] diff --git a/src/mcp_server_python_docs/services/content.py b/src/mcp_server_python_docs/services/content.py index 60e6b52..14a0e0c 100644 --- a/src/mcp_server_python_docs/services/content.py +++ b/src/mcp_server_python_docs/services/content.py @@ -13,6 +13,7 @@ from mcp_server_python_docs.retrieval.budget import apply_budget from mcp_server_python_docs.services.cache import create_section_cache from mcp_server_python_docs.services.observability import log_tool_call +from mcp_server_python_docs.services.persistent_cache import PersistentDocsCache from mcp_server_python_docs.services.version_resolution import resolve_version_strict @@ -23,9 +24,14 @@ class ContentService: When omitted, returns the full page with truncation/pagination. 
""" - def __init__(self, db: sqlite3.Connection) -> None: + def __init__( + self, + db: sqlite3.Connection, + persistent_cache: PersistentDocsCache | None = None, + ) -> None: self._db = db self._get_section = create_section_cache(db) + self._persistent_cache = persistent_cache def _resolve_version(self, version: str | None) -> str: """Resolve version to a concrete version string using shared resolution logic. @@ -47,6 +53,17 @@ def get_docs( """Retrieve documentation content by slug, optionally narrowed to a section by anchor.""" resolved_version = self._resolve_version(version) + if self._persistent_cache is not None: + cached = self._persistent_cache.get( + version=resolved_version, + slug=slug, + anchor=anchor, + max_chars=max_chars, + start_index=start_index, + ) + if cached is not None: + return cached + # Find the document doc_row = self._db.execute( """ @@ -119,7 +136,7 @@ def get_docs( full_text, max_chars, start_index ) - return GetDocsResult( + result = GetDocsResult( content=truncated_text, slug=slug, title=title, @@ -129,3 +146,10 @@ def get_docs( truncated=is_truncated, next_start_index=next_idx, ) + if self._persistent_cache is not None: + self._persistent_cache.put( + result=result, + max_chars=max_chars, + start_index=start_index, + ) + return result diff --git a/src/mcp_server_python_docs/services/package_docs.py b/src/mcp_server_python_docs/services/package_docs.py new file mode 100644 index 0000000..cfbef88 --- /dev/null +++ b/src/mcp_server_python_docs/services/package_docs.py @@ -0,0 +1,124 @@ +"""Controlled package docs lookup using official PyPI JSON metadata. + +Source: https://docs.pypi.org/api/json/ documents GET /pypi//json. 
+""" +from __future__ import annotations + +import json +import re +from collections.abc import Callable +from typing import Protocol +from urllib.error import HTTPError, URLError +from urllib.parse import quote, urlparse +from urllib.request import Request, urlopen + +from mcp_server_python_docs.models import PackageDocsResult, PackageDocsSource +from mcp_server_python_docs.services.observability import log_tool_call + +_ALLOWED = { + "documentation", "docs", "homepage", "home page", "source", "source code", + "repository", "repo", "bug tracker", "issues", "changelog", "release notes", +} +_BLOCKED = ("mirror", "community", "unofficial", "tutorial", "example") + + +class _HTTPResponse(Protocol): + def read(self) -> bytes: ... + def __enter__(self) -> "_HTTPResponse": ... + def __exit__(self, exc_type: object, exc: object, tb: object) -> bool | None: ... + + +Fetcher = Callable[[str, float], _HTTPResponse] + + +def _default_fetcher(url: str, timeout: float) -> _HTTPResponse: + req = Request( + url, + headers={"Accept": "application/json", "User-Agent": "mcp-server-python-docs"}, + ) + return urlopen(req, timeout=timeout) + + +def _normalize(name: str) -> str: + return quote(re.sub(r"[-_.]+", "-", name.strip().lower()), safe="-") + + +def _http_url(url: object) -> str | None: + if not isinstance(url, str): + return None + parsed = urlparse(url.strip()) + return url.strip() if parsed.scheme in {"http", "https"} and parsed.netloc else None + + +def _source(label: str, url: object, kind: str) -> PackageDocsSource | None: + valid = _http_url(url) + if valid is None: + return None + return PackageDocsSource(label=label, url=valid, kind=kind, declared_by="PyPI project metadata") + + +class PackageDocsService: + """Return package-declared docs/homepage/source URLs from PyPI metadata only.""" + + def __init__(self, fetcher: Fetcher = _default_fetcher, timeout: float = 10.0) -> None: + self._fetcher = fetcher + self._timeout = timeout + + 
@log_tool_call("lookup_package_docs") + def lookup(self, package: str) -> PackageDocsResult: + project = _normalize(package) + metadata_source = f"https://pypi.org/pypi/{project}/json" + try: + with self._fetcher(metadata_source, self._timeout) as response: + payload = json.loads(response.read().decode("utf-8")) + except HTTPError as e: + if e.code == 404: + return PackageDocsResult( + package=package, version="", metadata_source=metadata_source, + sources=[], note="Package not found on PyPI.", + ) + raise + except (URLError, TimeoutError, json.JSONDecodeError) as e: + return PackageDocsResult( + package=package, version="", metadata_source=metadata_source, + sources=[], note=f"Unable to retrieve PyPI metadata: {type(e).__name__}.", + ) + + info = payload.get("info") if isinstance(payload, dict) else {} + info = info if isinstance(info, dict) else {} + sources = [ + s for s in ( + _source( + "PyPI project", + info.get("project_url") or f"https://pypi.org/project/{project}/", + "pypi", + ), + _source("Documentation", info.get("docs_url"), "docs"), + _source("Homepage", info.get("home_page"), "homepage"), + ) if s is not None + ] + skipped: list[str] = [] + project_urls = info.get("project_urls") + if isinstance(project_urls, dict): + for label, url in project_urls.items(): + lowered = str(label).strip().lower() + if lowered in _ALLOWED and not any(bad in lowered for bad in _BLOCKED): + found = _source(str(label), url, lowered.replace(" ", "_")) + if found is not None and found not in sources: + sources.append(found) + else: + skipped.append(str(label)) + note = None + if skipped: + note = ( + "Ignored project URL labels outside the controlled allowlist: " + f"{', '.join(sorted(skipped))}." 
+ ) + return PackageDocsResult( + package=str(info.get("name") or package), + version=str(info.get("version") or ""), + summary=str(info.get("summary") or ""), + metadata_source=metadata_source, + sources=sources, + note=note, + ) diff --git a/src/mcp_server_python_docs/services/persistent_cache.py b/src/mcp_server_python_docs/services/persistent_cache.py new file mode 100644 index 0000000..f260cd5 --- /dev/null +++ b/src/mcp_server_python_docs/services/persistent_cache.py @@ -0,0 +1,81 @@ +"""SQLite-backed cache for completed get_docs results across MCP restarts.""" +from __future__ import annotations + +import sqlite3 +from pathlib import Path +from typing import NamedTuple + +from mcp_server_python_docs.models import GetDocsResult + + +class CacheStats(NamedTuple): + hits: int = 0 + misses: int = 0 + writes: int = 0 + + +class PersistentDocsCache: + """Persist get_docs results by index fingerprint, version, and request identity.""" + + def __init__(self, cache_path: Path, index_path: Path) -> None: + self._cache_path = Path(cache_path) + self._fingerprint = self._fingerprint_index(Path(index_path)) + self._hits = self._misses = self._writes = 0 + self._cache_path.parent.mkdir(parents=True, exist_ok=True) + self._conn = sqlite3.connect(str(self._cache_path), check_same_thread=False) + self._conn.execute("PRAGMA synchronous = NORMAL") + self._conn.execute( + "CREATE TABLE IF NOT EXISTS retrieved_docs_cache (" + "index_fingerprint TEXT NOT NULL, version TEXT NOT NULL, slug TEXT NOT NULL, " + "anchor TEXT NOT NULL, max_chars INTEGER NOT NULL, start_index INTEGER NOT NULL, " + "result_json TEXT NOT NULL, created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP, " + "PRIMARY KEY (index_fingerprint, version, slug, anchor, max_chars, start_index))" + ) + self._conn.commit() + + @property + def cache_path(self) -> Path: + return self._cache_path + + @staticmethod + def _fingerprint_index(index_path: Path) -> str: + stat = index_path.stat() + return 
f"{index_path.resolve()}:{stat.st_size}:{stat.st_mtime_ns}" + + def stats(self) -> CacheStats: + return CacheStats(self._hits, self._misses, self._writes) + + def get( + self, *, version: str, slug: str, anchor: str | None, max_chars: int, start_index: int + ) -> GetDocsResult | None: + row = self._conn.execute( + "SELECT result_json FROM retrieved_docs_cache WHERE index_fingerprint = ? " + "AND version = ? AND slug = ? AND anchor = ? AND max_chars = ? AND start_index = ?", + (self._fingerprint, version, slug, anchor or "", max_chars, start_index), + ).fetchone() + if row is None: + self._misses += 1 + return None + self._hits += 1 + return GetDocsResult.model_validate_json(row[0]) + + def put(self, *, result: GetDocsResult, max_chars: int, start_index: int) -> None: + self._conn.execute( + "INSERT OR REPLACE INTO retrieved_docs_cache " + "(index_fingerprint, version, slug, anchor, max_chars, start_index, result_json) " + "VALUES (?, ?, ?, ?, ?, ?, ?)", + ( + self._fingerprint, + result.version, + result.slug, + result.anchor or "", + max_chars, + start_index, + result.model_dump_json(), + ), + ) + self._conn.commit() + self._writes += 1 + + def close(self) -> None: + self._conn.close() diff --git a/tests/test_package_docs.py b/tests/test_package_docs.py new file mode 100644 index 0000000..cdfc0d1 --- /dev/null +++ b/tests/test_package_docs.py @@ -0,0 +1,69 @@ +"""Controlled PyPI package documentation lookup tests.""" +from __future__ import annotations + +import json +from urllib.error import HTTPError + +from mcp_server_python_docs.services.package_docs import PackageDocsService + + +class _Resp: + def __init__(self, payload: dict): + self._payload = payload + def __enter__(self): + return self + def __exit__(self, exc_type, exc, tb): + return False + def read(self) -> bytes: + return json.dumps(self._payload).encode() + + +def test_package_docs_uses_official_pypi_metadata_and_declared_urls(): + seen: list[str] = [] + def fetch(url: str, timeout: float): + 
seen.append(url) + return _Resp({"info": {"name": "SampleProject", "version": "4.0.0", + "summary": "sample", "project_url": "https://pypi.org/project/sampleproject/", + "home_page": "https://github.com/pypa/sampleproject", + "docs_url": "https://sampleproject.pypa.io/", + "project_urls": {"Documentation": "https://sampleproject.pypa.io/", + "Source": "https://github.com/pypa/sampleproject"}}}) + + result = PackageDocsService(fetcher=fetch).lookup("Sample_Project") + + assert seen == ["https://pypi.org/pypi/sample-project/json"] + assert result.package == "SampleProject" + assert result.trust_boundary == "pypi-declared-metadata" + assert {s.label for s in result.sources} >= { + "PyPI project", "Documentation", "Homepage", "Source", + } + assert all(s.declared_by == "PyPI project metadata" for s in result.sources) + + +def test_package_docs_filters_uncontrolled_urls_and_has_no_web_search_fallback(): + def fetch(url: str, timeout: float): + return _Resp({"info": {"name": "demo", "version": "1.0.0", + "home_page": "https://demo.example/", + "description": "See https://random-blog.example/demo", + "project_urls": {"Community mirror": "https://mirror.example/demo"}}}) + + result = PackageDocsService(fetcher=fetch).lookup("demo") + urls = [s.url for s in result.sources] + assert "https://demo.example/" in urls + assert "https://random-blog.example/demo" not in urls + assert "https://mirror.example/demo" not in urls + assert "community mirror" in (result.note or "").lower() + + +def test_package_docs_not_found_and_tool_annotation(): + def missing(url: str, timeout: float): + raise HTTPError(url, 404, "Not Found", hdrs=None, fp=None) + + result = PackageDocsService(fetcher=missing).lookup("missing-package") + assert result.sources == [] + assert "not found" in (result.note or "").lower() + + from mcp_server_python_docs.server import create_server + tool = create_server()._tool_manager._tools["lookup_package_docs"] + assert tool.annotations.readOnlyHint is True + assert 
tool.annotations.openWorldHint is True diff --git a/tests/test_persistent_docs_cache.py b/tests/test_persistent_docs_cache.py new file mode 100644 index 0000000..92bc01a --- /dev/null +++ b/tests/test_persistent_docs_cache.py @@ -0,0 +1,75 @@ +"""Persistent get_docs cache coverage.""" +from __future__ import annotations + +import os +from pathlib import Path + +from mcp_server_python_docs.services.content import ContentService +from mcp_server_python_docs.services.persistent_cache import PersistentDocsCache + + +def _doc(db, version: str, content: str, default: int = 0) -> None: + db.execute("DELETE FROM doc_sets WHERE source='python-docs' AND version=?", (version,)) + db.execute( + "INSERT INTO doc_sets (source, version, language, label, is_default) " + "VALUES ('python-docs', ?, 'en', ?, ?)", + (version, f"Python {version}", default), + ) + ds = db.execute("SELECT id FROM doc_sets WHERE version=?", (version,)).fetchone()[0] + db.execute( + "INSERT INTO documents (doc_set_id, uri, slug, title, content_text, char_count) " + "VALUES (?, 'library/json.html', 'library/json.html', ?, ?, ?)", + (ds, f"json {version}", content, len(content)), + ) + doc_id = db.execute("SELECT last_insert_rowid()").fetchone()[0] + db.execute( + "INSERT INTO sections (document_id, uri, anchor, heading, level, ordinal, " + "content_text, char_count) VALUES (?, 'library/json.html#top', 'top', " + "'Top', 1, 0, ?, ?)", + (doc_id, content, len(content)), + ) + db.commit() + + +def _cache(tmp_path: Path, marker: bytes = b"index-1") -> tuple[Path, PersistentDocsCache]: + index_path = tmp_path / "index.db" + index_path.write_bytes(marker) + return index_path, PersistentDocsCache(tmp_path / "retrieved.sqlite3", index_path) + + +def test_cache_survives_restart_and_miss_falls_back(populated_db, tmp_path: Path): + _doc(populated_db, "3.12", "persisted docs", 1) + index_path, cache = _cache(tmp_path) + first = ContentService(populated_db, cache).get_docs("library/json.html", "3.12", max_chars=500) + 
assert "persisted docs" in first.content + assert cache.stats().misses == cache.stats().writes == 1 + + restarted = PersistentDocsCache(tmp_path / "retrieved.sqlite3", index_path) + second = ContentService(populated_db, restarted).get_docs( + "library/json.html", "3.12", max_chars=500 + ) + assert second == first + assert restarted.stats().hits == 1 + + +def test_cache_key_includes_python_version(populated_db, tmp_path: Path): + _doc(populated_db, "3.12", "docs for 3.12") + _doc(populated_db, "3.13", "docs for 3.13", 1) + _, cache = _cache(tmp_path) + svc = ContentService(populated_db, cache) + assert "3.12" in svc.get_docs("library/json.html", "3.12", max_chars=500).content + assert "3.13" in svc.get_docs("library/json.html", "3.13", max_chars=500).content + assert cache.stats().misses == 2 + + +def test_cache_ignores_stale_entries_after_index_replacement(populated_db, tmp_path: Path): + _doc(populated_db, "3.12", "old generation", 1) + index_path, cache = _cache(tmp_path) + ContentService(populated_db, cache).get_docs("library/json.html", "3.12", max_chars=500) + index_path.write_bytes(b"index-generation-2-with-different-size") + os.utime(index_path, None) + + stale = PersistentDocsCache(tmp_path / "retrieved.sqlite3", index_path) + ContentService(populated_db, stale).get_docs("library/json.html", "3.12", max_chars=500) + assert stale.stats().hits == 0 + assert stale.stats().misses == 1 diff --git a/tests/test_services.py b/tests/test_services.py index 2fde2bf..b084775 100644 --- a/tests/test_services.py +++ b/tests/test_services.py @@ -402,12 +402,12 @@ def test_all_tools_have_annotations(self): annotations.openWorldHint is False ), f"{name} openWorldHint should be False" - def test_four_tools_registered(self): + def test_five_tools_registered(self): from mcp_server_python_docs.server import create_server server = create_server() tools = server._tool_manager._tools - assert len(tools) == 4 + assert len(tools) == 5 def 
test_runtime_tool_schemas_include_input_constraints(self): import anyio From 15b2bc56470bdbdc45838bcb3e0e376aac58f81e Mon Sep 17 00:00:00 2001 From: "bluecloud-gilfoyle[bot]" <262642412+bluecloud-gilfoyle[bot]@users.noreply.github.com> Date: Fri, 8 May 2026 10:29:11 +0000 Subject: [PATCH 2/8] fix: bound PyPI metadata response reads --- .../services/package_docs.py | 21 ++++++- tests/test_package_docs.py | 60 +++++++++++++++++-- 2 files changed, 74 insertions(+), 7 deletions(-) diff --git a/src/mcp_server_python_docs/services/package_docs.py b/src/mcp_server_python_docs/services/package_docs.py index cfbef88..9199804 100644 --- a/src/mcp_server_python_docs/services/package_docs.py +++ b/src/mcp_server_python_docs/services/package_docs.py @@ -20,10 +20,11 @@ "repository", "repo", "bug tracker", "issues", "changelog", "release notes", } _BLOCKED = ("mirror", "community", "unofficial", "tutorial", "example") +_PYPI_METADATA_MAX_BYTES = 5 * 1024 * 1024 class _HTTPResponse(Protocol): - def read(self) -> bytes: ... + def read(self, size: int = -1) -> bytes: ... def __enter__(self) -> "_HTTPResponse": ... def __exit__(self, exc_type: object, exc: object, tb: object) -> bool | None: ... 
@@ -57,6 +58,13 @@ def _source(label: str, url: object, kind: str) -> PackageDocsSource | None: return PackageDocsSource(label=label, url=valid, kind=kind, declared_by="PyPI project metadata") +def _read_limited(response: _HTTPResponse) -> bytes | None: + data = response.read(_PYPI_METADATA_MAX_BYTES + 1) + if len(data) > _PYPI_METADATA_MAX_BYTES: + return None + return data + + class PackageDocsService: """Return package-declared docs/homepage/source URLs from PyPI metadata only.""" @@ -70,7 +78,16 @@ def lookup(self, package: str) -> PackageDocsResult: metadata_source = f"https://pypi.org/pypi/{project}/json" try: with self._fetcher(metadata_source, self._timeout) as response: - payload = json.loads(response.read().decode("utf-8")) + data = _read_limited(response) + if data is None: + return PackageDocsResult( + package=package, + version="", + metadata_source=metadata_source, + sources=[], + note="PyPI metadata exceeded size limit.", + ) + payload = json.loads(data.decode("utf-8")) except HTTPError as e: if e.code == 404: return PackageDocsResult( diff --git a/tests/test_package_docs.py b/tests/test_package_docs.py index cdfc0d1..32b6ee6 100644 --- a/tests/test_package_docs.py +++ b/tests/test_package_docs.py @@ -2,20 +2,27 @@ from __future__ import annotations import json -from urllib.error import HTTPError +from urllib.error import HTTPError, URLError -from mcp_server_python_docs.services.package_docs import PackageDocsService +from mcp_server_python_docs.services.package_docs import ( + _PYPI_METADATA_MAX_BYTES, + PackageDocsService, +) class _Resp: - def __init__(self, payload: dict): + def __init__(self, payload: dict | bytes): self._payload = payload def __enter__(self): return self def __exit__(self, exc_type, exc, tb): return False - def read(self) -> bytes: - return json.dumps(self._payload).encode() + def read(self, size: int = -1) -> bytes: + if isinstance(self._payload, bytes): + data = self._payload + else: + data = 
json.dumps(self._payload).encode() + return data if size < 0 else data[:size] def test_package_docs_uses_official_pypi_metadata_and_declared_urls(): @@ -67,3 +74,46 @@ def missing(url: str, timeout: float): tool = create_server()._tool_manager._tools["lookup_package_docs"] assert tool.annotations.readOnlyHint is True assert tool.annotations.openWorldHint is True + + +def test_package_docs_rejects_oversized_pypi_metadata_without_unbounded_read(): + class LargeResp: + requested_size: int | None = None + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + return False + + def read(self, size: int = -1) -> bytes: + self.requested_size = size + assert size == _PYPI_METADATA_MAX_BYTES + 1 + return b"x" * size + + response = LargeResp() + + def fetch(url: str, timeout: float): + return response + + result = PackageDocsService(fetcher=fetch).lookup("huge-package") + + assert response.requested_size == _PYPI_METADATA_MAX_BYTES + 1 + assert result.sources == [] + assert result.note == "PyPI metadata exceeded size limit." + + +def test_package_docs_reports_retrieval_and_json_errors(): + def unreachable(url: str, timeout: float): + raise URLError("network down") + + network_result = PackageDocsService(fetcher=unreachable).lookup("demo") + assert network_result.sources == [] + assert network_result.note == "Unable to retrieve PyPI metadata: URLError." + + def invalid_json(url: str, timeout: float): + return _Resp(b"not json") + + json_result = PackageDocsService(fetcher=invalid_json).lookup("demo") + assert json_result.sources == [] + assert json_result.note == "Unable to retrieve PyPI metadata: JSONDecodeError." 
From 5d8e8bcef925c3f2246cbbf876d0f0d118da3836 Mon Sep 17 00:00:00 2001 From: "bluecloud-gilfoyle[bot]" <262642412+bluecloud-gilfoyle[bot]@users.noreply.github.com> Date: Fri, 8 May 2026 10:45:14 +0000 Subject: [PATCH 3/8] fix: address PR 9 review cache and PyPI errors --- src/mcp_server_python_docs/server.py | 10 +- .../services/package_docs.py | 40 +++++-- .../services/persistent_cache.py | 113 +++++++++++++----- tests/conftest.py | 4 +- tests/test_package_docs.py | 11 ++ tests/test_persistent_docs_cache.py | 108 +++++++++++++++++ 6 files changed, 234 insertions(+), 52 deletions(-) diff --git a/src/mcp_server_python_docs/server.py b/src/mcp_server_python_docs/server.py index d9e9af5..4bba114 100644 --- a/src/mcp_server_python_docs/server.py +++ b/src/mcp_server_python_docs/server.py @@ -3,8 +3,10 @@ Thin server layer — delegates all tool logic to services. Dependency rule: server -> services -> retrieval/storage. """ + from __future__ import annotations +import asyncio import importlib.resources import logging import sqlite3 @@ -256,9 +258,7 @@ def get_docs( if version is None and app_ctx.detected_python_version: version = app_ctx.detected_python_version try: - return app_ctx.content_service.get_docs( - slug, version, anchor, max_chars, start_index - ) + return app_ctx.content_service.get_docs(slug, version, anchor, max_chars, start_index) except DocsServerError as e: raise ToolError(str(e)) except Exception as e: @@ -266,7 +266,7 @@ def get_docs( raise ToolError(f"Internal error: {type(e).__name__}") @mcp.tool(annotations=_PYPI_TOOL_ANNOTATIONS) - def lookup_package_docs( + async def lookup_package_docs( package: PackageParam, ctx: Context = None, # type: ignore[assignment] ) -> PackageDocsResult: @@ -277,7 +277,7 @@ def lookup_package_docs( """ app_ctx: AppContext = ctx.request_context.lifespan_context try: - return app_ctx.package_docs_service.lookup(package) + return await asyncio.to_thread(app_ctx.package_docs_service.lookup, package) except Exception as e: 
logger.exception("Unexpected error in lookup_package_docs") raise ToolError(f"Internal error: {type(e).__name__}") diff --git a/src/mcp_server_python_docs/services/package_docs.py b/src/mcp_server_python_docs/services/package_docs.py index 9199804..63a69a6 100644 --- a/src/mcp_server_python_docs/services/package_docs.py +++ b/src/mcp_server_python_docs/services/package_docs.py @@ -2,6 +2,7 @@ Source: https://docs.pypi.org/api/json/ documents GET /pypi//json. """ + from __future__ import annotations import json @@ -16,8 +17,14 @@ from mcp_server_python_docs.services.observability import log_tool_call _ALLOWED = { - "documentation", "docs", "homepage", "home page", "source", "source code", - "repository", "repo", "bug tracker", "issues", "changelog", "release notes", + "documentation", + "docs", + "homepage", + "home page", + "source", + "source code", + "repository", + "repo", } _BLOCKED = ("mirror", "community", "unofficial", "tutorial", "example") _PYPI_METADATA_MAX_BYTES = 5 * 1024 * 1024 @@ -89,22 +96,30 @@ def lookup(self, package: str) -> PackageDocsResult: ) payload = json.loads(data.decode("utf-8")) except HTTPError as e: - if e.code == 404: - return PackageDocsResult( - package=package, version="", metadata_source=metadata_source, - sources=[], note="Package not found on PyPI.", - ) - raise + note = ( + "Package not found on PyPI." if e.code == 404 else f"PyPI returned HTTP {e.code}." 
+ ) + return PackageDocsResult( + package=package, + version="", + metadata_source=metadata_source, + sources=[], + note=note, + ) except (URLError, TimeoutError, json.JSONDecodeError) as e: return PackageDocsResult( - package=package, version="", metadata_source=metadata_source, - sources=[], note=f"Unable to retrieve PyPI metadata: {type(e).__name__}.", + package=package, + version="", + metadata_source=metadata_source, + sources=[], + note=f"Unable to retrieve PyPI metadata: {type(e).__name__}.", ) info = payload.get("info") if isinstance(payload, dict) else {} info = info if isinstance(info, dict) else {} sources = [ - s for s in ( + s + for s in ( _source( "PyPI project", info.get("project_url") or f"https://pypi.org/project/{project}/", @@ -112,7 +127,8 @@ def lookup(self, package: str) -> PackageDocsResult: ), _source("Documentation", info.get("docs_url"), "docs"), _source("Homepage", info.get("home_page"), "homepage"), - ) if s is not None + ) + if s is not None ] skipped: list[str] = [] project_urls = info.get("project_urls") diff --git a/src/mcp_server_python_docs/services/persistent_cache.py b/src/mcp_server_python_docs/services/persistent_cache.py index f260cd5..725a69d 100644 --- a/src/mcp_server_python_docs/services/persistent_cache.py +++ b/src/mcp_server_python_docs/services/persistent_cache.py @@ -1,12 +1,19 @@ """SQLite-backed cache for completed get_docs results across MCP restarts.""" + from __future__ import annotations +import logging import sqlite3 from pathlib import Path from typing import NamedTuple +from pydantic import ValidationError + from mcp_server_python_docs.models import GetDocsResult +logger = logging.getLogger(__name__) +_NO_ANCHOR_KEY = "\x00mcp-python-docs:no-anchor\x00" + class CacheStats(NamedTuple): hits: int = 0 @@ -21,17 +28,24 @@ def __init__(self, cache_path: Path, index_path: Path) -> None: self._cache_path = Path(cache_path) self._fingerprint = self._fingerprint_index(Path(index_path)) self._hits = self._misses = 
self._writes = 0 - self._cache_path.parent.mkdir(parents=True, exist_ok=True) - self._conn = sqlite3.connect(str(self._cache_path), check_same_thread=False) - self._conn.execute("PRAGMA synchronous = NORMAL") - self._conn.execute( - "CREATE TABLE IF NOT EXISTS retrieved_docs_cache (" - "index_fingerprint TEXT NOT NULL, version TEXT NOT NULL, slug TEXT NOT NULL, " - "anchor TEXT NOT NULL, max_chars INTEGER NOT NULL, start_index INTEGER NOT NULL, " - "result_json TEXT NOT NULL, created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP, " - "PRIMARY KEY (index_fingerprint, version, slug, anchor, max_chars, start_index))" - ) - self._conn.commit() + self._conn: sqlite3.Connection | None = None + try: + self._cache_path.parent.mkdir(parents=True, exist_ok=True) + self._conn = sqlite3.connect(str(self._cache_path), check_same_thread=False) + self._conn.execute("PRAGMA synchronous = NORMAL") + self._conn.execute( + "CREATE TABLE IF NOT EXISTS retrieved_docs_cache (" + "index_fingerprint TEXT NOT NULL, version TEXT NOT NULL, slug TEXT NOT NULL, " + "anchor TEXT NOT NULL, max_chars INTEGER NOT NULL, start_index INTEGER NOT NULL, " + "result_json TEXT NOT NULL, created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP, " + "PRIMARY KEY (index_fingerprint, version, slug, anchor, max_chars, start_index))" + ) + self._conn.commit() + except (OSError, sqlite3.Error) as e: + if self._conn is not None: + self._conn.close() + self._conn = None + logger.warning("Persistent docs cache disabled: %s", e) @property def cache_path(self) -> Path: @@ -42,40 +56,73 @@ def _fingerprint_index(index_path: Path) -> str: stat = index_path.stat() return f"{index_path.resolve()}:{stat.st_size}:{stat.st_mtime_ns}" + @staticmethod + def _anchor_key(anchor: str | None) -> str: + return _NO_ANCHOR_KEY if anchor is None else anchor + def stats(self) -> CacheStats: return CacheStats(self._hits, self._misses, self._writes) def get( self, *, version: str, slug: str, anchor: str | None, max_chars: int, start_index: int 
) -> GetDocsResult | None: - row = self._conn.execute( - "SELECT result_json FROM retrieved_docs_cache WHERE index_fingerprint = ? " - "AND version = ? AND slug = ? AND anchor = ? AND max_chars = ? AND start_index = ?", - (self._fingerprint, version, slug, anchor or "", max_chars, start_index), - ).fetchone() + if self._conn is None: + self._misses += 1 + return None + try: + row = self._conn.execute( + "SELECT result_json FROM retrieved_docs_cache WHERE index_fingerprint = ? " + "AND version = ? AND slug = ? AND anchor = ? AND max_chars = ? AND start_index = ?", + ( + self._fingerprint, + version, + slug, + self._anchor_key(anchor), + max_chars, + start_index, + ), + ).fetchone() + except sqlite3.Error as e: + self._misses += 1 + logger.warning("Persistent docs cache read skipped: %s", e) + return None if row is None: self._misses += 1 return None + try: + result = GetDocsResult.model_validate_json(row[0]) + except (ValidationError, ValueError) as e: + self._misses += 1 + logger.warning("Persistent docs cache entry ignored: %s", e) + return None self._hits += 1 - return GetDocsResult.model_validate_json(row[0]) + return result def put(self, *, result: GetDocsResult, max_chars: int, start_index: int) -> None: - self._conn.execute( - "INSERT OR REPLACE INTO retrieved_docs_cache " - "(index_fingerprint, version, slug, anchor, max_chars, start_index, result_json) " - "VALUES (?, ?, ?, ?, ?, ?, ?)", - ( - self._fingerprint, - result.version, - result.slug, - result.anchor or "", - max_chars, - start_index, - result.model_dump_json(), - ), - ) - self._conn.commit() + if self._conn is None: + return + try: + self._conn.execute( + "INSERT OR REPLACE INTO retrieved_docs_cache " + "(index_fingerprint, version, slug, anchor, max_chars, start_index, result_json) " + "VALUES (?, ?, ?, ?, ?, ?, ?)", + ( + self._fingerprint, + result.version, + result.slug, + self._anchor_key(result.anchor), + max_chars, + start_index, + result.model_dump_json(), + ), + ) + self._conn.commit() + 
except (sqlite3.Error, ValueError) as e: + logger.warning("Persistent docs cache write skipped: %s", e) + return self._writes += 1 def close(self) -> None: - self._conn.close() + if self._conn is not None: + self._conn.close() + self._conn = None diff --git a/tests/conftest.py b/tests/conftest.py index 94cefd0..7de4664 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -150,8 +150,8 @@ def populated_db(test_db): "os.path -- Common pathname manipulations", [ ("os.path.join", "os.path.join", 2, 1, - "Join one or more path components intelligently. If a component is an absolute path, " - "all previous components are thrown away."), + "Join one or more path components intelligently. If a component is an " + "absolute path, all previous components are thrown away."), ], ), ] diff --git a/tests/test_package_docs.py b/tests/test_package_docs.py index 32b6ee6..d0455c2 100644 --- a/tests/test_package_docs.py +++ b/tests/test_package_docs.py @@ -76,6 +76,17 @@ def missing(url: str, timeout: float): assert tool.annotations.openWorldHint is True +def test_package_docs_reports_non_404_pypi_http_errors(): + def rate_limited(url: str, timeout: float): + raise HTTPError(url, 429, "Too Many Requests", hdrs=None, fp=None) + + result = PackageDocsService(fetcher=rate_limited).lookup("busy-package") + + assert result.sources == [] + assert result.metadata_source == "https://pypi.org/pypi/busy-package/json" + assert result.note == "PyPI returned HTTP 429." 
+ + def test_package_docs_rejects_oversized_pypi_metadata_without_unbounded_read(): class LargeResp: requested_size: int | None = None diff --git a/tests/test_persistent_docs_cache.py b/tests/test_persistent_docs_cache.py index 92bc01a..6cacc01 100644 --- a/tests/test_persistent_docs_cache.py +++ b/tests/test_persistent_docs_cache.py @@ -1,9 +1,13 @@ """Persistent get_docs cache coverage.""" + from __future__ import annotations +import logging import os +import sqlite3 from pathlib import Path +from mcp_server_python_docs.models import GetDocsResult from mcp_server_python_docs.services.content import ContentService from mcp_server_python_docs.services.persistent_cache import PersistentDocsCache @@ -37,6 +41,17 @@ def _cache(tmp_path: Path, marker: bytes = b"index-1") -> tuple[Path, Persistent return index_path, PersistentDocsCache(tmp_path / "retrieved.sqlite3", index_path) +def _result(content: str, *, anchor: str | None = None) -> GetDocsResult: + return GetDocsResult( + content=content, + slug="library/json.html", + title="json", + version="3.12", + anchor=anchor, + char_count=len(content), + ) + + def test_cache_survives_restart_and_miss_falls_back(populated_db, tmp_path: Path): _doc(populated_db, "3.12", "persisted docs", 1) index_path, cache = _cache(tmp_path) @@ -73,3 +88,96 @@ def test_cache_ignores_stale_entries_after_index_replacement(populated_db, tmp_p ContentService(populated_db, stale).get_docs("library/json.html", "3.12", max_chars=500) assert stale.stats().hits == 0 assert stale.stats().misses == 1 + + +def test_cache_key_distinguishes_no_anchor_from_empty_anchor(tmp_path: Path): + _, cache = _cache(tmp_path) + full_page = _result("full page", anchor=None) + empty_anchor = _result("empty anchor", anchor="") + + cache.put(result=full_page, max_chars=500, start_index=0) + assert ( + cache.get(version="3.12", slug="library/json.html", anchor="", max_chars=500, start_index=0) + is None + ) + + cache.put(result=empty_anchor, max_chars=500, start_index=0) 
+ assert ( + cache.get( + version="3.12", slug="library/json.html", anchor=None, max_chars=500, start_index=0 + ) + == full_page + ) + assert ( + cache.get(version="3.12", slug="library/json.html", anchor="", max_chars=500, start_index=0) + == empty_anchor + ) + + +def test_cache_key_includes_budget_and_start_index(tmp_path: Path): + _, cache = _cache(tmp_path) + page_100 = _result("chars-100") + page_200 = _result("chars-200") + page_start_10 = _result("start-10") + + cache.put(result=page_100, max_chars=100, start_index=0) + cache.put(result=page_200, max_chars=200, start_index=0) + cache.put(result=page_start_10, max_chars=100, start_index=10) + + assert ( + cache.get( + version="3.12", slug="library/json.html", anchor=None, max_chars=100, start_index=0 + ) + == page_100 + ) + assert ( + cache.get( + version="3.12", slug="library/json.html", anchor=None, max_chars=200, start_index=0 + ) + == page_200 + ) + assert ( + cache.get( + version="3.12", slug="library/json.html", anchor=None, max_chars=100, start_index=10 + ) + == page_start_10 + ) + + +def test_corrupt_cache_db_is_best_effort_miss(tmp_path: Path, caplog): + index_path = tmp_path / "index.db" + index_path.write_bytes(b"index") + cache_path = tmp_path / "retrieved.sqlite3" + cache_path.write_bytes(b"not sqlite") + + with caplog.at_level(logging.WARNING): + cache = PersistentDocsCache(cache_path, index_path) + cache.put(result=_result("ignored"), max_chars=100, start_index=0) + assert ( + cache.get( + version="3.12", slug="library/json.html", anchor=None, max_chars=100, start_index=0 + ) + is None + ) + + assert "Persistent docs cache disabled" in caplog.text + assert cache.stats().misses == 1 + assert cache.stats().writes == 0 + + +def test_invalid_cached_json_is_best_effort_miss(tmp_path: Path, caplog): + _, cache = _cache(tmp_path) + cache.put(result=_result("valid"), max_chars=100, start_index=0) + with sqlite3.connect(cache.cache_path) as conn: + conn.execute("UPDATE retrieved_docs_cache SET 
result_json = 'not json'") + + with caplog.at_level(logging.WARNING): + assert ( + cache.get( + version="3.12", slug="library/json.html", anchor=None, max_chars=100, start_index=0 + ) + is None + ) + + assert "Persistent docs cache entry ignored" in caplog.text + assert cache.stats().misses == 1 From f7fe49b9bcfe36ce121d454a646acbb37fce919a Mon Sep 17 00:00:00 2001 From: "bluecloud-gilfoyle[bot]" <262642412+bluecloud-gilfoyle[bot]@users.noreply.github.com> Date: Fri, 8 May 2026 11:24:04 +0000 Subject: [PATCH 4/8] docs: align PyPI lookup scope --- README.md | 3 +-- src/mcp_server_python_docs/services/package_docs.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index c09f9a2..0e4dec6 100644 --- a/README.md +++ b/README.md @@ -215,8 +215,7 @@ it only removes cached retrieval results, not the docs index. `https://docs.pypi.org/api/json/` (`GET /pypi//json`) and returns only sources declared in that package's PyPI metadata: the PyPI project URL, `docs_url`, `home_page`, and allowlisted `project_urls` labels such as -Documentation, Homepage, Source, Repository, Issues, Changelog, and Release -Notes. +Documentation, Homepage, Source, and Repository. The tool makes the trust boundary explicit with `trust_boundary="pypi-declared-metadata"`. 
It does not crawl pages, perform web diff --git a/src/mcp_server_python_docs/services/package_docs.py b/src/mcp_server_python_docs/services/package_docs.py index 63a69a6..e7001e2 100644 --- a/src/mcp_server_python_docs/services/package_docs.py +++ b/src/mcp_server_python_docs/services/package_docs.py @@ -26,7 +26,6 @@ "repository", "repo", } -_BLOCKED = ("mirror", "community", "unofficial", "tutorial", "example") _PYPI_METADATA_MAX_BYTES = 5 * 1024 * 1024 @@ -135,7 +134,7 @@ def lookup(self, package: str) -> PackageDocsResult: if isinstance(project_urls, dict): for label, url in project_urls.items(): lowered = str(label).strip().lower() - if lowered in _ALLOWED and not any(bad in lowered for bad in _BLOCKED): + if lowered in _ALLOWED: found = _source(str(label), url, lowered.replace(" ", "_")) if found is not None and found not in sources: sources.append(found) From a54a2fe1cfd3f759517188d63cd1313fff5483d8 Mon Sep 17 00:00:00 2001 From: "bluecloud-gilfoyle[bot]" <262642412+bluecloud-gilfoyle[bot]@users.noreply.github.com> Date: Fri, 8 May 2026 11:27:38 +0000 Subject: [PATCH 5/8] test: add PR 9 MCP smoke test plan --- PR9_MCP_TEST_PLAN.md | 393 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 393 insertions(+) create mode 100644 PR9_MCP_TEST_PLAN.md diff --git a/PR9_MCP_TEST_PLAN.md b/PR9_MCP_TEST_PLAN.md new file mode 100644 index 0000000..9429899 --- /dev/null +++ b/PR9_MCP_TEST_PLAN.md @@ -0,0 +1,393 @@ +# PR #9 MCP Test Plan — Persistent `get_docs` Cache + `lookup_package_docs` + +PR: https://github.com/ayhammouda/python-docs-mcp-server/pull/9 +Branch: `fix/open-issues-cache-pypi-docs` +Purpose: validate the PR through actual MCP/tool-level behavior, not only unit tests. + +## Goals + +Confirm that: + +1. Existing stdlib docs tools still work. +2. `get_docs` returns correct content. +3. Persistent cache is written and reused across server restarts. +4. 
Cache identity is correct: version, slug, anchor, `max_chars`, `start_index`, and index fingerprint matter. +5. Cache failure is best-effort and does not break retrieval. +6. `lookup_package_docs` returns controlled PyPI-declared docs/homepage/source/repository links. +7. PyPI error modes return controlled notes rather than internal errors. +8. MCP annotations and tool count are coherent. + +## Preconditions + +```bash +cd /srv/openclaw/.openclaw/workspace/tmp/python-docs-mcp-review +git checkout fix/open-issues-cache-pypi-docs || git checkout review-pr-9 +git pull --ff-only origin fix/open-issues-cache-pypi-docs +uv sync --all-extras +``` + +Baseline gates: + +```bash +uv run ruff check src/ tests/ +uv run pyright src/ +uv run pytest --tb=short -q +uv build +``` + +Expected: + +- Ruff passes. +- Pyright passes for `src/`. +- Pytest passes: currently expected `254 passed, 3 skipped`. +- Build succeeds. + +## Test Area 1 — Tool Registration / MCP Contract + +### 1.1 Verify five tools are registered + +Run a small introspection script or use the MCP client harness if available. + +Expected tools: + +- `search_docs` +- `get_docs` +- `lookup_package_docs` +- `list_versions` +- `detect_python_version` + +Expected annotations: + +- stdlib tools: `readOnlyHint=True`, `openWorldHint=False` +- `lookup_package_docs`: `readOnlyHint=True`, `openWorldHint=True` + +Pass criteria: + +- Exactly five tools are exposed. +- `lookup_package_docs` is visibly open-world because it calls PyPI. + +## Test Area 2 — `get_docs` Functional Retrieval + +### 2.1 Full page retrieval + +Call: + +```text +get_docs(slug="library/json.html", version="3.12", max_chars=1000, start_index=0) +``` + +Expected: + +- Result contains JSON documentation content. 
+- `slug == "library/json.html"`
+- `version == "3.12"`
+- `anchor is null`
+- `char_count > 0`
+
+### 2.2 Section retrieval
+
+First use `search_docs` to find a valid section anchor for `json`, then call:
+
+```text
+get_docs(slug="library/json.html", version="3.12", anchor=<anchor>, max_chars=1000, start_index=0)
+```
+
+Expected:
+
+- Result is section-scoped.
+- `anchor == <anchor>`
+- Content is not the full page.
+
+### 2.3 Empty anchor remains invalid
+
+Call:
+
+```text
+get_docs(slug="library/json.html", version="3.12", anchor="", max_chars=1000, start_index=0)
+```
+
+Expected:
+
+- Controlled tool error / page-not-found style response.
+- It must **not** return a cached full-page response.
+
+This specifically verifies the `anchor=None` vs `anchor=""` cache fix.
+
+## Test Area 3 — Persistent Cache Behavior
+
+Before running, locate cache path from platform cache dir. Expected filename:
+
+```text
+retrieved-docs-cache.sqlite3
+```
+
+Likely under:
+
+```text
+~/.cache/mcp-python-docs/retrieved-docs-cache.sqlite3
+```
+
+or the platform cache directory used by the app.
+
+### 3.1 Cache file creation
+
+1. Delete the cache file if present.
+2. Start the MCP server/client.
+3. Call `get_docs` for `library/json.html`.
+4. Stop the server.
+
+Expected:
+
+- Cache file exists.
+- SQLite table `retrieved_docs_cache` exists.
+- At least one row is present.
+
+Suggested inspection:
+
+```bash
+sqlite3 <cache-dir>/retrieved-docs-cache.sqlite3 \
+  "SELECT version, slug, anchor, max_chars, start_index, length(result_json) FROM retrieved_docs_cache;"
+```
+
+### 3.2 Cache survives restart
+
+1. Start server again.
+2. Call the same `get_docs` request.
+
+Expected:
+
+- Same response content.
+- No user-visible behavior change.
+- If logs expose cache hits/misses, second call should be a hit. 
+
+### 3.3 Cache key separates pagination/budget
+
+Call:
+
+```text
+get_docs(slug="library/json.html", version="3.12", max_chars=500, start_index=0)
+get_docs(slug="library/json.html", version="3.12", max_chars=1000, start_index=0)
+get_docs(slug="library/json.html", version="3.12", max_chars=500, start_index=100)
+```
+
+Expected:
+
+- Separate cache rows for each identity.
+- Results are not cross-contaminated.
+
+### 3.4 Corrupt cache is best-effort
+
+1. Stop server.
+2. Replace cache file with invalid bytes:
+
+```bash
+printf 'not sqlite' > <cache-dir>/retrieved-docs-cache.sqlite3
+```
+
+3. Start server.
+4. Call `get_docs(slug="library/json.html", version="3.12")`.
+
+Expected:
+
+- Docs retrieval still succeeds.
+- Warning is logged about disabled/skipped persistent cache.
+- No internal server error.
+
+## Test Area 4 — `lookup_package_docs` Happy Path
+
+### 4.1 Known package with docs/source
+
+Call:
+
+```text
+lookup_package_docs(package="requests")
+```
+
+Expected:
+
+- `metadata_source == "https://pypi.org/pypi/requests/json"`
+- `trust_boundary == "pypi-declared-metadata"`
+- `package` is canonical from PyPI if available.
+- `version` is non-empty.
+- `sources` includes PyPI project URL and likely homepage/source/docs links.
+- Every source URL is `http://` or `https://`.
+- No web search / unofficial mirror fallback.
+
+### 4.2 Normalization
+
+Call:
+
+```text
+lookup_package_docs(package="Sample_Project")
+```
+
+Expected:
+
+- Metadata source normalizes to:
+
+```text
+https://pypi.org/pypi/sample-project/json
+```
+
+- Returned package may be PyPI canonical name.
+
+### 4.3 Missing package
+
+Call:
+
+```text
+lookup_package_docs(package="definitely-not-a-real-package-vision-test-xyz")
+```
+
+Expected:
+
+- `sources == []`
+- note contains package not found / PyPI 404 style message.
+- No internal error. 
+
+## Test Area 5 — PyPI Failure Handling
+
+These may require monkeypatching/fake fetcher or temporary network blocking if not practical via live MCP.
+
+### 5.1 Non-404 HTTP errors
+
+Simulate PyPI `429` or `503`.
+
+Expected:
+
+- Controlled result:
+
+```text
+sources=[]
+note="PyPI returned HTTP 429."
+```
+
+or equivalent code.
+
+### 5.2 Network/JSON failure
+
+Simulate:
+
+- `URLError`
+- timeout
+- invalid JSON body
+
+Expected:
+
+- Controlled result note:
+
+```text
+Unable to retrieve PyPI metadata: <ErrorName>.
+```
+
+- No internal server error.
+
+### 5.3 Oversized PyPI JSON body
+
+Simulate a response larger than 5 MiB.
+
+Expected:
+
+- The service reads at most `5 MiB + 1 byte`.
+- Controlled result:
+
+```text
+sources=[]
+note="PyPI metadata exceeded size limit."
+```
+
+## Test Area 6 — Scope / Trust Boundary
+
+Use a package or fake response with broad `project_urls`, e.g. labels:
+
+- `Documentation`
+- `Homepage`
+- `Source`
+- `Repository`
+- `Issues`
+- `Changelog`
+- `Community mirror`
+- `Tutorial`
+
+Expected:
+
+Included:
+
+- Documentation
+- Homepage
+- Source
+- Repository
+
+Excluded/skipped:
+
+- Issues
+- Changelog
+- Community mirror
+- Tutorial
+
+Result note should mention ignored labels outside controlled allowlist.
+
+## Suggested Manual MCP Smoke Script
+
+If direct MCP client execution is awkward, use a minimal Python script that imports the services and mimics the tool layer:
+
+```python
+from mcp_server_python_docs.services.package_docs import PackageDocsService
+
+for pkg in ["requests", "Sample_Project", "definitely-not-a-real-package-vision-test-xyz"]:
+    print(PackageDocsService().lookup(pkg).model_dump())
+```
+
+For `get_docs`, prefer actual MCP invocation or server lifespan because cache path wiring happens there. 
+ +## Pass / Fail Summary Template + +Return results in this format: + +```markdown +## PR #9 MCP Test Results + +### Environment +- Commit: +- OS: +- Python: +- Cache path: + +### Gates +- ruff: +- pyright src: +- pytest: +- uv build: + +### MCP Tool Tests +- Tool registration: +- get_docs full page: +- get_docs section: +- empty anchor behavior: +- cache file creation: +- cache survives restart: +- cache key separation: +- corrupt cache fallback: +- lookup_package_docs requests: +- missing package: +- failure simulation: +- scope/trust boundary: + +### Verdict +PASS / FAIL + +### Notes / Bugs Found +- ... +``` + +## Final Acceptance Criteria + +The PR is considered MCP-smoke-test ready if: + +- Local gates pass. +- Live MCP/tool invocation returns correct stdlib docs. +- Cache file is created and reused after restart. +- Corrupt cache does not break docs retrieval. +- PyPI lookup returns only controlled PyPI-declared metadata. +- PyPI expected failures return controlled notes. +- No internal errors are observed for expected failure modes. 
From 02d38217192a138837f8fe7a10fa6190b553eab8 Mon Sep 17 00:00:00 2001 From: "bluecloud-gilfoyle[bot]" <262642412+bluecloud-gilfoyle[bot]@users.noreply.github.com> Date: Fri, 8 May 2026 11:39:55 +0000 Subject: [PATCH 6/8] docs: add PR 9 smoke failure fix plan --- PR9_SMOKE_FAILURE_FIX_PLAN.md | 445 ++++++++++++++++++++++++++++++++++ 1 file changed, 445 insertions(+) create mode 100644 PR9_SMOKE_FAILURE_FIX_PLAN.md diff --git a/PR9_SMOKE_FAILURE_FIX_PLAN.md b/PR9_SMOKE_FAILURE_FIX_PLAN.md new file mode 100644 index 0000000..6a2c26e --- /dev/null +++ b/PR9_SMOKE_FAILURE_FIX_PLAN.md @@ -0,0 +1,445 @@ +# PR #9 Smoke Failure Fix + Retest Plan + +PR: https://github.com/ayhammouda/python-docs-mcp-server/pull/9 +Current branch: `fix/open-issues-cache-pypi-docs` +Current head when this plan was written: `a54a2fe` + +## Executive Summary + +The PR implementation itself passed unit/service-level checks, but the MCP smoke test failed for two infrastructure reasons: + +1. **MCP stdio transport bug** — protocol frames appear on `stderr` instead of `stdout`, so a real MCP client hangs. +2. **Local docs index is unusable for `get_docs` smoke testing** — installed index has version `3.13` but `0` documents, so full-page/section docs retrieval and real persistent cache writes cannot be validated. + +This plan fixes those issues and then reruns the MCP smoke test properly. + +```mermaid +flowchart TD + A[Smoke test failed] --> B[Fix stdio transport] + A --> C[Prepare contentful docs index] + B --> D[Add regression tests] + C --> E[Validate get_docs live] + D --> F[Run gates] + E --> F + F --> G[Run MCP smoke test] + G --> H{Pass?} + H -->|Yes| I[PR ready to merge] + H -->|No| J[File focused bug / fix] +``` + +## Issue 1 — MCP stdio responses emitted on `stderr` + +### Symptom + +Actual MCP stdio client invocation failed: + +- JSON-RPC frames were emitted on `stderr`, not `stdout`. +- Client could not consume responses and hung. 
+- Existing service-level tests passed, so this is a transport/runtime bug. + +### Likely Root Cause + +`src/mcp_server_python_docs/__main__.py` intentionally redirects fd `1` to fd `2` early to protect the MCP protocol pipe from import-time stdout pollution: + +```python +_saved_stdout_fd = os.dup(1) +os.dup2(2, 1) +sys.stdout = sys.stderr +``` + +Later, in `serve()`, it restores fd `1`: + +```python +os.dup2(saved_stdout_fd, 1) +os.close(saved_stdout_fd) +mcp_server.run(transport="stdio") +``` + +But `sys.stdout` still references `sys.stderr`. If FastMCP writes through `sys.stdout` rather than raw fd `1`, frames still go to stderr. + +### Fix Strategy + +After restoring fd `1`, also restore the Python-level stdout object before calling FastMCP: + +```python +os.dup2(saved_stdout_fd, 1) +os.close(saved_stdout_fd) +sys.stdout = sys.__stdout__ +``` + +If `sys.__stdout__` does not behave reliably under tests, use an explicit fd wrapper: + +```python +sys.stdout = os.fdopen( + os.dup(1), + "w", + buffering=1, + encoding=getattr(sys.__stdout__, "encoding", None) or "utf-8", + errors="replace", +) +``` + +Prefer the smaller `sys.stdout = sys.__stdout__` first. + +### Constraints + +Keep non-serve commands safe: + +- `doctor` +- `build-index` +- `--help` +- `--version` + +These should still avoid stdout pollution unless deliberately allowed by existing tests. + +### Regression Tests + +Add or update tests around `tests/test_stdio_smoke.py` / `tests/test_stdio_hygiene.py`. + +Required test behavior: + +1. Start server as subprocess: + +```bash +python -m mcp_server_python_docs serve +``` + +2. Send JSON-RPC initialize + tools/list frames through stdin. +3. Assert: + - protocol responses are present on `stdout`; + - `stderr` may contain logs but contains no JSON-RPC response frames; + - stdout contains only valid JSON-RPC frames. 
+ +Suggested assertions: + +```python +assert b'"jsonrpc"' in result.stdout +assert b'"tools/list"' not in result.stderr +assert b'"result"' not in result.stderr # if robust enough +``` + +Better: parse both streams line-by-line and ensure all JSON-RPC response objects came from stdout only. + +### Manual Validation + +Run: + +```bash +uv run pytest tests/test_stdio_smoke.py tests/test_stdio_hygiene.py -q +``` + +Then run the full gates: + +```bash +uv run ruff check src/ tests/ +uv run pyright src/ +uv run pytest --tb=short -q +uv build +``` + +## Issue 2 — Local index has no documents + +### Symptom + +Gilfoyle reported: + +```text +/home/ahammouda/.cache/mcp-python-docs/index.db has only 3.13 and 0 documents +``` + +Observed behavior: + +- `get_docs(... version="3.12")` → version not found +- default `3.13` → page not found +- persistent cache DB exists but has `0` rows because `get_docs` never succeeds + +### Root Cause + +The currently installed index is likely symbol-only or incomplete. The smoke test requires contentful docs ingestion, not only `objects.inv` symbols. + +Attempted full 3.12 build failed because system Python lacks `ensurepip` / `python3.12-venv`. + +### Fix Strategy A — Preferred: Build a contentful index + +1. Run doctor: + +```bash +uv run python -m mcp_server_python_docs doctor +``` + +2. If it reports missing venv/ensurepip support, install the matching venv package. + +On Ubuntu/Debian with Python 3.12: + +```bash +sudo apt-get update +sudo apt-get install -y python3.12-venv +``` + +This requires elevated approval. + +3. Build a minimal contentful index for smoke tests. + +Prefer one version to keep runtime manageable: + +```bash +uv run python -m mcp_server_python_docs build-index --versions 3.13 +``` + +If the project specifically needs 3.12 too: + +```bash +uv run python -m mcp_server_python_docs build-index --versions 3.12,3.13 +``` + +4. 
Validate index content: + +```bash +sqlite3 ~/.cache/mcp-python-docs/index.db \ + "SELECT ds.version, COUNT(d.id) AS documents, COUNT(s.id) AS sections + FROM doc_sets ds + LEFT JOIN documents d ON d.doc_set_id = ds.id + LEFT JOIN sections s ON s.document_id = d.id + GROUP BY ds.version;" +``` + +Expected: + +- `documents > 0` +- `sections > 0` + +5. Confirm a known page exists: + +```bash +sqlite3 ~/.cache/mcp-python-docs/index.db \ + "SELECT slug, title FROM documents WHERE slug='library/json.html' LIMIT 1;" +``` + +Expected: + +- one row for `library/json.html` + +### Fix Strategy B — If full docs build is too slow/unavailable + +Use a **temporary synthetic content index** for smoke tests only. + +Do not replace the user’s real index permanently. Use an isolated `XDG_CACHE_HOME`: + +```bash +export XDG_CACHE_HOME="$(mktemp -d)" +``` + +Create a minimal `index.db` with: + +- one `doc_sets` row for `3.13` +- one `documents` row for `library/json.html` +- one or more `sections` rows, including a valid anchor like `top` +- rebuilt FTS tables if needed + +Then run MCP server with that same `XDG_CACHE_HOME`. + +This is acceptable for transport/cache smoke validation, but final release confidence should still include one real contentful index build when possible. + +### Regression Test Recommendation + +Add a new smoke fixture that creates a temporary contentful index and starts the server with isolated cache env. + +Test should not depend on the machine’s global `~/.cache` state. + +Suggested new test file: + +```text +tests/test_mcp_get_docs_cache_smoke.py +``` + +Test behavior: + +1. Create temp cache dir. +2. Create minimal valid index at `/mcp-python-docs/index.db`. +3. Start server subprocess with `XDG_CACHE_HOME=`. +4. Call `get_docs` through MCP stdio. +5. Assert returned content. +6. Stop server. +7. Inspect `retrieved-docs-cache.sqlite3` has one row. +8. Restart server. +9. Call same `get_docs`; assert same result. +10. 
Corrupt cache file; restart; assert `get_docs` still succeeds. + +This makes future smoke tests deterministic and prevents the exact failure seen here: “global index exists but has no documents.” + +## Issue 3 — Persistent cache live validation blocked + +### Symptom + +Cache service passed synthetic tests, but real cache creation/restart behavior could not be validated because `get_docs` failed before writing cache rows. + +### Fix Strategy + +After Issue 2 is fixed, run the cache smoke tests again. + +Manual commands: + +```bash +CACHE_DIR="$HOME/.cache/mcp-python-docs" +rm -f "$CACHE_DIR/retrieved-docs-cache.sqlite3" +``` + +Then call live MCP: + +```text +get_docs(slug="library/json.html", version="3.13", max_chars=1000, start_index=0) +``` + +Inspect: + +```bash +sqlite3 "$CACHE_DIR/retrieved-docs-cache.sqlite3" \ + "SELECT version, slug, anchor, max_chars, start_index, length(result_json) + FROM retrieved_docs_cache;" +``` + +Expected: + +- row exists for version `3.13` +- slug `library/json.html` +- anchor sentinel/internal representation is not user-visible, but row exists +- `result_json` length > 0 + +Restart server and repeat call. + +Expected: + +- same response +- if stats/logs expose cache hit, hit count increments + +## Issue 4 — Test the actual MCP path, not just services + +### Current State + +The smoke run had many service-level passes. Those are useful but not enough because MCP transport failed. + +### Required Post-Fix MCP Tests + +Use an actual MCP client or subprocess JSON-RPC harness. + +Minimum calls: + +1. `tools/list` +2. `tools/call search_docs` +3. `tools/call get_docs` full page +4. `tools/call get_docs` valid section +5. `tools/call get_docs` empty anchor +6. `tools/call lookup_package_docs requests` +7. 
`tools/call lookup_package_docs missing package` + +### Expected Results + +| Test | Expected | +|---|---| +| tools/list | five tools, `lookup_package_docs` present | +| search_docs | returns hits from local index | +| get_docs full page | returns content and writes persistent cache row | +| get_docs section | returns section content | +| get_docs empty anchor | controlled not-found error, not cached full page | +| lookup_package_docs requests | PyPI-declared sources + trust boundary | +| missing package | `sources=[]`, controlled note | + +## Proposed Work Breakdown for Gilfoyle + +### Phase 1 — Fix stdio transport + +- Patch `serve()` to restore `sys.stdout` after fd restoration. +- Add regression test proving JSON-RPC responses are on stdout, not stderr. +- Run stdio tests. + +### Phase 2 — Make smoke index deterministic + +Choose one: + +- **Option A:** install `python3.12-venv` and build a real contentful index. +- **Option B:** add a temp synthetic-index smoke test harness using isolated `XDG_CACHE_HOME`. + +Recommendation: do both if time allows. Option B should become the permanent CI regression test; Option A validates real user setup. + +### Phase 3 — Rerun PR #9 MCP smoke plan + +Use `PR9_MCP_TEST_PLAN.md` again after the fixes. + +### Phase 4 — Report + +Return a result summary: + +```markdown +## PR #9 Transport/Index Fix Results + +### Fixes Applied +- ... + +### Gates +- ruff: +- pyright src: +- pytest: +- uv build: + +### MCP Stdio +- initialize: +- tools/list: +- stdout/stderr framing: + +### Index +- index build/fixture: +- versions: +- document count: +- section count: + +### get_docs + Cache +- full page: +- section: +- empty anchor: +- cache file rows: +- restart cache reuse: +- corrupt cache fallback: + +### lookup_package_docs +- requests: +- missing package: +- failure simulations: + +### Verdict +PASS / FAIL + +### Remaining Blockers +- ... +``` + +## Definition of Done + +The issues are fixed when: + +1. 
MCP JSON-RPC responses are emitted on stdout only. +2. stderr contains logs only, not protocol response frames. +3. Smoke tests use a contentful index, either real or deterministic synthetic fixture. +4. `get_docs` works through actual MCP, not just service calls. +5. Persistent cache file is created after real `get_docs` MCP call. +6. Cache survives server restart. +7. Corrupt cache does not break real MCP `get_docs`. +8. `lookup_package_docs` works through actual MCP. +9. Full validation passes: + +```bash +uv run ruff check src/ tests/ +uv run pyright src/ +uv run pytest --tb=short -q +uv build +``` + +10. GitHub CI is green. + +## Merge Guidance + +Do **not** merge PR #9 until either: + +- the stdio transport bug is fixed in this PR/branch and live MCP smoke tests pass; or +- the stdio transport bug is confirmed pre-existing, tracked separately, and Aymen explicitly accepts merging PR #9 with only service-level validation. + +My recommendation: fix stdio now. An MCP server whose protocol writes to the wrong stream is a red-alert goblin, not a “later maybe.” From d8c5c7e2e199ac8464a9c3e2f8060f3448851b11 Mon Sep 17 00:00:00 2001 From: "bluecloud-gilfoyle[bot]" <262642412+bluecloud-gilfoyle[bot]@users.noreply.github.com> Date: Fri, 8 May 2026 11:46:42 +0000 Subject: [PATCH 7/8] fix: restore stdio transport stdout --- src/mcp_server_python_docs/__main__.py | 4 + tests/test_mcp_get_docs_cache_smoke.py | 207 +++++++++++++++++++++++++ tests/test_stdio_smoke.py | 76 ++++++--- 3 files changed, 262 insertions(+), 25 deletions(-) create mode 100644 tests/test_mcp_get_docs_cache_smoke.py diff --git a/src/mcp_server_python_docs/__main__.py b/src/mcp_server_python_docs/__main__.py index 35fc016..a4baa5f 100644 --- a/src/mcp_server_python_docs/__main__.py +++ b/src/mcp_server_python_docs/__main__.py @@ -98,6 +98,10 @@ def serve() -> None: # print to stdout during MCP communication. 
os.dup2(saved_stdout_fd, 1) os.close(saved_stdout_fd) + # FastMCP writes JSON-RPC frames through Python's stdout object. + # Restore it as well as fd 1, otherwise protocol frames are emitted + # on stderr because module import redirected sys.stdout there. + sys.stdout = sys.__stdout__ try: mcp_server.run(transport="stdio") diff --git a/tests/test_mcp_get_docs_cache_smoke.py b/tests/test_mcp_get_docs_cache_smoke.py new file mode 100644 index 0000000..2d4761c --- /dev/null +++ b/tests/test_mcp_get_docs_cache_smoke.py @@ -0,0 +1,207 @@ +"""MCP subprocess smoke tests for get_docs and persistent cache behavior.""" +from __future__ import annotations + +import sqlite3 +import subprocess +import sys +from pathlib import Path + +from mcp_server_python_docs.storage.db import bootstrap_schema, get_readwrite_connection +from tests.test_stdio_smoke import ( + _assert_protocol_on_stdout_only, + _find_response, + _isolated_cache_env, + _make_notification, + _make_request, +) + + +def _create_contentful_json_index(cache_dir: Path) -> Path: + """Create a deterministic contentful docs index for subprocess smoke tests.""" + cache_dir.mkdir(parents=True, exist_ok=True) + db_path = cache_dir / "index.db" + conn = get_readwrite_connection(db_path) + bootstrap_schema(conn) + conn.execute( + "INSERT INTO doc_sets (source, version, language, label, is_default, base_url) " + "VALUES ('python-docs', '3.13', 'en', 'Python 3.13', 1, " + "'https://docs.python.org/3.13/')" + ) + doc_set_id = conn.execute("SELECT last_insert_rowid()").fetchone()[0] + conn.execute( + "INSERT INTO symbols (doc_set_id, qualified_name, normalized_name, " + "module, symbol_type, uri, anchor) " + "VALUES (?, 'json.dumps', 'json_dumps', 'json', 'function', " + "'library/json.html#json.dumps', 'json.dumps')", + (doc_set_id,), + ) + conn.execute( + "INSERT INTO documents (doc_set_id, uri, slug, title, content_text, char_count) " + "VALUES (?, 'library/json.html', 'library/json.html', " + "'json — JSON encoder and 
decoder', " + "'The json module exposes APIs for encoding and decoding JSON data.', 64)", + (doc_set_id,), + ) + doc_id = conn.execute("SELECT last_insert_rowid()").fetchone()[0] + conn.executemany( + "INSERT INTO sections (document_id, uri, anchor, heading, level, ordinal, " + "content_text, char_count) VALUES (?, ?, ?, ?, ?, ?, ?, ?)", + [ + ( + doc_id, + "library/json.html#top", + "top", + "json — JSON encoder and decoder", + 1, + 1, + "The json module exposes APIs for encoding and decoding JSON data.", + 64, + ), + ( + doc_id, + "library/json.html#json.dumps", + "json.dumps", + "json.dumps", + 2, + 2, + "Serialize obj to a JSON formatted str using a conversion table.", + 62, + ), + ], + ) + conn.commit() + conn.execute("INSERT INTO symbols_fts(symbols_fts) VALUES('rebuild')") + conn.execute("INSERT INTO sections_fts(sections_fts) VALUES('rebuild')") + conn.execute("INSERT INTO examples_fts(examples_fts) VALUES('rebuild')") + conn.commit() + conn.close() + return db_path + + +def _run_server(stdin_data: bytes, env: dict[str, str]) -> subprocess.CompletedProcess: + return subprocess.run( + [sys.executable, "-m", "mcp_server_python_docs", "serve"], + input=stdin_data, + capture_output=True, + timeout=15, + env=env, + ) + + +def _initialized_tool_call(name: str, arguments: dict, req_id: int = 2) -> bytes: + return ( + _make_request( + "initialize", + { + "protocolVersion": "2024-11-05", + "capabilities": {}, + "clientInfo": {"name": "test", "version": "0.1"}, + }, + req_id=1, + ) + + _make_notification("notifications/initialized") + + _make_request("tools/call", {"name": name, "arguments": arguments}, req_id=req_id) + ) + + +def _tool_structured_content(result: subprocess.CompletedProcess, req_id: int = 2) -> dict: + responses = _assert_protocol_on_stdout_only(result) + response = _find_response(responses, req_id) + assert response is not None, f"Missing tools/call response: {responses}" + assert "result" in response, response + assert 
response["result"].get("isError") is not True, response + return response["result"]["structuredContent"] + + +def _tool_error_text(result: subprocess.CompletedProcess, req_id: int = 2) -> str: + responses = _assert_protocol_on_stdout_only(result) + response = _find_response(responses, req_id) + assert response is not None, f"Missing tools/call response: {responses}" + assert response["result"].get("isError") is True, response + return "\n".join(item.get("text", "") for item in response["result"].get("content", [])) + + +def test_get_docs_cache_restart_and_corrupt_cache_fallback(tmp_path: Path): + """Exercise get_docs through real MCP stdio with isolated contentful cache.""" + env, cache_dir = _isolated_cache_env(tmp_path) + _create_contentful_json_index(cache_dir) + cache_path = cache_dir / "retrieved-docs-cache.sqlite3" + + full_page = _tool_structured_content( + _run_server( + _initialized_tool_call( + "get_docs", + {"slug": "library/json.html", "version": "3.13"}, + ), + env, + ) + ) + assert full_page["slug"] == "library/json.html" + assert full_page["anchor"] is None + assert "json module" in full_page["content"] + + with sqlite3.connect(cache_path) as conn: + rows = conn.execute( + "SELECT version, slug, anchor, max_chars, start_index, length(result_json) " + "FROM retrieved_docs_cache" + ).fetchall() + assert len(rows) == 1 + version, slug, anchor, max_chars, start_index, result_json_length = rows[0] + assert (version, slug, anchor, max_chars, start_index) == ( + "3.13", + "library/json.html", + "\x00mcp-python-docs:no-anchor\x00", + 8000, + 0, + ) + assert result_json_length > 0 + + restarted_page = _tool_structured_content( + _run_server( + _initialized_tool_call( + "get_docs", + {"slug": "library/json.html", "version": "3.13"}, + ), + env, + ) + ) + assert restarted_page == full_page + + section = _tool_structured_content( + _run_server( + _initialized_tool_call( + "get_docs", + { + "slug": "library/json.html", + "version": "3.13", + "anchor": 
"json.dumps", + }, + ), + env, + ) + ) + assert section["anchor"] == "json.dumps" + assert "Serialize obj" in section["content"] + + empty_anchor_error = _tool_error_text( + _run_server( + _initialized_tool_call( + "get_docs", + {"slug": "library/json.html", "version": "3.13", "anchor": ""}, + ), + env, + ) + ) + assert "Section '' not found" in empty_anchor_error + + cache_path.write_bytes(b"not a sqlite database") + after_corrupt_cache = _tool_structured_content( + _run_server( + _initialized_tool_call( + "get_docs", + {"slug": "library/json.html", "version": "3.13"}, + ), + env, + ) + ) + assert after_corrupt_cache == full_page diff --git a/tests/test_stdio_smoke.py b/tests/test_stdio_smoke.py index f2e7541..1b10563 100644 --- a/tests/test_stdio_smoke.py +++ b/tests/test_stdio_smoke.py @@ -16,6 +16,7 @@ import os import subprocess import sys +import time from pathlib import Path import pytest @@ -131,6 +132,36 @@ def _read_responses(stdout_data: bytes) -> list[dict]: return responses +def _jsonrpc_frames(stream_data: bytes) -> list[dict]: + """Return JSON-RPC objects parsed from a stream, ignoring log lines.""" + frames: list[dict] = [] + for line in stream_data.split(b"\n"): + line = line.strip() + if not line: + continue + try: + parsed = json.loads(line) + except json.JSONDecodeError: + continue + if isinstance(parsed, dict) and parsed.get("jsonrpc") == "2.0": + frames.append(parsed) + return frames + + +def _assert_protocol_on_stdout_only(result: subprocess.CompletedProcess) -> list[dict]: + """Assert JSON-RPC protocol frames are emitted only on stdout.""" + responses = _read_responses(result.stdout) + assert responses, f"No JSON-RPC responses on stdout; stderr was: {result.stderr!r}" + + for resp in responses: + assert "_raw" not in resp, f"Non-JSON stdout pollution: {resp.get('_raw')}" + assert resp.get("jsonrpc") == "2.0", f"Missing JSON-RPC frame on stdout: {resp}" + + stderr_frames = _jsonrpc_frames(result.stderr) + assert stderr_frames == [], 
f"JSON-RPC frames leaked to stderr: {stderr_frames}" + return responses + + def _find_response(responses: list[dict], req_id: int) -> dict | None: """Find a JSON-RPC response matching the given request id.""" for resp in responses: @@ -152,14 +183,23 @@ def _setup_test_env(self, tmp_path): def _run_server_with_input( self, stdin_data: bytes, timeout: int = 15, ) -> subprocess.CompletedProcess: - """Run the server subprocess with the given stdin and return the result.""" - return subprocess.run( + """Run the server subprocess with line-paced stdin and return the result.""" + proc = subprocess.Popen( [sys.executable, "-m", "mcp_server_python_docs", "serve"], - input=stdin_data, - capture_output=True, - timeout=timeout, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, env=self.env, ) + assert proc.stdin is not None + for index, line in enumerate(stdin_data.splitlines(keepends=True)): + proc.stdin.write(line) + proc.stdin.flush() + time.sleep(0.3 if index == 0 else 0.05) + proc.stdin.close() + proc.stdin = None + stdout, stderr = proc.communicate(timeout=timeout) + return subprocess.CompletedProcess(proc.args, proc.returncode, stdout, stderr) def test_server_lists_tools_no_stdout_pollution(self): """Server returns tool list and stdout has no non-JSON-RPC bytes.""" @@ -175,16 +215,11 @@ def test_server_lists_tools_no_stdout_pollution(self): result = self._run_server_with_input(stdin_data) - responses = _read_responses(result.stdout) - - # Every line on stdout must be valid JSON-RPC - for resp in responses: - assert "_raw" not in resp, f"Non-JSON stdout pollution: {resp.get('_raw')}" + responses = _assert_protocol_on_stdout_only(result) # Find the tools/list response tools_resp = _find_response(responses, 2) - if tools_resp is None: - pytest.skip("Server exited before returning tools/list response") + assert tools_resp is not None, f"Server exited before tools/list response: {responses}" assert "result" in tools_resp, f"tools/list error: 
{tools_resp}" tool_names = [t["name"] for t in tools_resp["result"].get("tools", [])] assert "search_docs" in tool_names @@ -209,16 +244,11 @@ def test_search_docs_round_trip(self): ) result = self._run_server_with_input(stdin_data) - responses = _read_responses(result.stdout) - - # No stdout pollution - for resp in responses: - assert "_raw" not in resp, f"Stdout pollution: {resp.get('_raw')}" + responses = _assert_protocol_on_stdout_only(result) # Find the tools/call response call_resp = _find_response(responses, 2) - if call_resp is None: - pytest.skip("Server exited before returning search_docs response") + assert call_resp is not None, f"Server exited before search_docs response: {responses}" assert "result" in call_resp, f"tools/call error: {call_resp}" content = call_resp["result"].get("content", []) assert len(content) >= 1, "search_docs returned no content" @@ -239,14 +269,10 @@ def test_list_versions_round_trip(self): ) result = self._run_server_with_input(stdin_data) - responses = _read_responses(result.stdout) - - for resp in responses: - assert "_raw" not in resp, f"Stdout pollution: {resp.get('_raw')}" + responses = _assert_protocol_on_stdout_only(result) call_resp = _find_response(responses, 2) - if call_resp is None: - pytest.skip("Server exited before returning list_versions response") + assert call_resp is not None, f"Server exited before list_versions response: {responses}" assert "result" in call_resp, f"tools/call error: {call_resp}" def test_all_stdout_is_valid_jsonrpc(self): From 694c2057ee4c5af2465439ad966dd65a02379879 Mon Sep 17 00:00:00 2001 From: Aymen Hammouda Date: Fri, 8 May 2026 23:08:46 +0200 Subject: [PATCH 8/8] fix: return retrievable slugs for symbol hits --- .../retrieval/ranker.py | 118 +++++++++++++++--- tests/test_services.py | 35 ++++++ 2 files changed, 137 insertions(+), 16 deletions(-) diff --git a/src/mcp_server_python_docs/retrieval/ranker.py b/src/mcp_server_python_docs/retrieval/ranker.py index b203277..e42c4a5 100644 
--- a/src/mcp_server_python_docs/retrieval/ranker.py +++ b/src/mcp_server_python_docs/retrieval/ranker.py @@ -16,6 +16,89 @@ logger = logging.getLogger(__name__) +def _page_uri(uri: str) -> str: + """Return the page part of an objects.inv URI.""" + return uri.split("#", 1)[0] + + +def _document_candidates(uri: str) -> tuple[str, ...]: + """Return possible document slugs for a symbol URI. + + ``objects.inv`` entries use HTML paths such as ``library/json.html``, + while Sphinx JSON content is ingested with extensionless slugs such as + ``library/json``. Prefer the exact URI first, then the extensionless form. + """ + page_uri = _page_uri(uri) + if page_uri.endswith(".html"): + return (page_uri, page_uri[:-5]) + return (page_uri,) + + +def _resolve_symbol_location( + conn: sqlite3.Connection, + row: sqlite3.Row, +) -> tuple[str, str]: + """Resolve a symbol row to a get_docs-compatible slug and anchor.""" + uri = str(row["uri"]) + fallback_slug = _page_uri(uri) + fallback_anchor = str(row["anchor"] or "") + + section_id = row["section_id"] + if section_id is not None: + section_row = conn.execute( + """ + SELECT d.slug, s.anchor + FROM sections s + JOIN documents d ON s.document_id = d.id + WHERE s.id = ? + LIMIT 1 + """, + (section_id,), + ).fetchone() + if section_row is not None: + return section_row["slug"], section_row["anchor"] or "" + + doc_row = None + document_id = row["document_id"] + if document_id is not None: + doc_row = conn.execute( + "SELECT id, slug FROM documents WHERE id = ? LIMIT 1", + (document_id,), + ).fetchone() + + if doc_row is None: + for candidate in _document_candidates(uri): + doc_row = conn.execute( + """ + SELECT id, slug + FROM documents + WHERE doc_set_id = ? AND (slug = ? OR uri = ?) 
+ LIMIT 1 + """, + (row["doc_set_id"], candidate, candidate), + ).fetchone() + if doc_row is not None: + break + + if doc_row is None: + return fallback_slug, fallback_anchor + + if fallback_anchor: + section_row = conn.execute( + """ + SELECT anchor + FROM sections + WHERE document_id = ? AND anchor = ? + LIMIT 1 + """, + (doc_row["id"], fallback_anchor), + ).fetchone() + if section_row is not None: + return doc_row["slug"], section_row["anchor"] or "" + + return doc_row["slug"], "" + + def _normalize_scores(hits: list[SymbolHit]) -> list[SymbolHit]: """Normalize BM25 scores to [0.1, 1.0] range. @@ -129,7 +212,8 @@ def search_symbols( try: cursor = conn.execute( """ - SELECT sym.id, sym.qualified_name, sym.symbol_type, sym.uri, + SELECT sym.id, sym.doc_set_id, sym.document_id, sym.section_id, + sym.qualified_name, sym.symbol_type, sym.uri, sym.anchor, sym.module, d.version, bm25(symbols_fts, 10.0, 1.0) as score, snippet(symbols_fts, 0, '**', '**', '...', 32) as snippet_text @@ -148,19 +232,19 @@ def search_symbols( logger.warning("FTS5 query failed for symbols: %r", match_expr) return [] - hits = [ - SymbolHit( + hits = [] + for row in rows: + slug, anchor = _resolve_symbol_location(conn, row) + hits.append(SymbolHit( uri=row["uri"], title=row["qualified_name"], kind=row["symbol_type"] or "symbol", snippet=row["snippet_text"] or "", score=row["score"], version=row["version"], - slug=row["uri"].split("#")[0] if "#" in row["uri"] else row["uri"], - anchor=row["anchor"] or "", - ) - for row in rows - ] + slug=slug, + anchor=anchor, + )) return _normalize_scores(hits) @@ -251,7 +335,8 @@ def lookup_symbols_exact( cursor = conn.execute( """ - SELECT s.qualified_name, s.symbol_type, s.uri, s.anchor, + SELECT s.doc_set_id, s.document_id, s.section_id, + s.qualified_name, s.symbol_type, s.uri, s.anchor, s.module, d.version FROM symbols s JOIN doc_sets d ON s.doc_set_id = d.id @@ -264,16 +349,17 @@ def lookup_symbols_exact( ) rows = cursor.fetchall() - return [ - 
SymbolHit( + hits = [] + for row in rows: + slug, anchor = _resolve_symbol_location(conn, row) + hits.append(SymbolHit( uri=row["uri"], title=row["qualified_name"], kind=row["symbol_type"] or "symbol", snippet="", score=1.0 if row["qualified_name"] == query else 0.8, version=row["version"], - slug=row["uri"].split("#")[0] if "#" in row["uri"] else row["uri"], - anchor=row["anchor"] or "", - ) - for row in rows - ] + slug=slug, + anchor=anchor, + )) + return hits diff --git a/tests/test_services.py b/tests/test_services.py index b084775..f563cdc 100644 --- a/tests/test_services.py +++ b/tests/test_services.py @@ -102,6 +102,41 @@ def test_search_symbol_fast_path(self, populated_with_content): assert len(result.hits) >= 1 assert result.hits[0].title == "asyncio.TaskGroup" + def test_symbol_hit_slug_is_retrievable_when_content_slug_is_extensionless( + self, + populated_db, + ): + db = populated_db + doc_set_id = db.execute("SELECT id FROM doc_sets LIMIT 1").fetchone()[0] + db.execute( + "INSERT INTO documents (doc_set_id, uri, slug, title, content_text, char_count) " + "VALUES (?, 'library/json', 'library/json', 'json', " + "'json page content', 17)", + (doc_set_id,), + ) + doc_id = db.execute("SELECT last_insert_rowid()").fetchone()[0] + db.execute( + "INSERT INTO sections (document_id, uri, anchor, heading, level, ordinal, " + "content_text, char_count) VALUES (?, 'library/json', '', 'Introduction', " + "1, 0, 'json.dumps guidance for command line tools.', 43)", + (doc_id,), + ) + db.execute( + "INSERT INTO symbols (doc_set_id, qualified_name, normalized_name, " + "symbol_type, uri, anchor, module) VALUES (?, 'json.dumps', " + "'json.dumps', 'function', 'library/json.html#json.dumps', " + "'json.dumps', 'json')", + (doc_set_id,), + ) + db.commit() + + hit = SearchService(db, {}).search("json.dumps", version="3.13", kind="symbol").hits[0] + + assert hit.slug == "library/json" + assert hit.anchor == "" + docs = ContentService(db).get_docs(hit.slug, hit.version, 
hit.anchor or None) + assert "json.dumps guidance" in docs.content + def test_search_no_results(self, populated_with_content): svc = SearchService(populated_with_content, {}) result = svc.search("nonexistent_xyz_symbol", kind="symbol")