From 61e0994966bc18fa922d9b7af133ac2d65bf1394 Mon Sep 17 00:00:00 2001 From: "bluecloud-gilfoyle[bot]" <262642412+bluecloud-gilfoyle[bot]@users.noreply.github.com> Date: Thu, 7 May 2026 21:38:32 +0000 Subject: [PATCH 1/8] feat: cache docs and expose PyPI package docs lookup --- README.md | 40 +++++- src/mcp_server_python_docs/app_context.py | 4 + src/mcp_server_python_docs/models.py | 43 ++++++ src/mcp_server_python_docs/server.py | 45 ++++++- .../services/content.py | 28 +++- .../services/package_docs.py | 124 ++++++++++++++++++ .../services/persistent_cache.py | 81 ++++++++++++ tests/test_package_docs.py | 69 ++++++++++ tests/test_persistent_docs_cache.py | 75 +++++++++++ tests/test_services.py | 4 +- 10 files changed, 505 insertions(+), 8 deletions(-) create mode 100644 src/mcp_server_python_docs/services/package_docs.py create mode 100644 src/mcp_server_python_docs/services/persistent_cache.py create mode 100644 tests/test_package_docs.py create mode 100644 tests/test_persistent_docs_cache.py diff --git a/README.md b/README.md index cceb308..c09f9a2 100644 --- a/README.md +++ b/README.md @@ -164,12 +164,13 @@ Contributor commands and validation steps live in ## Tools -The server currently exposes four MCP tools: +The server currently exposes five MCP tools: | Tool | Description | |------|-------------| | `search_docs` | Search Python stdlib docs by query. Supports symbol lookup (`asyncio.TaskGroup`), module search (`json`), and free-text search. Returns ranked hits with BM25 scoring and snippet excerpts. | -| `get_docs` | Retrieve a specific documentation page or section by slug and optional anchor. Returns markdown content with budget-enforced truncation and pagination. | +| `get_docs` | Retrieve a specific documentation page or section by slug and optional anchor. Returns markdown content with budget-enforced truncation and pagination. Retrieved results are cached on disk by Python docs version and request identity. 
| +| `lookup_package_docs` | Look up official PyPI package metadata and return package-declared documentation/homepage/source URLs. This is a controlled PyPI metadata lookup, not generic web search. | | `list_versions` | List all indexed Python versions with metadata. | | `detect_python_version` | Detect the user's local Python version and report whether it matches an indexed documentation version. | @@ -184,10 +185,43 @@ Use this server when you need: Use a generic fetcher or broader docs MCP when you need: -- third-party package docs outside the Python stdlib +- arbitrary third-party package content beyond package-declared PyPI metadata - arbitrary web pages - mixed-source research across many frameworks +## Retrieved docs cache + +`get_docs` responses are cached across MCP client/server restarts in the +platform cache directory: + +```text +/mcp-python-docs/retrieved-docs-cache.sqlite3 +``` + +The cache stores completed `get_docs` results, including page/section content, +for the resolved Python docs version plus request identity (`slug`, optional +`anchor`, `max_chars`, and `start_index`). Cache misses fall back to the normal +local index retrieval path and then write the retrieved result. + +Cache entries are also scoped to a fingerprint of the local `index.db` file +(path, size, and modification timestamp). If you rebuild or replace the local +docs index, older entries are ignored automatically instead of being returned +for the new index generation. Deleting `retrieved-docs-cache.sqlite3` is safe; +it only removes cached retrieval results, not the docs index. + +## PyPI package docs lookup + +`lookup_package_docs` queries the official PyPI JSON API documented at +`https://docs.pypi.org/api/json/` (`GET /pypi//json`) and returns only +sources declared in that package's PyPI metadata: the PyPI project URL, +`docs_url`, `home_page`, and allowlisted `project_urls` labels such as +Documentation, Homepage, Source, Repository, Issues, Changelog, and Release +Notes. 
+ +The tool makes the trust boundary explicit with +`trust_boundary="pypi-declared-metadata"`. It does not crawl pages, perform web +search, or silently fall back to unofficial community mirrors. + ## Diagnostics Check the local environment: diff --git a/src/mcp_server_python_docs/app_context.py b/src/mcp_server_python_docs/app_context.py index 49e3d87..e8cc477 100644 --- a/src/mcp_server_python_docs/app_context.py +++ b/src/mcp_server_python_docs/app_context.py @@ -11,6 +11,8 @@ from pathlib import Path from mcp_server_python_docs.services.content import ContentService +from mcp_server_python_docs.services.package_docs import PackageDocsService +from mcp_server_python_docs.services.persistent_cache import PersistentDocsCache from mcp_server_python_docs.services.search import SearchService from mcp_server_python_docs.services.version import VersionService @@ -24,6 +26,8 @@ class AppContext: search_service: SearchService content_service: ContentService version_service: VersionService + package_docs_service: PackageDocsService = field(default_factory=PackageDocsService) + persistent_docs_cache: PersistentDocsCache | None = None synonyms: dict[str, list[str]] = field(default_factory=dict) detected_python_version: str | None = None detected_python_source: str | None = None diff --git a/src/mcp_server_python_docs/models.py b/src/mcp_server_python_docs/models.py index a688167..c08e86e 100644 --- a/src/mcp_server_python_docs/models.py +++ b/src/mcp_server_python_docs/models.py @@ -160,3 +160,46 @@ class DetectPythonVersionResult(BaseModel): is_default: bool = Field( description="Whether this detected version is being used as the default for get_docs" ) + + +# --- lookup_package_docs models --- + + +class PackageDocsInput(BaseModel): + """Input parameters for lookup_package_docs tool.""" + + package: str = Field( + min_length=1, + max_length=214, + description="PyPI package/project name (e.g. 
'requests').", + ) + + +class PackageDocsSource(BaseModel): + """A package-declared documentation or project source URL.""" + + label: str = Field(description="Label from PyPI metadata or a normalized core metadata field") + url: str = Field(description="HTTP(S) URL declared by the package on PyPI") + kind: str = Field(description="Source category, such as docs, homepage, source, or pypi") + declared_by: str = Field(description="Where this source declaration came from") + + +class PackageDocsResult(BaseModel): + """Output from lookup_package_docs tool.""" + + package: str = Field(description="Canonical package name returned by PyPI when available") + version: str = Field(description="Latest version reported by PyPI metadata") + summary: str = Field(default="", description="Package summary from PyPI metadata") + metadata_source: str = Field(description="Official PyPI JSON API URL used for lookup") + trust_boundary: str = Field( + default="pypi-declared-metadata", + description="Indicates results are limited to PyPI/project-declared metadata", + ) + sources: list[PackageDocsSource] = Field( + default_factory=list, + description="Package-declared PyPI, documentation, homepage, and source URLs", + ) + note: str | None = Field( + default=None, + description="Controlled-scope note, for example skipped labels or not-found details", + ) diff --git a/src/mcp_server_python_docs/server.py b/src/mcp_server_python_docs/server.py index a6d8f03..d9e9af5 100644 --- a/src/mcp_server_python_docs/server.py +++ b/src/mcp_server_python_docs/server.py @@ -29,9 +29,12 @@ DetectPythonVersionResult, GetDocsResult, ListVersionsResult, + PackageDocsResult, SearchDocsResult, ) from mcp_server_python_docs.services.content import ContentService +from mcp_server_python_docs.services.package_docs import PackageDocsService +from mcp_server_python_docs.services.persistent_cache import PersistentDocsCache from mcp_server_python_docs.services.search import SearchService from 
mcp_server_python_docs.services.version import VersionService from mcp_server_python_docs.storage.db import get_readonly_connection @@ -86,15 +89,21 @@ async def app_lifespan(server: FastMCP) -> AsyncIterator[AppContext]: # Open read-only connection (STOR-06, STOR-07) db = get_readonly_connection(index_path) + persistent_docs_cache: PersistentDocsCache | None = None try: # Check FTS5 (STOR-08) _assert_fts5(db) # Construct service instances (Phase 5 — service layer wiring) + persistent_docs_cache = PersistentDocsCache( + cache_path=cache_dir / "retrieved-docs-cache.sqlite3", + index_path=index_path, + ) search_svc = SearchService(db, synonyms) - content_svc = ContentService(db) + content_svc = ContentService(db, persistent_cache=persistent_docs_cache) version_svc = VersionService(db) + package_docs_svc = PackageDocsService() # Detect user's Python version and match to indexed versions detected_ver, detected_src = detect_python_version() @@ -119,6 +128,8 @@ async def app_lifespan(server: FastMCP) -> AsyncIterator[AppContext]: search_service=search_svc, content_service=content_svc, version_service=version_svc, + package_docs_service=package_docs_svc, + persistent_docs_cache=persistent_docs_cache, detected_python_version=matched, detected_python_source=detected_src, ) @@ -133,6 +144,11 @@ async def app_lifespan(server: FastMCP) -> AsyncIterator[AppContext]: pass raise finally: + if persistent_docs_cache is not None: + try: + persistent_docs_cache.close() + except Exception: + pass db.close() @@ -143,6 +159,12 @@ async def app_lifespan(server: FastMCP) -> AsyncIterator[AppContext]: idempotentHint=True, openWorldHint=False, ) +_PYPI_TOOL_ANNOTATIONS = ToolAnnotations( + readOnlyHint=True, + destructiveHint=False, + idempotentHint=True, + openWorldHint=True, +) SearchQueryParam = Annotated[ str, @@ -184,6 +206,10 @@ async def app_lifespan(server: FastMCP) -> AsyncIterator[AppContext]: int, Field(ge=0, description="Start position for pagination"), ] +PackageParam = 
Annotated[ + str, + Field(min_length=1, max_length=214, description="PyPI package/project name"), +] def create_server() -> FastMCP: @@ -239,6 +265,23 @@ def get_docs( logger.exception("Unexpected error in get_docs") raise ToolError(f"Internal error: {type(e).__name__}") + @mcp.tool(annotations=_PYPI_TOOL_ANNOTATIONS) + def lookup_package_docs( + package: PackageParam, + ctx: Context = None, # type: ignore[assignment] + ) -> PackageDocsResult: + """Look up package-declared docs/homepage/source URLs via official PyPI metadata. + + This is not generic web search: it only queries PyPI's JSON API and + returns official PyPI metadata plus package-declared project URLs. + """ + app_ctx: AppContext = ctx.request_context.lifespan_context + try: + return app_ctx.package_docs_service.lookup(package) + except Exception as e: + logger.exception("Unexpected error in lookup_package_docs") + raise ToolError(f"Internal error: {type(e).__name__}") + @mcp.tool(annotations=_TOOL_ANNOTATIONS) def list_versions( ctx: Context = None, # type: ignore[assignment] diff --git a/src/mcp_server_python_docs/services/content.py b/src/mcp_server_python_docs/services/content.py index 60e6b52..14a0e0c 100644 --- a/src/mcp_server_python_docs/services/content.py +++ b/src/mcp_server_python_docs/services/content.py @@ -13,6 +13,7 @@ from mcp_server_python_docs.retrieval.budget import apply_budget from mcp_server_python_docs.services.cache import create_section_cache from mcp_server_python_docs.services.observability import log_tool_call +from mcp_server_python_docs.services.persistent_cache import PersistentDocsCache from mcp_server_python_docs.services.version_resolution import resolve_version_strict @@ -23,9 +24,14 @@ class ContentService: When omitted, returns the full page with truncation/pagination. 
""" - def __init__(self, db: sqlite3.Connection) -> None: + def __init__( + self, + db: sqlite3.Connection, + persistent_cache: PersistentDocsCache | None = None, + ) -> None: self._db = db self._get_section = create_section_cache(db) + self._persistent_cache = persistent_cache def _resolve_version(self, version: str | None) -> str: """Resolve version to a concrete version string using shared resolution logic. @@ -47,6 +53,17 @@ def get_docs( """Retrieve documentation content by slug, optionally narrowed to a section by anchor.""" resolved_version = self._resolve_version(version) + if self._persistent_cache is not None: + cached = self._persistent_cache.get( + version=resolved_version, + slug=slug, + anchor=anchor, + max_chars=max_chars, + start_index=start_index, + ) + if cached is not None: + return cached + # Find the document doc_row = self._db.execute( """ @@ -119,7 +136,7 @@ def get_docs( full_text, max_chars, start_index ) - return GetDocsResult( + result = GetDocsResult( content=truncated_text, slug=slug, title=title, @@ -129,3 +146,10 @@ def get_docs( truncated=is_truncated, next_start_index=next_idx, ) + if self._persistent_cache is not None: + self._persistent_cache.put( + result=result, + max_chars=max_chars, + start_index=start_index, + ) + return result diff --git a/src/mcp_server_python_docs/services/package_docs.py b/src/mcp_server_python_docs/services/package_docs.py new file mode 100644 index 0000000..cfbef88 --- /dev/null +++ b/src/mcp_server_python_docs/services/package_docs.py @@ -0,0 +1,124 @@ +"""Controlled package docs lookup using official PyPI JSON metadata. + +Source: https://docs.pypi.org/api/json/ documents GET /pypi//json. 
+""" +from __future__ import annotations + +import json +import re +from collections.abc import Callable +from typing import Protocol +from urllib.error import HTTPError, URLError +from urllib.parse import quote, urlparse +from urllib.request import Request, urlopen + +from mcp_server_python_docs.models import PackageDocsResult, PackageDocsSource +from mcp_server_python_docs.services.observability import log_tool_call + +_ALLOWED = { + "documentation", "docs", "homepage", "home page", "source", "source code", + "repository", "repo", "bug tracker", "issues", "changelog", "release notes", +} +_BLOCKED = ("mirror", "community", "unofficial", "tutorial", "example") + + +class _HTTPResponse(Protocol): + def read(self) -> bytes: ... + def __enter__(self) -> "_HTTPResponse": ... + def __exit__(self, exc_type: object, exc: object, tb: object) -> bool | None: ... + + +Fetcher = Callable[[str, float], _HTTPResponse] + + +def _default_fetcher(url: str, timeout: float) -> _HTTPResponse: + req = Request( + url, + headers={"Accept": "application/json", "User-Agent": "mcp-server-python-docs"}, + ) + return urlopen(req, timeout=timeout) + + +def _normalize(name: str) -> str: + return quote(re.sub(r"[-_.]+", "-", name.strip().lower()), safe="-") + + +def _http_url(url: object) -> str | None: + if not isinstance(url, str): + return None + parsed = urlparse(url.strip()) + return url.strip() if parsed.scheme in {"http", "https"} and parsed.netloc else None + + +def _source(label: str, url: object, kind: str) -> PackageDocsSource | None: + valid = _http_url(url) + if valid is None: + return None + return PackageDocsSource(label=label, url=valid, kind=kind, declared_by="PyPI project metadata") + + +class PackageDocsService: + """Return package-declared docs/homepage/source URLs from PyPI metadata only.""" + + def __init__(self, fetcher: Fetcher = _default_fetcher, timeout: float = 10.0) -> None: + self._fetcher = fetcher + self._timeout = timeout + + 
@log_tool_call("lookup_package_docs") + def lookup(self, package: str) -> PackageDocsResult: + project = _normalize(package) + metadata_source = f"https://pypi.org/pypi/{project}/json" + try: + with self._fetcher(metadata_source, self._timeout) as response: + payload = json.loads(response.read().decode("utf-8")) + except HTTPError as e: + if e.code == 404: + return PackageDocsResult( + package=package, version="", metadata_source=metadata_source, + sources=[], note="Package not found on PyPI.", + ) + raise + except (URLError, TimeoutError, json.JSONDecodeError) as e: + return PackageDocsResult( + package=package, version="", metadata_source=metadata_source, + sources=[], note=f"Unable to retrieve PyPI metadata: {type(e).__name__}.", + ) + + info = payload.get("info") if isinstance(payload, dict) else {} + info = info if isinstance(info, dict) else {} + sources = [ + s for s in ( + _source( + "PyPI project", + info.get("project_url") or f"https://pypi.org/project/{project}/", + "pypi", + ), + _source("Documentation", info.get("docs_url"), "docs"), + _source("Homepage", info.get("home_page"), "homepage"), + ) if s is not None + ] + skipped: list[str] = [] + project_urls = info.get("project_urls") + if isinstance(project_urls, dict): + for label, url in project_urls.items(): + lowered = str(label).strip().lower() + if lowered in _ALLOWED and not any(bad in lowered for bad in _BLOCKED): + found = _source(str(label), url, lowered.replace(" ", "_")) + if found is not None and found not in sources: + sources.append(found) + else: + skipped.append(str(label)) + note = None + if skipped: + note = ( + "Ignored project URL labels outside the controlled allowlist: " + f"{', '.join(sorted(skipped))}." 
+ ) + return PackageDocsResult( + package=str(info.get("name") or package), + version=str(info.get("version") or ""), + summary=str(info.get("summary") or ""), + metadata_source=metadata_source, + sources=sources, + note=note, + ) diff --git a/src/mcp_server_python_docs/services/persistent_cache.py b/src/mcp_server_python_docs/services/persistent_cache.py new file mode 100644 index 0000000..f260cd5 --- /dev/null +++ b/src/mcp_server_python_docs/services/persistent_cache.py @@ -0,0 +1,81 @@ +"""SQLite-backed cache for completed get_docs results across MCP restarts.""" +from __future__ import annotations + +import sqlite3 +from pathlib import Path +from typing import NamedTuple + +from mcp_server_python_docs.models import GetDocsResult + + +class CacheStats(NamedTuple): + hits: int = 0 + misses: int = 0 + writes: int = 0 + + +class PersistentDocsCache: + """Persist get_docs results by index fingerprint, version, and request identity.""" + + def __init__(self, cache_path: Path, index_path: Path) -> None: + self._cache_path = Path(cache_path) + self._fingerprint = self._fingerprint_index(Path(index_path)) + self._hits = self._misses = self._writes = 0 + self._cache_path.parent.mkdir(parents=True, exist_ok=True) + self._conn = sqlite3.connect(str(self._cache_path), check_same_thread=False) + self._conn.execute("PRAGMA synchronous = NORMAL") + self._conn.execute( + "CREATE TABLE IF NOT EXISTS retrieved_docs_cache (" + "index_fingerprint TEXT NOT NULL, version TEXT NOT NULL, slug TEXT NOT NULL, " + "anchor TEXT NOT NULL, max_chars INTEGER NOT NULL, start_index INTEGER NOT NULL, " + "result_json TEXT NOT NULL, created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP, " + "PRIMARY KEY (index_fingerprint, version, slug, anchor, max_chars, start_index))" + ) + self._conn.commit() + + @property + def cache_path(self) -> Path: + return self._cache_path + + @staticmethod + def _fingerprint_index(index_path: Path) -> str: + stat = index_path.stat() + return 
f"{index_path.resolve()}:{stat.st_size}:{stat.st_mtime_ns}" + + def stats(self) -> CacheStats: + return CacheStats(self._hits, self._misses, self._writes) + + def get( + self, *, version: str, slug: str, anchor: str | None, max_chars: int, start_index: int + ) -> GetDocsResult | None: + row = self._conn.execute( + "SELECT result_json FROM retrieved_docs_cache WHERE index_fingerprint = ? " + "AND version = ? AND slug = ? AND anchor = ? AND max_chars = ? AND start_index = ?", + (self._fingerprint, version, slug, anchor or "", max_chars, start_index), + ).fetchone() + if row is None: + self._misses += 1 + return None + self._hits += 1 + return GetDocsResult.model_validate_json(row[0]) + + def put(self, *, result: GetDocsResult, max_chars: int, start_index: int) -> None: + self._conn.execute( + "INSERT OR REPLACE INTO retrieved_docs_cache " + "(index_fingerprint, version, slug, anchor, max_chars, start_index, result_json) " + "VALUES (?, ?, ?, ?, ?, ?, ?)", + ( + self._fingerprint, + result.version, + result.slug, + result.anchor or "", + max_chars, + start_index, + result.model_dump_json(), + ), + ) + self._conn.commit() + self._writes += 1 + + def close(self) -> None: + self._conn.close() diff --git a/tests/test_package_docs.py b/tests/test_package_docs.py new file mode 100644 index 0000000..cdfc0d1 --- /dev/null +++ b/tests/test_package_docs.py @@ -0,0 +1,69 @@ +"""Controlled PyPI package documentation lookup tests.""" +from __future__ import annotations + +import json +from urllib.error import HTTPError + +from mcp_server_python_docs.services.package_docs import PackageDocsService + + +class _Resp: + def __init__(self, payload: dict): + self._payload = payload + def __enter__(self): + return self + def __exit__(self, exc_type, exc, tb): + return False + def read(self) -> bytes: + return json.dumps(self._payload).encode() + + +def test_package_docs_uses_official_pypi_metadata_and_declared_urls(): + seen: list[str] = [] + def fetch(url: str, timeout: float): + 
seen.append(url) + return _Resp({"info": {"name": "SampleProject", "version": "4.0.0", + "summary": "sample", "project_url": "https://pypi.org/project/sampleproject/", + "home_page": "https://github.com/pypa/sampleproject", + "docs_url": "https://sampleproject.pypa.io/", + "project_urls": {"Documentation": "https://sampleproject.pypa.io/", + "Source": "https://github.com/pypa/sampleproject"}}}) + + result = PackageDocsService(fetcher=fetch).lookup("Sample_Project") + + assert seen == ["https://pypi.org/pypi/sample-project/json"] + assert result.package == "SampleProject" + assert result.trust_boundary == "pypi-declared-metadata" + assert {s.label for s in result.sources} >= { + "PyPI project", "Documentation", "Homepage", "Source", + } + assert all(s.declared_by == "PyPI project metadata" for s in result.sources) + + +def test_package_docs_filters_uncontrolled_urls_and_has_no_web_search_fallback(): + def fetch(url: str, timeout: float): + return _Resp({"info": {"name": "demo", "version": "1.0.0", + "home_page": "https://demo.example/", + "description": "See https://random-blog.example/demo", + "project_urls": {"Community mirror": "https://mirror.example/demo"}}}) + + result = PackageDocsService(fetcher=fetch).lookup("demo") + urls = [s.url for s in result.sources] + assert "https://demo.example/" in urls + assert "https://random-blog.example/demo" not in urls + assert "https://mirror.example/demo" not in urls + assert "community mirror" in (result.note or "").lower() + + +def test_package_docs_not_found_and_tool_annotation(): + def missing(url: str, timeout: float): + raise HTTPError(url, 404, "Not Found", hdrs=None, fp=None) + + result = PackageDocsService(fetcher=missing).lookup("missing-package") + assert result.sources == [] + assert "not found" in (result.note or "").lower() + + from mcp_server_python_docs.server import create_server + tool = create_server()._tool_manager._tools["lookup_package_docs"] + assert tool.annotations.readOnlyHint is True + assert 
tool.annotations.openWorldHint is True diff --git a/tests/test_persistent_docs_cache.py b/tests/test_persistent_docs_cache.py new file mode 100644 index 0000000..92bc01a --- /dev/null +++ b/tests/test_persistent_docs_cache.py @@ -0,0 +1,75 @@ +"""Persistent get_docs cache coverage.""" +from __future__ import annotations + +import os +from pathlib import Path + +from mcp_server_python_docs.services.content import ContentService +from mcp_server_python_docs.services.persistent_cache import PersistentDocsCache + + +def _doc(db, version: str, content: str, default: int = 0) -> None: + db.execute("DELETE FROM doc_sets WHERE source='python-docs' AND version=?", (version,)) + db.execute( + "INSERT INTO doc_sets (source, version, language, label, is_default) " + "VALUES ('python-docs', ?, 'en', ?, ?)", + (version, f"Python {version}", default), + ) + ds = db.execute("SELECT id FROM doc_sets WHERE version=?", (version,)).fetchone()[0] + db.execute( + "INSERT INTO documents (doc_set_id, uri, slug, title, content_text, char_count) " + "VALUES (?, 'library/json.html', 'library/json.html', ?, ?, ?)", + (ds, f"json {version}", content, len(content)), + ) + doc_id = db.execute("SELECT last_insert_rowid()").fetchone()[0] + db.execute( + "INSERT INTO sections (document_id, uri, anchor, heading, level, ordinal, " + "content_text, char_count) VALUES (?, 'library/json.html#top', 'top', " + "'Top', 1, 0, ?, ?)", + (doc_id, content, len(content)), + ) + db.commit() + + +def _cache(tmp_path: Path, marker: bytes = b"index-1") -> tuple[Path, PersistentDocsCache]: + index_path = tmp_path / "index.db" + index_path.write_bytes(marker) + return index_path, PersistentDocsCache(tmp_path / "retrieved.sqlite3", index_path) + + +def test_cache_survives_restart_and_miss_falls_back(populated_db, tmp_path: Path): + _doc(populated_db, "3.12", "persisted docs", 1) + index_path, cache = _cache(tmp_path) + first = ContentService(populated_db, cache).get_docs("library/json.html", "3.12", max_chars=500) + 
assert "persisted docs" in first.content + assert cache.stats().misses == cache.stats().writes == 1 + + restarted = PersistentDocsCache(tmp_path / "retrieved.sqlite3", index_path) + second = ContentService(populated_db, restarted).get_docs( + "library/json.html", "3.12", max_chars=500 + ) + assert second == first + assert restarted.stats().hits == 1 + + +def test_cache_key_includes_python_version(populated_db, tmp_path: Path): + _doc(populated_db, "3.12", "docs for 3.12") + _doc(populated_db, "3.13", "docs for 3.13", 1) + _, cache = _cache(tmp_path) + svc = ContentService(populated_db, cache) + assert "3.12" in svc.get_docs("library/json.html", "3.12", max_chars=500).content + assert "3.13" in svc.get_docs("library/json.html", "3.13", max_chars=500).content + assert cache.stats().misses == 2 + + +def test_cache_ignores_stale_entries_after_index_replacement(populated_db, tmp_path: Path): + _doc(populated_db, "3.12", "old generation", 1) + index_path, cache = _cache(tmp_path) + ContentService(populated_db, cache).get_docs("library/json.html", "3.12", max_chars=500) + index_path.write_bytes(b"index-generation-2-with-different-size") + os.utime(index_path, None) + + stale = PersistentDocsCache(tmp_path / "retrieved.sqlite3", index_path) + ContentService(populated_db, stale).get_docs("library/json.html", "3.12", max_chars=500) + assert stale.stats().hits == 0 + assert stale.stats().misses == 1 diff --git a/tests/test_services.py b/tests/test_services.py index 2fde2bf..b084775 100644 --- a/tests/test_services.py +++ b/tests/test_services.py @@ -402,12 +402,12 @@ def test_all_tools_have_annotations(self): annotations.openWorldHint is False ), f"{name} openWorldHint should be False" - def test_four_tools_registered(self): + def test_five_tools_registered(self): from mcp_server_python_docs.server import create_server server = create_server() tools = server._tool_manager._tools - assert len(tools) == 4 + assert len(tools) == 5 def 
test_runtime_tool_schemas_include_input_constraints(self): import anyio From 15b2bc56470bdbdc45838bcb3e0e376aac58f81e Mon Sep 17 00:00:00 2001 From: "bluecloud-gilfoyle[bot]" <262642412+bluecloud-gilfoyle[bot]@users.noreply.github.com> Date: Fri, 8 May 2026 10:29:11 +0000 Subject: [PATCH 2/8] fix: bound PyPI metadata response reads --- .../services/package_docs.py | 21 ++++++- tests/test_package_docs.py | 60 +++++++++++++++++-- 2 files changed, 74 insertions(+), 7 deletions(-) diff --git a/src/mcp_server_python_docs/services/package_docs.py b/src/mcp_server_python_docs/services/package_docs.py index cfbef88..9199804 100644 --- a/src/mcp_server_python_docs/services/package_docs.py +++ b/src/mcp_server_python_docs/services/package_docs.py @@ -20,10 +20,11 @@ "repository", "repo", "bug tracker", "issues", "changelog", "release notes", } _BLOCKED = ("mirror", "community", "unofficial", "tutorial", "example") +_PYPI_METADATA_MAX_BYTES = 5 * 1024 * 1024 class _HTTPResponse(Protocol): - def read(self) -> bytes: ... + def read(self, size: int = -1) -> bytes: ... def __enter__(self) -> "_HTTPResponse": ... def __exit__(self, exc_type: object, exc: object, tb: object) -> bool | None: ... 
@@ -57,6 +58,13 @@ def _source(label: str, url: object, kind: str) -> PackageDocsSource | None: return PackageDocsSource(label=label, url=valid, kind=kind, declared_by="PyPI project metadata") +def _read_limited(response: _HTTPResponse) -> bytes | None: + data = response.read(_PYPI_METADATA_MAX_BYTES + 1) + if len(data) > _PYPI_METADATA_MAX_BYTES: + return None + return data + + class PackageDocsService: """Return package-declared docs/homepage/source URLs from PyPI metadata only.""" @@ -70,7 +78,16 @@ def lookup(self, package: str) -> PackageDocsResult: metadata_source = f"https://pypi.org/pypi/{project}/json" try: with self._fetcher(metadata_source, self._timeout) as response: - payload = json.loads(response.read().decode("utf-8")) + data = _read_limited(response) + if data is None: + return PackageDocsResult( + package=package, + version="", + metadata_source=metadata_source, + sources=[], + note="PyPI metadata exceeded size limit.", + ) + payload = json.loads(data.decode("utf-8")) except HTTPError as e: if e.code == 404: return PackageDocsResult( diff --git a/tests/test_package_docs.py b/tests/test_package_docs.py index cdfc0d1..32b6ee6 100644 --- a/tests/test_package_docs.py +++ b/tests/test_package_docs.py @@ -2,20 +2,27 @@ from __future__ import annotations import json -from urllib.error import HTTPError +from urllib.error import HTTPError, URLError -from mcp_server_python_docs.services.package_docs import PackageDocsService +from mcp_server_python_docs.services.package_docs import ( + _PYPI_METADATA_MAX_BYTES, + PackageDocsService, +) class _Resp: - def __init__(self, payload: dict): + def __init__(self, payload: dict | bytes): self._payload = payload def __enter__(self): return self def __exit__(self, exc_type, exc, tb): return False - def read(self) -> bytes: - return json.dumps(self._payload).encode() + def read(self, size: int = -1) -> bytes: + if isinstance(self._payload, bytes): + data = self._payload + else: + data = 
json.dumps(self._payload).encode() + return data if size < 0 else data[:size] def test_package_docs_uses_official_pypi_metadata_and_declared_urls(): @@ -67,3 +74,46 @@ def missing(url: str, timeout: float): tool = create_server()._tool_manager._tools["lookup_package_docs"] assert tool.annotations.readOnlyHint is True assert tool.annotations.openWorldHint is True + + +def test_package_docs_rejects_oversized_pypi_metadata_without_unbounded_read(): + class LargeResp: + requested_size: int | None = None + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + return False + + def read(self, size: int = -1) -> bytes: + self.requested_size = size + assert size == _PYPI_METADATA_MAX_BYTES + 1 + return b"x" * size + + response = LargeResp() + + def fetch(url: str, timeout: float): + return response + + result = PackageDocsService(fetcher=fetch).lookup("huge-package") + + assert response.requested_size == _PYPI_METADATA_MAX_BYTES + 1 + assert result.sources == [] + assert result.note == "PyPI metadata exceeded size limit." + + +def test_package_docs_reports_retrieval_and_json_errors(): + def unreachable(url: str, timeout: float): + raise URLError("network down") + + network_result = PackageDocsService(fetcher=unreachable).lookup("demo") + assert network_result.sources == [] + assert network_result.note == "Unable to retrieve PyPI metadata: URLError." + + def invalid_json(url: str, timeout: float): + return _Resp(b"not json") + + json_result = PackageDocsService(fetcher=invalid_json).lookup("demo") + assert json_result.sources == [] + assert json_result.note == "Unable to retrieve PyPI metadata: JSONDecodeError." 
From 5d8e8bcef925c3f2246cbbf876d0f0d118da3836 Mon Sep 17 00:00:00 2001 From: "bluecloud-gilfoyle[bot]" <262642412+bluecloud-gilfoyle[bot]@users.noreply.github.com> Date: Fri, 8 May 2026 10:45:14 +0000 Subject: [PATCH 3/8] fix: address PR 9 review cache and PyPI errors --- src/mcp_server_python_docs/server.py | 10 +- .../services/package_docs.py | 40 +++++-- .../services/persistent_cache.py | 113 +++++++++++++----- tests/conftest.py | 4 +- tests/test_package_docs.py | 11 ++ tests/test_persistent_docs_cache.py | 108 +++++++++++++++++ 6 files changed, 234 insertions(+), 52 deletions(-) diff --git a/src/mcp_server_python_docs/server.py b/src/mcp_server_python_docs/server.py index d9e9af5..4bba114 100644 --- a/src/mcp_server_python_docs/server.py +++ b/src/mcp_server_python_docs/server.py @@ -3,8 +3,10 @@ Thin server layer — delegates all tool logic to services. Dependency rule: server -> services -> retrieval/storage. """ + from __future__ import annotations +import asyncio import importlib.resources import logging import sqlite3 @@ -256,9 +258,7 @@ def get_docs( if version is None and app_ctx.detected_python_version: version = app_ctx.detected_python_version try: - return app_ctx.content_service.get_docs( - slug, version, anchor, max_chars, start_index - ) + return app_ctx.content_service.get_docs(slug, version, anchor, max_chars, start_index) except DocsServerError as e: raise ToolError(str(e)) except Exception as e: @@ -266,7 +266,7 @@ def get_docs( raise ToolError(f"Internal error: {type(e).__name__}") @mcp.tool(annotations=_PYPI_TOOL_ANNOTATIONS) - def lookup_package_docs( + async def lookup_package_docs( package: PackageParam, ctx: Context = None, # type: ignore[assignment] ) -> PackageDocsResult: @@ -277,7 +277,7 @@ def lookup_package_docs( """ app_ctx: AppContext = ctx.request_context.lifespan_context try: - return app_ctx.package_docs_service.lookup(package) + return await asyncio.to_thread(app_ctx.package_docs_service.lookup, package) except Exception as e: 
logger.exception("Unexpected error in lookup_package_docs") raise ToolError(f"Internal error: {type(e).__name__}") diff --git a/src/mcp_server_python_docs/services/package_docs.py b/src/mcp_server_python_docs/services/package_docs.py index 9199804..63a69a6 100644 --- a/src/mcp_server_python_docs/services/package_docs.py +++ b/src/mcp_server_python_docs/services/package_docs.py @@ -2,6 +2,7 @@ Source: https://docs.pypi.org/api/json/ documents GET /pypi//json. """ + from __future__ import annotations import json @@ -16,8 +17,14 @@ from mcp_server_python_docs.services.observability import log_tool_call _ALLOWED = { - "documentation", "docs", "homepage", "home page", "source", "source code", - "repository", "repo", "bug tracker", "issues", "changelog", "release notes", + "documentation", + "docs", + "homepage", + "home page", + "source", + "source code", + "repository", + "repo", } _BLOCKED = ("mirror", "community", "unofficial", "tutorial", "example") _PYPI_METADATA_MAX_BYTES = 5 * 1024 * 1024 @@ -89,22 +96,30 @@ def lookup(self, package: str) -> PackageDocsResult: ) payload = json.loads(data.decode("utf-8")) except HTTPError as e: - if e.code == 404: - return PackageDocsResult( - package=package, version="", metadata_source=metadata_source, - sources=[], note="Package not found on PyPI.", - ) - raise + note = ( + "Package not found on PyPI." if e.code == 404 else f"PyPI returned HTTP {e.code}." 
+ ) + return PackageDocsResult( + package=package, + version="", + metadata_source=metadata_source, + sources=[], + note=note, + ) except (URLError, TimeoutError, json.JSONDecodeError) as e: return PackageDocsResult( - package=package, version="", metadata_source=metadata_source, - sources=[], note=f"Unable to retrieve PyPI metadata: {type(e).__name__}.", + package=package, + version="", + metadata_source=metadata_source, + sources=[], + note=f"Unable to retrieve PyPI metadata: {type(e).__name__}.", ) info = payload.get("info") if isinstance(payload, dict) else {} info = info if isinstance(info, dict) else {} sources = [ - s for s in ( + s + for s in ( _source( "PyPI project", info.get("project_url") or f"https://pypi.org/project/{project}/", @@ -112,7 +127,8 @@ def lookup(self, package: str) -> PackageDocsResult: ), _source("Documentation", info.get("docs_url"), "docs"), _source("Homepage", info.get("home_page"), "homepage"), - ) if s is not None + ) + if s is not None ] skipped: list[str] = [] project_urls = info.get("project_urls") diff --git a/src/mcp_server_python_docs/services/persistent_cache.py b/src/mcp_server_python_docs/services/persistent_cache.py index f260cd5..725a69d 100644 --- a/src/mcp_server_python_docs/services/persistent_cache.py +++ b/src/mcp_server_python_docs/services/persistent_cache.py @@ -1,12 +1,19 @@ """SQLite-backed cache for completed get_docs results across MCP restarts.""" + from __future__ import annotations +import logging import sqlite3 from pathlib import Path from typing import NamedTuple +from pydantic import ValidationError + from mcp_server_python_docs.models import GetDocsResult +logger = logging.getLogger(__name__) +_NO_ANCHOR_KEY = "\x00mcp-python-docs:no-anchor\x00" + class CacheStats(NamedTuple): hits: int = 0 @@ -21,17 +28,24 @@ def __init__(self, cache_path: Path, index_path: Path) -> None: self._cache_path = Path(cache_path) self._fingerprint = self._fingerprint_index(Path(index_path)) self._hits = self._misses = 
self._writes = 0 - self._cache_path.parent.mkdir(parents=True, exist_ok=True) - self._conn = sqlite3.connect(str(self._cache_path), check_same_thread=False) - self._conn.execute("PRAGMA synchronous = NORMAL") - self._conn.execute( - "CREATE TABLE IF NOT EXISTS retrieved_docs_cache (" - "index_fingerprint TEXT NOT NULL, version TEXT NOT NULL, slug TEXT NOT NULL, " - "anchor TEXT NOT NULL, max_chars INTEGER NOT NULL, start_index INTEGER NOT NULL, " - "result_json TEXT NOT NULL, created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP, " - "PRIMARY KEY (index_fingerprint, version, slug, anchor, max_chars, start_index))" - ) - self._conn.commit() + self._conn: sqlite3.Connection | None = None + try: + self._cache_path.parent.mkdir(parents=True, exist_ok=True) + self._conn = sqlite3.connect(str(self._cache_path), check_same_thread=False) + self._conn.execute("PRAGMA synchronous = NORMAL") + self._conn.execute( + "CREATE TABLE IF NOT EXISTS retrieved_docs_cache (" + "index_fingerprint TEXT NOT NULL, version TEXT NOT NULL, slug TEXT NOT NULL, " + "anchor TEXT NOT NULL, max_chars INTEGER NOT NULL, start_index INTEGER NOT NULL, " + "result_json TEXT NOT NULL, created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP, " + "PRIMARY KEY (index_fingerprint, version, slug, anchor, max_chars, start_index))" + ) + self._conn.commit() + except (OSError, sqlite3.Error) as e: + if self._conn is not None: + self._conn.close() + self._conn = None + logger.warning("Persistent docs cache disabled: %s", e) @property def cache_path(self) -> Path: @@ -42,40 +56,73 @@ def _fingerprint_index(index_path: Path) -> str: stat = index_path.stat() return f"{index_path.resolve()}:{stat.st_size}:{stat.st_mtime_ns}" + @staticmethod + def _anchor_key(anchor: str | None) -> str: + return _NO_ANCHOR_KEY if anchor is None else anchor + def stats(self) -> CacheStats: return CacheStats(self._hits, self._misses, self._writes) def get( self, *, version: str, slug: str, anchor: str | None, max_chars: int, start_index: int 
) -> GetDocsResult | None: - row = self._conn.execute( - "SELECT result_json FROM retrieved_docs_cache WHERE index_fingerprint = ? " - "AND version = ? AND slug = ? AND anchor = ? AND max_chars = ? AND start_index = ?", - (self._fingerprint, version, slug, anchor or "", max_chars, start_index), - ).fetchone() + if self._conn is None: + self._misses += 1 + return None + try: + row = self._conn.execute( + "SELECT result_json FROM retrieved_docs_cache WHERE index_fingerprint = ? " + "AND version = ? AND slug = ? AND anchor = ? AND max_chars = ? AND start_index = ?", + ( + self._fingerprint, + version, + slug, + self._anchor_key(anchor), + max_chars, + start_index, + ), + ).fetchone() + except sqlite3.Error as e: + self._misses += 1 + logger.warning("Persistent docs cache read skipped: %s", e) + return None if row is None: self._misses += 1 return None + try: + result = GetDocsResult.model_validate_json(row[0]) + except (ValidationError, ValueError) as e: + self._misses += 1 + logger.warning("Persistent docs cache entry ignored: %s", e) + return None self._hits += 1 - return GetDocsResult.model_validate_json(row[0]) + return result def put(self, *, result: GetDocsResult, max_chars: int, start_index: int) -> None: - self._conn.execute( - "INSERT OR REPLACE INTO retrieved_docs_cache " - "(index_fingerprint, version, slug, anchor, max_chars, start_index, result_json) " - "VALUES (?, ?, ?, ?, ?, ?, ?)", - ( - self._fingerprint, - result.version, - result.slug, - result.anchor or "", - max_chars, - start_index, - result.model_dump_json(), - ), - ) - self._conn.commit() + if self._conn is None: + return + try: + self._conn.execute( + "INSERT OR REPLACE INTO retrieved_docs_cache " + "(index_fingerprint, version, slug, anchor, max_chars, start_index, result_json) " + "VALUES (?, ?, ?, ?, ?, ?, ?)", + ( + self._fingerprint, + result.version, + result.slug, + self._anchor_key(result.anchor), + max_chars, + start_index, + result.model_dump_json(), + ), + ) + self._conn.commit() + 
except (sqlite3.Error, ValueError) as e: + logger.warning("Persistent docs cache write skipped: %s", e) + return self._writes += 1 def close(self) -> None: - self._conn.close() + if self._conn is not None: + self._conn.close() + self._conn = None diff --git a/tests/conftest.py b/tests/conftest.py index 94cefd0..7de4664 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -150,8 +150,8 @@ def populated_db(test_db): "os.path -- Common pathname manipulations", [ ("os.path.join", "os.path.join", 2, 1, - "Join one or more path components intelligently. If a component is an absolute path, " - "all previous components are thrown away."), + "Join one or more path components intelligently. If a component is an " + "absolute path, all previous components are thrown away."), ], ), ] diff --git a/tests/test_package_docs.py b/tests/test_package_docs.py index 32b6ee6..d0455c2 100644 --- a/tests/test_package_docs.py +++ b/tests/test_package_docs.py @@ -76,6 +76,17 @@ def missing(url: str, timeout: float): assert tool.annotations.openWorldHint is True +def test_package_docs_reports_non_404_pypi_http_errors(): + def rate_limited(url: str, timeout: float): + raise HTTPError(url, 429, "Too Many Requests", hdrs=None, fp=None) + + result = PackageDocsService(fetcher=rate_limited).lookup("busy-package") + + assert result.sources == [] + assert result.metadata_source == "https://pypi.org/pypi/busy-package/json" + assert result.note == "PyPI returned HTTP 429." 
+ + def test_package_docs_rejects_oversized_pypi_metadata_without_unbounded_read(): class LargeResp: requested_size: int | None = None diff --git a/tests/test_persistent_docs_cache.py b/tests/test_persistent_docs_cache.py index 92bc01a..6cacc01 100644 --- a/tests/test_persistent_docs_cache.py +++ b/tests/test_persistent_docs_cache.py @@ -1,9 +1,13 @@ """Persistent get_docs cache coverage.""" + from __future__ import annotations +import logging import os +import sqlite3 from pathlib import Path +from mcp_server_python_docs.models import GetDocsResult from mcp_server_python_docs.services.content import ContentService from mcp_server_python_docs.services.persistent_cache import PersistentDocsCache @@ -37,6 +41,17 @@ def _cache(tmp_path: Path, marker: bytes = b"index-1") -> tuple[Path, Persistent return index_path, PersistentDocsCache(tmp_path / "retrieved.sqlite3", index_path) +def _result(content: str, *, anchor: str | None = None) -> GetDocsResult: + return GetDocsResult( + content=content, + slug="library/json.html", + title="json", + version="3.12", + anchor=anchor, + char_count=len(content), + ) + + def test_cache_survives_restart_and_miss_falls_back(populated_db, tmp_path: Path): _doc(populated_db, "3.12", "persisted docs", 1) index_path, cache = _cache(tmp_path) @@ -73,3 +88,96 @@ def test_cache_ignores_stale_entries_after_index_replacement(populated_db, tmp_p ContentService(populated_db, stale).get_docs("library/json.html", "3.12", max_chars=500) assert stale.stats().hits == 0 assert stale.stats().misses == 1 + + +def test_cache_key_distinguishes_no_anchor_from_empty_anchor(tmp_path: Path): + _, cache = _cache(tmp_path) + full_page = _result("full page", anchor=None) + empty_anchor = _result("empty anchor", anchor="") + + cache.put(result=full_page, max_chars=500, start_index=0) + assert ( + cache.get(version="3.12", slug="library/json.html", anchor="", max_chars=500, start_index=0) + is None + ) + + cache.put(result=empty_anchor, max_chars=500, start_index=0) 
+ assert ( + cache.get( + version="3.12", slug="library/json.html", anchor=None, max_chars=500, start_index=0 + ) + == full_page + ) + assert ( + cache.get(version="3.12", slug="library/json.html", anchor="", max_chars=500, start_index=0) + == empty_anchor + ) + + +def test_cache_key_includes_budget_and_start_index(tmp_path: Path): + _, cache = _cache(tmp_path) + page_100 = _result("chars-100") + page_200 = _result("chars-200") + page_start_10 = _result("start-10") + + cache.put(result=page_100, max_chars=100, start_index=0) + cache.put(result=page_200, max_chars=200, start_index=0) + cache.put(result=page_start_10, max_chars=100, start_index=10) + + assert ( + cache.get( + version="3.12", slug="library/json.html", anchor=None, max_chars=100, start_index=0 + ) + == page_100 + ) + assert ( + cache.get( + version="3.12", slug="library/json.html", anchor=None, max_chars=200, start_index=0 + ) + == page_200 + ) + assert ( + cache.get( + version="3.12", slug="library/json.html", anchor=None, max_chars=100, start_index=10 + ) + == page_start_10 + ) + + +def test_corrupt_cache_db_is_best_effort_miss(tmp_path: Path, caplog): + index_path = tmp_path / "index.db" + index_path.write_bytes(b"index") + cache_path = tmp_path / "retrieved.sqlite3" + cache_path.write_bytes(b"not sqlite") + + with caplog.at_level(logging.WARNING): + cache = PersistentDocsCache(cache_path, index_path) + cache.put(result=_result("ignored"), max_chars=100, start_index=0) + assert ( + cache.get( + version="3.12", slug="library/json.html", anchor=None, max_chars=100, start_index=0 + ) + is None + ) + + assert "Persistent docs cache disabled" in caplog.text + assert cache.stats().misses == 1 + assert cache.stats().writes == 0 + + +def test_invalid_cached_json_is_best_effort_miss(tmp_path: Path, caplog): + _, cache = _cache(tmp_path) + cache.put(result=_result("valid"), max_chars=100, start_index=0) + with sqlite3.connect(cache.cache_path) as conn: + conn.execute("UPDATE retrieved_docs_cache SET 
result_json = 'not json'") + + with caplog.at_level(logging.WARNING): + assert ( + cache.get( + version="3.12", slug="library/json.html", anchor=None, max_chars=100, start_index=0 + ) + is None + ) + + assert "Persistent docs cache entry ignored" in caplog.text + assert cache.stats().misses == 1 From f7fe49b9bcfe36ce121d454a646acbb37fce919a Mon Sep 17 00:00:00 2001 From: "bluecloud-gilfoyle[bot]" <262642412+bluecloud-gilfoyle[bot]@users.noreply.github.com> Date: Fri, 8 May 2026 11:24:04 +0000 Subject: [PATCH 4/8] docs: align PyPI lookup scope --- README.md | 3 +-- src/mcp_server_python_docs/services/package_docs.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index c09f9a2..0e4dec6 100644 --- a/README.md +++ b/README.md @@ -215,8 +215,7 @@ it only removes cached retrieval results, not the docs index. `https://docs.pypi.org/api/json/` (`GET /pypi//json`) and returns only sources declared in that package's PyPI metadata: the PyPI project URL, `docs_url`, `home_page`, and allowlisted `project_urls` labels such as -Documentation, Homepage, Source, Repository, Issues, Changelog, and Release -Notes. +Documentation, Homepage, Source, and Repository. The tool makes the trust boundary explicit with `trust_boundary="pypi-declared-metadata"`. 
It does not crawl pages, perform web diff --git a/src/mcp_server_python_docs/services/package_docs.py b/src/mcp_server_python_docs/services/package_docs.py index 63a69a6..e7001e2 100644 --- a/src/mcp_server_python_docs/services/package_docs.py +++ b/src/mcp_server_python_docs/services/package_docs.py @@ -26,7 +26,6 @@ "repository", "repo", } -_BLOCKED = ("mirror", "community", "unofficial", "tutorial", "example") _PYPI_METADATA_MAX_BYTES = 5 * 1024 * 1024 @@ -135,7 +134,7 @@ def lookup(self, package: str) -> PackageDocsResult: if isinstance(project_urls, dict): for label, url in project_urls.items(): lowered = str(label).strip().lower() - if lowered in _ALLOWED and not any(bad in lowered for bad in _BLOCKED): + if lowered in _ALLOWED: found = _source(str(label), url, lowered.replace(" ", "_")) if found is not None and found not in sources: sources.append(found) From a54a2fe1cfd3f759517188d63cd1313fff5483d8 Mon Sep 17 00:00:00 2001 From: "bluecloud-gilfoyle[bot]" <262642412+bluecloud-gilfoyle[bot]@users.noreply.github.com> Date: Fri, 8 May 2026 11:27:38 +0000 Subject: [PATCH 5/8] test: add PR 9 MCP smoke test plan --- PR9_MCP_TEST_PLAN.md | 393 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 393 insertions(+) create mode 100644 PR9_MCP_TEST_PLAN.md diff --git a/PR9_MCP_TEST_PLAN.md b/PR9_MCP_TEST_PLAN.md new file mode 100644 index 0000000..9429899 --- /dev/null +++ b/PR9_MCP_TEST_PLAN.md @@ -0,0 +1,393 @@ +# PR #9 MCP Test Plan — Persistent `get_docs` Cache + `lookup_package_docs` + +PR: https://github.com/ayhammouda/python-docs-mcp-server/pull/9 +Branch: `fix/open-issues-cache-pypi-docs` +Purpose: validate the PR through actual MCP/tool-level behavior, not only unit tests. + +## Goals + +Confirm that: + +1. Existing stdlib docs tools still work. +2. `get_docs` returns correct content. +3. Persistent cache is written and reused across server restarts. +4. 
Cache identity is correct: version, slug, anchor, `max_chars`, `start_index`, and index fingerprint matter. +5. Cache failure is best-effort and does not break retrieval. +6. `lookup_package_docs` returns controlled PyPI-declared docs/homepage/source/repository links. +7. PyPI error modes return controlled notes rather than internal errors. +8. MCP annotations and tool count are coherent. + +## Preconditions + +```bash +cd /srv/openclaw/.openclaw/workspace/tmp/python-docs-mcp-review +git checkout fix/open-issues-cache-pypi-docs || git checkout review-pr-9 +git pull --ff-only origin fix/open-issues-cache-pypi-docs +uv sync --all-extras +``` + +Baseline gates: + +```bash +uv run ruff check src/ tests/ +uv run pyright src/ +uv run pytest --tb=short -q +uv build +``` + +Expected: + +- Ruff passes. +- Pyright passes for `src/`. +- Pytest passes: currently expected `254 passed, 3 skipped`. +- Build succeeds. + +## Test Area 1 — Tool Registration / MCP Contract + +### 1.1 Verify five tools are registered + +Run a small introspection script or use the MCP client harness if available. + +Expected tools: + +- `search_docs` +- `get_docs` +- `lookup_package_docs` +- `list_versions` +- `detect_python_version` + +Expected annotations: + +- stdlib tools: `readOnlyHint=True`, `openWorldHint=False` +- `lookup_package_docs`: `readOnlyHint=True`, `openWorldHint=True` + +Pass criteria: + +- Exactly five tools are exposed. +- `lookup_package_docs` is visibly open-world because it calls PyPI. + +## Test Area 2 — `get_docs` Functional Retrieval + +### 2.1 Full page retrieval + +Call: + +```text +get_docs(slug="library/json.html", version="3.12", max_chars=1000, start_index=0) +``` + +Expected: + +- Result contains JSON documentation content. 
+- `slug == "library/json.html"`
+- `version == "3.12"`
+- `anchor is null`
+- `char_count > 0`
+
+### 2.2 Section retrieval
+
+First use `search_docs` to find a valid section anchor for `json`, then call:
+
+```text
+get_docs(slug="library/json.html", version="3.12", anchor=<anchor>, max_chars=1000, start_index=0)
+```
+
+Expected:
+
+- Result is section-scoped.
+- `anchor == <anchor>`
+- Content is not the full page.
+
+### 2.3 Empty anchor remains invalid
+
+Call:
+
+```text
+get_docs(slug="library/json.html", version="3.12", anchor="", max_chars=1000, start_index=0)
+```
+
+Expected:
+
+- Controlled tool error / page-not-found style response.
+- It must **not** return a cached full-page response.
+
+This specifically verifies the `anchor=None` vs `anchor=""` cache fix.
+
+## Test Area 3 — Persistent Cache Behavior
+
+Before running, locate cache path from platform cache dir. Expected filename:
+
+```text
+retrieved-docs-cache.sqlite3
+```
+
+Likely under:
+
+```text
+~/.cache/mcp-python-docs/retrieved-docs-cache.sqlite3
+```
+
+or the platform cache directory used by the app.
+
+### 3.1 Cache file creation
+
+1. Delete the cache file if present.
+2. Start the MCP server/client.
+3. Call `get_docs` for `library/json.html`.
+4. Stop the server.
+
+Expected:
+
+- Cache file exists.
+- SQLite table `retrieved_docs_cache` exists.
+- At least one row is present.
+
+Suggested inspection:
+
+```bash
+sqlite3 <cache-dir>/retrieved-docs-cache.sqlite3 \
+  "SELECT version, slug, anchor, max_chars, start_index, length(result_json) FROM retrieved_docs_cache;"
+```
+
+### 3.2 Cache survives restart
+
+1. Start server again.
+2. Call the same `get_docs` request.
+
+Expected:
+
+- Same response content.
+- No user-visible behavior change.
+- If logs expose cache hits/misses, second call should be a hit. 
+
+### 3.3 Cache key separates pagination/budget
+
+Call:
+
+```text
+get_docs(slug="library/json.html", version="3.12", max_chars=500, start_index=0)
+get_docs(slug="library/json.html", version="3.12", max_chars=1000, start_index=0)
+get_docs(slug="library/json.html", version="3.12", max_chars=500, start_index=100)
+```
+
+Expected:
+
+- Separate cache rows for each identity.
+- Results are not cross-contaminated.
+
+### 3.4 Corrupt cache is best-effort
+
+1. Stop server.
+2. Replace cache file with invalid bytes:
+
+```bash
+printf 'not sqlite' > <cache-dir>/retrieved-docs-cache.sqlite3
+```
+
+3. Start server.
+4. Call `get_docs(slug="library/json.html", version="3.12")`.
+
+Expected:
+
+- Docs retrieval still succeeds.
+- Warning is logged about disabled/skipped persistent cache.
+- No internal server error.
+
+## Test Area 4 — `lookup_package_docs` Happy Path
+
+### 4.1 Known package with docs/source
+
+Call:
+
+```text
+lookup_package_docs(package="requests")
+```
+
+Expected:
+
+- `metadata_source == "https://pypi.org/pypi/requests/json"`
+- `trust_boundary == "pypi-declared-metadata"`
+- `package` is canonical from PyPI if available.
+- `version` is non-empty.
+- `sources` includes PyPI project URL and likely homepage/source/docs links.
+- Every source URL is `http://` or `https://`.
+- No web search / unofficial mirror fallback.
+
+### 4.2 Normalization
+
+Call:
+
+```text
+lookup_package_docs(package="Sample_Project")
+```
+
+Expected:
+
+- Metadata source normalizes to:
+
+```text
+https://pypi.org/pypi/sample-project/json
+```
+
+- Returned package may be PyPI canonical name.
+
+### 4.3 Missing package
+
+Call:
+
+```text
+lookup_package_docs(package="definitely-not-a-real-package-vision-test-xyz")
+```
+
+Expected:
+
+- `sources == []`
+- note contains package not found / PyPI 404 style message.
+- No internal error. 
+
+## Test Area 5 — PyPI Failure Handling
+
+These may require monkeypatching/fake fetcher or temporary network blocking if not practical via live MCP.
+
+### 5.1 Non-404 HTTP errors
+
+Simulate PyPI `429` or `503`.
+
+Expected:
+
+- Controlled result:
+
+```text
+sources=[]
+note="PyPI returned HTTP 429."
+```
+
+or equivalent code.
+
+### 5.2 Network/JSON failure
+
+Simulate:
+
+- `URLError`
+- timeout
+- invalid JSON body
+
+Expected:
+
+- Controlled result note:
+
+```text
+Unable to retrieve PyPI metadata: <ErrorName>.
+```
+
+- No internal server error.
+
+### 5.3 Oversized PyPI JSON body
+
+Simulate a response larger than 5 MiB.
+
+Expected:
+
+- The service reads at most `5 MiB + 1 byte`.
+- Controlled result:
+
+```text
+sources=[]
+note="PyPI metadata exceeded size limit."
+```
+
+## Test Area 6 — Scope / Trust Boundary
+
+Use a package or fake response with broad `project_urls`, e.g. labels:
+
+- `Documentation`
+- `Homepage`
+- `Source`
+- `Repository`
+- `Issues`
+- `Changelog`
+- `Community mirror`
+- `Tutorial`
+
+Expected:
+
+Included:
+
+- Documentation
+- Homepage
+- Source
+- Repository
+
+Excluded/skipped:
+
+- Issues
+- Changelog
+- Community mirror
+- Tutorial
+
+Result note should mention ignored labels outside controlled allowlist.
+
+## Suggested Manual MCP Smoke Script
+
+If direct MCP client execution is awkward, use a minimal Python script that imports the services and mimics the tool layer:
+
+```python
+from mcp_server_python_docs.services.package_docs import PackageDocsService
+
+for pkg in ["requests", "Sample_Project", "definitely-not-a-real-package-vision-test-xyz"]:
+    print(PackageDocsService().lookup(pkg).model_dump())
+```
+
+For `get_docs`, prefer actual MCP invocation or server lifespan because cache path wiring happens there. 
+ +## Pass / Fail Summary Template + +Return results in this format: + +```markdown +## PR #9 MCP Test Results + +### Environment +- Commit: +- OS: +- Python: +- Cache path: + +### Gates +- ruff: +- pyright src: +- pytest: +- uv build: + +### MCP Tool Tests +- Tool registration: +- get_docs full page: +- get_docs section: +- empty anchor behavior: +- cache file creation: +- cache survives restart: +- cache key separation: +- corrupt cache fallback: +- lookup_package_docs requests: +- missing package: +- failure simulation: +- scope/trust boundary: + +### Verdict +PASS / FAIL + +### Notes / Bugs Found +- ... +``` + +## Final Acceptance Criteria + +The PR is considered MCP-smoke-test ready if: + +- Local gates pass. +- Live MCP/tool invocation returns correct stdlib docs. +- Cache file is created and reused after restart. +- Corrupt cache does not break docs retrieval. +- PyPI lookup returns only controlled PyPI-declared metadata. +- PyPI expected failures return controlled notes. +- No internal errors are observed for expected failure modes. 
From 02d38217192a138837f8fe7a10fa6190b553eab8 Mon Sep 17 00:00:00 2001 From: "bluecloud-gilfoyle[bot]" <262642412+bluecloud-gilfoyle[bot]@users.noreply.github.com> Date: Fri, 8 May 2026 11:39:55 +0000 Subject: [PATCH 6/8] docs: add PR 9 smoke failure fix plan --- PR9_SMOKE_FAILURE_FIX_PLAN.md | 445 ++++++++++++++++++++++++++++++++++ 1 file changed, 445 insertions(+) create mode 100644 PR9_SMOKE_FAILURE_FIX_PLAN.md diff --git a/PR9_SMOKE_FAILURE_FIX_PLAN.md b/PR9_SMOKE_FAILURE_FIX_PLAN.md new file mode 100644 index 0000000..6a2c26e --- /dev/null +++ b/PR9_SMOKE_FAILURE_FIX_PLAN.md @@ -0,0 +1,445 @@ +# PR #9 Smoke Failure Fix + Retest Plan + +PR: https://github.com/ayhammouda/python-docs-mcp-server/pull/9 +Current branch: `fix/open-issues-cache-pypi-docs` +Current head when this plan was written: `a54a2fe` + +## Executive Summary + +The PR implementation itself passed unit/service-level checks, but the MCP smoke test failed for two infrastructure reasons: + +1. **MCP stdio transport bug** — protocol frames appear on `stderr` instead of `stdout`, so a real MCP client hangs. +2. **Local docs index is unusable for `get_docs` smoke testing** — installed index has version `3.13` but `0` documents, so full-page/section docs retrieval and real persistent cache writes cannot be validated. + +This plan fixes those issues and then reruns the MCP smoke test properly. + +```mermaid +flowchart TD + A[Smoke test failed] --> B[Fix stdio transport] + A --> C[Prepare contentful docs index] + B --> D[Add regression tests] + C --> E[Validate get_docs live] + D --> F[Run gates] + E --> F + F --> G[Run MCP smoke test] + G --> H{Pass?} + H -->|Yes| I[PR ready to merge] + H -->|No| J[File focused bug / fix] +``` + +## Issue 1 — MCP stdio responses emitted on `stderr` + +### Symptom + +Actual MCP stdio client invocation failed: + +- JSON-RPC frames were emitted on `stderr`, not `stdout`. +- Client could not consume responses and hung. 
+- Existing service-level tests passed, so this is a transport/runtime bug. + +### Likely Root Cause + +`src/mcp_server_python_docs/__main__.py` intentionally redirects fd `1` to fd `2` early to protect the MCP protocol pipe from import-time stdout pollution: + +```python +_saved_stdout_fd = os.dup(1) +os.dup2(2, 1) +sys.stdout = sys.stderr +``` + +Later, in `serve()`, it restores fd `1`: + +```python +os.dup2(saved_stdout_fd, 1) +os.close(saved_stdout_fd) +mcp_server.run(transport="stdio") +``` + +But `sys.stdout` still references `sys.stderr`. If FastMCP writes through `sys.stdout` rather than raw fd `1`, frames still go to stderr. + +### Fix Strategy + +After restoring fd `1`, also restore the Python-level stdout object before calling FastMCP: + +```python +os.dup2(saved_stdout_fd, 1) +os.close(saved_stdout_fd) +sys.stdout = sys.__stdout__ +``` + +If `sys.__stdout__` does not behave reliably under tests, use an explicit fd wrapper: + +```python +sys.stdout = os.fdopen( + os.dup(1), + "w", + buffering=1, + encoding=getattr(sys.__stdout__, "encoding", None) or "utf-8", + errors="replace", +) +``` + +Prefer the smaller `sys.stdout = sys.__stdout__` first. + +### Constraints + +Keep non-serve commands safe: + +- `doctor` +- `build-index` +- `--help` +- `--version` + +These should still avoid stdout pollution unless deliberately allowed by existing tests. + +### Regression Tests + +Add or update tests around `tests/test_stdio_smoke.py` / `tests/test_stdio_hygiene.py`. + +Required test behavior: + +1. Start server as subprocess: + +```bash +python -m mcp_server_python_docs serve +``` + +2. Send JSON-RPC initialize + tools/list frames through stdin. +3. Assert: + - protocol responses are present on `stdout`; + - `stderr` may contain logs but contains no JSON-RPC response frames; + - stdout contains only valid JSON-RPC frames. 
+ +Suggested assertions: + +```python +assert b'"jsonrpc"' in result.stdout +assert b'"tools/list"' not in result.stderr +assert b'"result"' not in result.stderr # if robust enough +``` + +Better: parse both streams line-by-line and ensure all JSON-RPC response objects came from stdout only. + +### Manual Validation + +Run: + +```bash +uv run pytest tests/test_stdio_smoke.py tests/test_stdio_hygiene.py -q +``` + +Then run the full gates: + +```bash +uv run ruff check src/ tests/ +uv run pyright src/ +uv run pytest --tb=short -q +uv build +``` + +## Issue 2 — Local index has no documents + +### Symptom + +Gilfoyle reported: + +```text +/home/ahammouda/.cache/mcp-python-docs/index.db has only 3.13 and 0 documents +``` + +Observed behavior: + +- `get_docs(... version="3.12")` → version not found +- default `3.13` → page not found +- persistent cache DB exists but has `0` rows because `get_docs` never succeeds + +### Root Cause + +The currently installed index is likely symbol-only or incomplete. The smoke test requires contentful docs ingestion, not only `objects.inv` symbols. + +Attempted full 3.12 build failed because system Python lacks `ensurepip` / `python3.12-venv`. + +### Fix Strategy A — Preferred: Build a contentful index + +1. Run doctor: + +```bash +uv run python -m mcp_server_python_docs doctor +``` + +2. If it reports missing venv/ensurepip support, install the matching venv package. + +On Ubuntu/Debian with Python 3.12: + +```bash +sudo apt-get update +sudo apt-get install -y python3.12-venv +``` + +This requires elevated approval. + +3. Build a minimal contentful index for smoke tests. + +Prefer one version to keep runtime manageable: + +```bash +uv run python -m mcp_server_python_docs build-index --versions 3.13 +``` + +If the project specifically needs 3.12 too: + +```bash +uv run python -m mcp_server_python_docs build-index --versions 3.12,3.13 +``` + +4. 
Validate index content: + +```bash +sqlite3 ~/.cache/mcp-python-docs/index.db \ + "SELECT ds.version, COUNT(d.id) AS documents, COUNT(s.id) AS sections + FROM doc_sets ds + LEFT JOIN documents d ON d.doc_set_id = ds.id + LEFT JOIN sections s ON s.document_id = d.id + GROUP BY ds.version;" +``` + +Expected: + +- `documents > 0` +- `sections > 0` + +5. Confirm a known page exists: + +```bash +sqlite3 ~/.cache/mcp-python-docs/index.db \ + "SELECT slug, title FROM documents WHERE slug='library/json.html' LIMIT 1;" +``` + +Expected: + +- one row for `library/json.html` + +### Fix Strategy B — If full docs build is too slow/unavailable + +Use a **temporary synthetic content index** for smoke tests only. + +Do not replace the user’s real index permanently. Use an isolated `XDG_CACHE_HOME`: + +```bash +export XDG_CACHE_HOME="$(mktemp -d)" +``` + +Create a minimal `index.db` with: + +- one `doc_sets` row for `3.13` +- one `documents` row for `library/json.html` +- one or more `sections` rows, including a valid anchor like `top` +- rebuilt FTS tables if needed + +Then run MCP server with that same `XDG_CACHE_HOME`. + +This is acceptable for transport/cache smoke validation, but final release confidence should still include one real contentful index build when possible. + +### Regression Test Recommendation + +Add a new smoke fixture that creates a temporary contentful index and starts the server with isolated cache env. + +Test should not depend on the machine’s global `~/.cache` state. + +Suggested new test file: + +```text +tests/test_mcp_get_docs_cache_smoke.py +``` + +Test behavior: + +1. Create temp cache dir. +2. Create minimal valid index at `/mcp-python-docs/index.db`. +3. Start server subprocess with `XDG_CACHE_HOME=`. +4. Call `get_docs` through MCP stdio. +5. Assert returned content. +6. Stop server. +7. Inspect `retrieved-docs-cache.sqlite3` has one row. +8. Restart server. +9. Call same `get_docs`; assert same result. +10. 
Corrupt cache file; restart; assert `get_docs` still succeeds. + +This makes future smoke tests deterministic and prevents the exact failure seen here: “global index exists but has no documents.” + +## Issue 3 — Persistent cache live validation blocked + +### Symptom + +Cache service passed synthetic tests, but real cache creation/restart behavior could not be validated because `get_docs` failed before writing cache rows. + +### Fix Strategy + +After Issue 2 is fixed, run the cache smoke tests again. + +Manual commands: + +```bash +CACHE_DIR="$HOME/.cache/mcp-python-docs" +rm -f "$CACHE_DIR/retrieved-docs-cache.sqlite3" +``` + +Then call live MCP: + +```text +get_docs(slug="library/json.html", version="3.13", max_chars=1000, start_index=0) +``` + +Inspect: + +```bash +sqlite3 "$CACHE_DIR/retrieved-docs-cache.sqlite3" \ + "SELECT version, slug, anchor, max_chars, start_index, length(result_json) + FROM retrieved_docs_cache;" +``` + +Expected: + +- row exists for version `3.13` +- slug `library/json.html` +- anchor sentinel/internal representation is not user-visible, but row exists +- `result_json` length > 0 + +Restart server and repeat call. + +Expected: + +- same response +- if stats/logs expose cache hit, hit count increments + +## Issue 4 — Test the actual MCP path, not just services + +### Current State + +The smoke run had many service-level passes. Those are useful but not enough because MCP transport failed. + +### Required Post-Fix MCP Tests + +Use an actual MCP client or subprocess JSON-RPC harness. + +Minimum calls: + +1. `tools/list` +2. `tools/call search_docs` +3. `tools/call get_docs` full page +4. `tools/call get_docs` valid section +5. `tools/call get_docs` empty anchor +6. `tools/call lookup_package_docs requests` +7. 
`tools/call lookup_package_docs missing package` + +### Expected Results + +| Test | Expected | +|---|---| +| tools/list | five tools, `lookup_package_docs` present | +| search_docs | returns hits from local index | +| get_docs full page | returns content and writes persistent cache row | +| get_docs section | returns section content | +| get_docs empty anchor | controlled not-found error, not cached full page | +| lookup_package_docs requests | PyPI-declared sources + trust boundary | +| missing package | `sources=[]`, controlled note | + +## Proposed Work Breakdown for Gilfoyle + +### Phase 1 — Fix stdio transport + +- Patch `serve()` to restore `sys.stdout` after fd restoration. +- Add regression test proving JSON-RPC responses are on stdout, not stderr. +- Run stdio tests. + +### Phase 2 — Make smoke index deterministic + +Choose one: + +- **Option A:** install `python3.12-venv` and build a real contentful index. +- **Option B:** add a temp synthetic-index smoke test harness using isolated `XDG_CACHE_HOME`. + +Recommendation: do both if time allows. Option B should become the permanent CI regression test; Option A validates real user setup. + +### Phase 3 — Rerun PR #9 MCP smoke plan + +Use `PR9_MCP_TEST_PLAN.md` again after the fixes. + +### Phase 4 — Report + +Return a result summary: + +```markdown +## PR #9 Transport/Index Fix Results + +### Fixes Applied +- ... + +### Gates +- ruff: +- pyright src: +- pytest: +- uv build: + +### MCP Stdio +- initialize: +- tools/list: +- stdout/stderr framing: + +### Index +- index build/fixture: +- versions: +- document count: +- section count: + +### get_docs + Cache +- full page: +- section: +- empty anchor: +- cache file rows: +- restart cache reuse: +- corrupt cache fallback: + +### lookup_package_docs +- requests: +- missing package: +- failure simulations: + +### Verdict +PASS / FAIL + +### Remaining Blockers +- ... +``` + +## Definition of Done + +The issues are fixed when: + +1. 
MCP JSON-RPC responses are emitted on stdout only. +2. stderr contains logs only, not protocol response frames. +3. Smoke tests use a contentful index, either real or deterministic synthetic fixture. +4. `get_docs` works through actual MCP, not just service calls. +5. Persistent cache file is created after real `get_docs` MCP call. +6. Cache survives server restart. +7. Corrupt cache does not break real MCP `get_docs`. +8. `lookup_package_docs` works through actual MCP. +9. Full validation passes: + +```bash +uv run ruff check src/ tests/ +uv run pyright src/ +uv run pytest --tb=short -q +uv build +``` + +10. GitHub CI is green. + +## Merge Guidance + +Do **not** merge PR #9 until either: + +- the stdio transport bug is fixed in this PR/branch and live MCP smoke tests pass; or +- the stdio transport bug is confirmed pre-existing, tracked separately, and Aymen explicitly accepts merging PR #9 with only service-level validation. + +My recommendation: fix stdio now. An MCP server whose protocol writes to the wrong stream is a red-alert goblin, not a “later maybe.” From d8c5c7e2e199ac8464a9c3e2f8060f3448851b11 Mon Sep 17 00:00:00 2001 From: "bluecloud-gilfoyle[bot]" <262642412+bluecloud-gilfoyle[bot]@users.noreply.github.com> Date: Fri, 8 May 2026 11:46:42 +0000 Subject: [PATCH 7/8] fix: restore stdio transport stdout --- src/mcp_server_python_docs/__main__.py | 4 + tests/test_mcp_get_docs_cache_smoke.py | 207 +++++++++++++++++++++++++ tests/test_stdio_smoke.py | 76 ++++++--- 3 files changed, 262 insertions(+), 25 deletions(-) create mode 100644 tests/test_mcp_get_docs_cache_smoke.py diff --git a/src/mcp_server_python_docs/__main__.py b/src/mcp_server_python_docs/__main__.py index 35fc016..a4baa5f 100644 --- a/src/mcp_server_python_docs/__main__.py +++ b/src/mcp_server_python_docs/__main__.py @@ -98,6 +98,10 @@ def serve() -> None: # print to stdout during MCP communication. 
os.dup2(saved_stdout_fd, 1) os.close(saved_stdout_fd) + # FastMCP writes JSON-RPC frames through Python's stdout object. + # Restore it as well as fd 1, otherwise protocol frames are emitted + # on stderr because module import redirected sys.stdout there. + sys.stdout = sys.__stdout__ try: mcp_server.run(transport="stdio") diff --git a/tests/test_mcp_get_docs_cache_smoke.py b/tests/test_mcp_get_docs_cache_smoke.py new file mode 100644 index 0000000..2d4761c --- /dev/null +++ b/tests/test_mcp_get_docs_cache_smoke.py @@ -0,0 +1,207 @@ +"""MCP subprocess smoke tests for get_docs and persistent cache behavior.""" +from __future__ import annotations + +import sqlite3 +import subprocess +import sys +from pathlib import Path + +from mcp_server_python_docs.storage.db import bootstrap_schema, get_readwrite_connection +from tests.test_stdio_smoke import ( + _assert_protocol_on_stdout_only, + _find_response, + _isolated_cache_env, + _make_notification, + _make_request, +) + + +def _create_contentful_json_index(cache_dir: Path) -> Path: + """Create a deterministic contentful docs index for subprocess smoke tests.""" + cache_dir.mkdir(parents=True, exist_ok=True) + db_path = cache_dir / "index.db" + conn = get_readwrite_connection(db_path) + bootstrap_schema(conn) + conn.execute( + "INSERT INTO doc_sets (source, version, language, label, is_default, base_url) " + "VALUES ('python-docs', '3.13', 'en', 'Python 3.13', 1, " + "'https://docs.python.org/3.13/')" + ) + doc_set_id = conn.execute("SELECT last_insert_rowid()").fetchone()[0] + conn.execute( + "INSERT INTO symbols (doc_set_id, qualified_name, normalized_name, " + "module, symbol_type, uri, anchor) " + "VALUES (?, 'json.dumps', 'json_dumps', 'json', 'function', " + "'library/json.html#json.dumps', 'json.dumps')", + (doc_set_id,), + ) + conn.execute( + "INSERT INTO documents (doc_set_id, uri, slug, title, content_text, char_count) " + "VALUES (?, 'library/json.html', 'library/json.html', " + "'json — JSON encoder and 
decoder', " + "'The json module exposes APIs for encoding and decoding JSON data.', 64)", + (doc_set_id,), + ) + doc_id = conn.execute("SELECT last_insert_rowid()").fetchone()[0] + conn.executemany( + "INSERT INTO sections (document_id, uri, anchor, heading, level, ordinal, " + "content_text, char_count) VALUES (?, ?, ?, ?, ?, ?, ?, ?)", + [ + ( + doc_id, + "library/json.html#top", + "top", + "json — JSON encoder and decoder", + 1, + 1, + "The json module exposes APIs for encoding and decoding JSON data.", + 64, + ), + ( + doc_id, + "library/json.html#json.dumps", + "json.dumps", + "json.dumps", + 2, + 2, + "Serialize obj to a JSON formatted str using a conversion table.", + 62, + ), + ], + ) + conn.commit() + conn.execute("INSERT INTO symbols_fts(symbols_fts) VALUES('rebuild')") + conn.execute("INSERT INTO sections_fts(sections_fts) VALUES('rebuild')") + conn.execute("INSERT INTO examples_fts(examples_fts) VALUES('rebuild')") + conn.commit() + conn.close() + return db_path + + +def _run_server(stdin_data: bytes, env: dict[str, str]) -> subprocess.CompletedProcess: + return subprocess.run( + [sys.executable, "-m", "mcp_server_python_docs", "serve"], + input=stdin_data, + capture_output=True, + timeout=15, + env=env, + ) + + +def _initialized_tool_call(name: str, arguments: dict, req_id: int = 2) -> bytes: + return ( + _make_request( + "initialize", + { + "protocolVersion": "2024-11-05", + "capabilities": {}, + "clientInfo": {"name": "test", "version": "0.1"}, + }, + req_id=1, + ) + + _make_notification("notifications/initialized") + + _make_request("tools/call", {"name": name, "arguments": arguments}, req_id=req_id) + ) + + +def _tool_structured_content(result: subprocess.CompletedProcess, req_id: int = 2) -> dict: + responses = _assert_protocol_on_stdout_only(result) + response = _find_response(responses, req_id) + assert response is not None, f"Missing tools/call response: {responses}" + assert "result" in response, response + assert 
response["result"].get("isError") is not True, response + return response["result"]["structuredContent"] + + +def _tool_error_text(result: subprocess.CompletedProcess, req_id: int = 2) -> str: + responses = _assert_protocol_on_stdout_only(result) + response = _find_response(responses, req_id) + assert response is not None, f"Missing tools/call response: {responses}" + assert response["result"].get("isError") is True, response + return "\n".join(item.get("text", "") for item in response["result"].get("content", [])) + + +def test_get_docs_cache_restart_and_corrupt_cache_fallback(tmp_path: Path): + """Exercise get_docs through real MCP stdio with isolated contentful cache.""" + env, cache_dir = _isolated_cache_env(tmp_path) + _create_contentful_json_index(cache_dir) + cache_path = cache_dir / "retrieved-docs-cache.sqlite3" + + full_page = _tool_structured_content( + _run_server( + _initialized_tool_call( + "get_docs", + {"slug": "library/json.html", "version": "3.13"}, + ), + env, + ) + ) + assert full_page["slug"] == "library/json.html" + assert full_page["anchor"] is None + assert "json module" in full_page["content"] + + with sqlite3.connect(cache_path) as conn: + rows = conn.execute( + "SELECT version, slug, anchor, max_chars, start_index, length(result_json) " + "FROM retrieved_docs_cache" + ).fetchall() + assert len(rows) == 1 + version, slug, anchor, max_chars, start_index, result_json_length = rows[0] + assert (version, slug, anchor, max_chars, start_index) == ( + "3.13", + "library/json.html", + "\x00mcp-python-docs:no-anchor\x00", + 8000, + 0, + ) + assert result_json_length > 0 + + restarted_page = _tool_structured_content( + _run_server( + _initialized_tool_call( + "get_docs", + {"slug": "library/json.html", "version": "3.13"}, + ), + env, + ) + ) + assert restarted_page == full_page + + section = _tool_structured_content( + _run_server( + _initialized_tool_call( + "get_docs", + { + "slug": "library/json.html", + "version": "3.13", + "anchor": 
"json.dumps", + }, + ), + env, + ) + ) + assert section["anchor"] == "json.dumps" + assert "Serialize obj" in section["content"] + + empty_anchor_error = _tool_error_text( + _run_server( + _initialized_tool_call( + "get_docs", + {"slug": "library/json.html", "version": "3.13", "anchor": ""}, + ), + env, + ) + ) + assert "Section '' not found" in empty_anchor_error + + cache_path.write_bytes(b"not a sqlite database") + after_corrupt_cache = _tool_structured_content( + _run_server( + _initialized_tool_call( + "get_docs", + {"slug": "library/json.html", "version": "3.13"}, + ), + env, + ) + ) + assert after_corrupt_cache == full_page diff --git a/tests/test_stdio_smoke.py b/tests/test_stdio_smoke.py index f2e7541..1b10563 100644 --- a/tests/test_stdio_smoke.py +++ b/tests/test_stdio_smoke.py @@ -16,6 +16,7 @@ import os import subprocess import sys +import time from pathlib import Path import pytest @@ -131,6 +132,36 @@ def _read_responses(stdout_data: bytes) -> list[dict]: return responses +def _jsonrpc_frames(stream_data: bytes) -> list[dict]: + """Return JSON-RPC objects parsed from a stream, ignoring log lines.""" + frames: list[dict] = [] + for line in stream_data.split(b"\n"): + line = line.strip() + if not line: + continue + try: + parsed = json.loads(line) + except json.JSONDecodeError: + continue + if isinstance(parsed, dict) and parsed.get("jsonrpc") == "2.0": + frames.append(parsed) + return frames + + +def _assert_protocol_on_stdout_only(result: subprocess.CompletedProcess) -> list[dict]: + """Assert JSON-RPC protocol frames are emitted only on stdout.""" + responses = _read_responses(result.stdout) + assert responses, f"No JSON-RPC responses on stdout; stderr was: {result.stderr!r}" + + for resp in responses: + assert "_raw" not in resp, f"Non-JSON stdout pollution: {resp.get('_raw')}" + assert resp.get("jsonrpc") == "2.0", f"Missing JSON-RPC frame on stdout: {resp}" + + stderr_frames = _jsonrpc_frames(result.stderr) + assert stderr_frames == [], 
f"JSON-RPC frames leaked to stderr: {stderr_frames}" + return responses + + def _find_response(responses: list[dict], req_id: int) -> dict | None: """Find a JSON-RPC response matching the given request id.""" for resp in responses: @@ -152,14 +183,23 @@ def _setup_test_env(self, tmp_path): def _run_server_with_input( self, stdin_data: bytes, timeout: int = 15, ) -> subprocess.CompletedProcess: - """Run the server subprocess with the given stdin and return the result.""" - return subprocess.run( + """Run the server subprocess with line-paced stdin and return the result.""" + proc = subprocess.Popen( [sys.executable, "-m", "mcp_server_python_docs", "serve"], - input=stdin_data, - capture_output=True, - timeout=timeout, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, env=self.env, ) + assert proc.stdin is not None + for index, line in enumerate(stdin_data.splitlines(keepends=True)): + proc.stdin.write(line) + proc.stdin.flush() + time.sleep(0.3 if index == 0 else 0.05) + proc.stdin.close() + proc.stdin = None + stdout, stderr = proc.communicate(timeout=timeout) + return subprocess.CompletedProcess(proc.args, proc.returncode, stdout, stderr) def test_server_lists_tools_no_stdout_pollution(self): """Server returns tool list and stdout has no non-JSON-RPC bytes.""" @@ -175,16 +215,11 @@ def test_server_lists_tools_no_stdout_pollution(self): result = self._run_server_with_input(stdin_data) - responses = _read_responses(result.stdout) - - # Every line on stdout must be valid JSON-RPC - for resp in responses: - assert "_raw" not in resp, f"Non-JSON stdout pollution: {resp.get('_raw')}" + responses = _assert_protocol_on_stdout_only(result) # Find the tools/list response tools_resp = _find_response(responses, 2) - if tools_resp is None: - pytest.skip("Server exited before returning tools/list response") + assert tools_resp is not None, f"Server exited before tools/list response: {responses}" assert "result" in tools_resp, f"tools/list error: 
{tools_resp}" tool_names = [t["name"] for t in tools_resp["result"].get("tools", [])] assert "search_docs" in tool_names @@ -209,16 +244,11 @@ def test_search_docs_round_trip(self): ) result = self._run_server_with_input(stdin_data) - responses = _read_responses(result.stdout) - - # No stdout pollution - for resp in responses: - assert "_raw" not in resp, f"Stdout pollution: {resp.get('_raw')}" + responses = _assert_protocol_on_stdout_only(result) # Find the tools/call response call_resp = _find_response(responses, 2) - if call_resp is None: - pytest.skip("Server exited before returning search_docs response") + assert call_resp is not None, f"Server exited before search_docs response: {responses}" assert "result" in call_resp, f"tools/call error: {call_resp}" content = call_resp["result"].get("content", []) assert len(content) >= 1, "search_docs returned no content" @@ -239,14 +269,10 @@ def test_list_versions_round_trip(self): ) result = self._run_server_with_input(stdin_data) - responses = _read_responses(result.stdout) - - for resp in responses: - assert "_raw" not in resp, f"Stdout pollution: {resp.get('_raw')}" + responses = _assert_protocol_on_stdout_only(result) call_resp = _find_response(responses, 2) - if call_resp is None: - pytest.skip("Server exited before returning list_versions response") + assert call_resp is not None, f"Server exited before list_versions response: {responses}" assert "result" in call_resp, f"tools/call error: {call_resp}" def test_all_stdout_is_valid_jsonrpc(self): From 694c2057ee4c5af2465439ad966dd65a02379879 Mon Sep 17 00:00:00 2001 From: Aymen Hammouda Date: Fri, 8 May 2026 23:08:46 +0200 Subject: [PATCH 8/8] fix: return retrievable slugs for symbol hits --- .../retrieval/ranker.py | 118 +++++++++++++++--- tests/test_services.py | 35 ++++++ 2 files changed, 137 insertions(+), 16 deletions(-) diff --git a/src/mcp_server_python_docs/retrieval/ranker.py b/src/mcp_server_python_docs/retrieval/ranker.py index b203277..e42c4a5 100644 
--- a/src/mcp_server_python_docs/retrieval/ranker.py +++ b/src/mcp_server_python_docs/retrieval/ranker.py @@ -16,6 +16,89 @@ logger = logging.getLogger(__name__) +def _page_uri(uri: str) -> str: + """Return the page part of an objects.inv URI.""" + return uri.split("#", 1)[0] + + +def _document_candidates(uri: str) -> tuple[str, ...]: + """Return possible document slugs for a symbol URI. + + ``objects.inv`` entries use HTML paths such as ``library/json.html``, + while Sphinx JSON content is ingested with extensionless slugs such as + ``library/json``. Prefer the exact URI first, then the extensionless form. + """ + page_uri = _page_uri(uri) + if page_uri.endswith(".html"): + return (page_uri, page_uri[:-5]) + return (page_uri,) + + +def _resolve_symbol_location( + conn: sqlite3.Connection, + row: sqlite3.Row, +) -> tuple[str, str]: + """Resolve a symbol row to a get_docs-compatible slug and anchor.""" + uri = str(row["uri"]) + fallback_slug = _page_uri(uri) + fallback_anchor = str(row["anchor"] or "") + + section_id = row["section_id"] + if section_id is not None: + section_row = conn.execute( + """ + SELECT d.slug, s.anchor + FROM sections s + JOIN documents d ON s.document_id = d.id + WHERE s.id = ? + LIMIT 1 + """, + (section_id,), + ).fetchone() + if section_row is not None: + return section_row["slug"], section_row["anchor"] or "" + + doc_row = None + document_id = row["document_id"] + if document_id is not None: + doc_row = conn.execute( + "SELECT id, slug FROM documents WHERE id = ? LIMIT 1", + (document_id,), + ).fetchone() + + if doc_row is None: + for candidate in _document_candidates(uri): + doc_row = conn.execute( + """ + SELECT id, slug + FROM documents + WHERE doc_set_id = ? AND (slug = ? OR uri = ?) 
+ LIMIT 1 + """, + (row["doc_set_id"], candidate, candidate), + ).fetchone() + if doc_row is not None: + break + + if doc_row is None: + return fallback_slug, fallback_anchor + + if fallback_anchor: + section_row = conn.execute( + """ + SELECT anchor + FROM sections + WHERE document_id = ? AND anchor = ? + LIMIT 1 + """, + (doc_row["id"], fallback_anchor), + ).fetchone() + if section_row is not None: + return doc_row["slug"], section_row["anchor"] or "" + + return doc_row["slug"], "" + + def _normalize_scores(hits: list[SymbolHit]) -> list[SymbolHit]: """Normalize BM25 scores to [0.1, 1.0] range. @@ -129,7 +212,8 @@ def search_symbols( try: cursor = conn.execute( """ - SELECT sym.id, sym.qualified_name, sym.symbol_type, sym.uri, + SELECT sym.id, sym.doc_set_id, sym.document_id, sym.section_id, + sym.qualified_name, sym.symbol_type, sym.uri, sym.anchor, sym.module, d.version, bm25(symbols_fts, 10.0, 1.0) as score, snippet(symbols_fts, 0, '**', '**', '...', 32) as snippet_text @@ -148,19 +232,19 @@ def search_symbols( logger.warning("FTS5 query failed for symbols: %r", match_expr) return [] - hits = [ - SymbolHit( + hits = [] + for row in rows: + slug, anchor = _resolve_symbol_location(conn, row) + hits.append(SymbolHit( uri=row["uri"], title=row["qualified_name"], kind=row["symbol_type"] or "symbol", snippet=row["snippet_text"] or "", score=row["score"], version=row["version"], - slug=row["uri"].split("#")[0] if "#" in row["uri"] else row["uri"], - anchor=row["anchor"] or "", - ) - for row in rows - ] + slug=slug, + anchor=anchor, + )) return _normalize_scores(hits) @@ -251,7 +335,8 @@ def lookup_symbols_exact( cursor = conn.execute( """ - SELECT s.qualified_name, s.symbol_type, s.uri, s.anchor, + SELECT s.doc_set_id, s.document_id, s.section_id, + s.qualified_name, s.symbol_type, s.uri, s.anchor, s.module, d.version FROM symbols s JOIN doc_sets d ON s.doc_set_id = d.id @@ -264,16 +349,17 @@ def lookup_symbols_exact( ) rows = cursor.fetchall() - return [ - 
SymbolHit( + hits = [] + for row in rows: + slug, anchor = _resolve_symbol_location(conn, row) + hits.append(SymbolHit( uri=row["uri"], title=row["qualified_name"], kind=row["symbol_type"] or "symbol", snippet="", score=1.0 if row["qualified_name"] == query else 0.8, version=row["version"], - slug=row["uri"].split("#")[0] if "#" in row["uri"] else row["uri"], - anchor=row["anchor"] or "", - ) - for row in rows - ] + slug=slug, + anchor=anchor, + )) + return hits diff --git a/tests/test_services.py b/tests/test_services.py index b084775..f563cdc 100644 --- a/tests/test_services.py +++ b/tests/test_services.py @@ -102,6 +102,41 @@ def test_search_symbol_fast_path(self, populated_with_content): assert len(result.hits) >= 1 assert result.hits[0].title == "asyncio.TaskGroup" + def test_symbol_hit_slug_is_retrievable_when_content_slug_is_extensionless( + self, + populated_db, + ): + db = populated_db + doc_set_id = db.execute("SELECT id FROM doc_sets LIMIT 1").fetchone()[0] + db.execute( + "INSERT INTO documents (doc_set_id, uri, slug, title, content_text, char_count) " + "VALUES (?, 'library/json', 'library/json', 'json', " + "'json page content', 17)", + (doc_set_id,), + ) + doc_id = db.execute("SELECT last_insert_rowid()").fetchone()[0] + db.execute( + "INSERT INTO sections (document_id, uri, anchor, heading, level, ordinal, " + "content_text, char_count) VALUES (?, 'library/json', '', 'Introduction', " + "1, 0, 'json.dumps guidance for command line tools.', 43)", + (doc_id,), + ) + db.execute( + "INSERT INTO symbols (doc_set_id, qualified_name, normalized_name, " + "symbol_type, uri, anchor, module) VALUES (?, 'json.dumps', " + "'json.dumps', 'function', 'library/json.html#json.dumps', " + "'json.dumps', 'json')", + (doc_set_id,), + ) + db.commit() + + hit = SearchService(db, {}).search("json.dumps", version="3.13", kind="symbol").hits[0] + + assert hit.slug == "library/json" + assert hit.anchor == "" + docs = ContentService(db).get_docs(hit.slug, hit.version, 
hit.anchor or None) + assert "json.dumps guidance" in docs.content + def test_search_no_results(self, populated_with_content): svc = SearchService(populated_with_content, {}) result = svc.search("nonexistent_xyz_symbol", kind="symbol")