diff --git a/.github/workflows/publish-npm-on-release.yml b/.github/workflows/publish-npm-on-release.yml index be62c13..023126a 100644 --- a/.github/workflows/publish-npm-on-release.yml +++ b/.github/workflows/publish-npm-on-release.yml @@ -8,9 +8,9 @@ on: workflow_dispatch: inputs: tag: - description: 'Tag to publish (e.g. v1.6.2)' + description: 'Tag to publish (e.g. v2.2.0)' required: true - default: 'v1.6.2' + default: 'v2.2.0' permissions: contents: read diff --git a/.release-please-manifest.json b/.release-please-manifest.json index d9246dd..a5d1cf2 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,3 +1,3 @@ { - ".": "1.10.0" + ".": "2.2.0" } diff --git a/CHANGELOG.md b/CHANGELOG.md index 6bfbaab..dbff7fc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,18 @@ # Changelog -## Unreleased +## [2.2.0](https://github.com/PatrickSys/codebase-context/compare/v1.10.0...v2.2.0) (2026-04-17) + +### Features + +* relaunch around a bounded conventions map and local-pattern discovery for `map + find` +* add explicit full-map resources while keeping the default first-call map bounded and action-oriented +* align public proof surfaces to the discovery-only benchmark posture (`pending_evidence`, `claimAllowed: false`) + +### Bug Fixes + +* make the packaged README tarball-safe by sending benchmark, demo, motivation, and contributing links to stable GitHub URLs +* quarantine historical v1.8.x launch-planning docs so they no longer read as current release guidance +* stop the built CLI entrypoint from eagerly importing MCP server runtime modules before CLI subcommand dispatch ## [1.10.0](https://github.com/PatrickSys/codebase-context/compare/v1.9.0...v1.10.0) (2026-04-14) diff --git a/README.md b/README.md index 9ec1d4c..92453b4 100644 --- a/README.md +++ b/README.md @@ -1,26 +1,26 @@ # codebase-context -## Stop paying for AI agents to explore your codebase. 
codebase-context pre-maps the architecture, conventions, and team memory so they don't have to. +## Map your team's conventions before your AI agent starts searching. -[![npm version](https://img.shields.io/npm/v/codebase-context)](https://www.npmjs.com/package/codebase-context) [![license](https://img.shields.io/npm/l/codebase-context)](./LICENSE) [![node](https://img.shields.io/node/v/codebase-context)](./package.json) +[![npm version](https://img.shields.io/npm/v/codebase-context)](https://www.npmjs.com/package/codebase-context) [![license](https://img.shields.io/npm/l/codebase-context)](./LICENSE) [![node](https://img.shields.io/node/v/codebase-context)](https://github.com/PatrickSys/codebase-context/blob/master/package.json) -You're tired of AI agents writing code that 'just works' but fits like a square peg in a round hole - not your conventions, not your architecture, not your repo. Even with well-curated instructions. You correct the agent, it doesn't remember. Next session, same mistakes. +You're tired of AI agents writing code that "just works" but still misses how your team actually builds things. They search too broadly, pick generic examples, and spend tokens exploring before they understand the shape of the repo. -This MCP gives agents _just enough_ context so they match _how_ your team codes, know _why_, and _remember_ every correction. +`codebase-context` changes the first step. Start with a bounded conventions map that shows the architecture, dominant patterns, and strongest local examples. Then search for the exact file, symbol, or workflow you need. Here's what codebase-context does: -**Finds the right context** - Search that doesn't just return code. Each result comes back with analyzed and quantified coding patterns and conventions, related team memories, file relationships, and quality indicators. 
It knows whether you're looking for a specific file, a concept, or how things wire together - and filters out the noise (test files, configs, old utilities) before the agent sees them. The agent gets curated context, not raw hits. +**Starts with a bounded conventions map** - The first call shows architecture layers, active patterns, golden files, and next calls without dumping vendored repos, fixtures, generated output, or oversized entrypoint lists into the default surface. -**Knows your conventions** - Detected from your code and git history, not only from rules you wrote. Seeks team consensus and direction by adoption percentages and trends (rising/declining), golden files. Tells the difference between code that's _common_ and code that's _current_ - what patterns the team is moving toward and what's being left behind. +**Finds the right local example** - Search does not just return code. Each result comes back with pattern signals, file relationships, and quality indicators so the agent can move from the map to the most relevant local example instead of wandering through raw hits. -**Remembers across sessions** - Decisions, failures, workarounds that look wrong but exist for a reason - the battle scars that aren't in the comments. Recorded once, surfaced automatically so the agent doesn't "clean up" something you spent a week getting right. Conventional git commits (`refactor:`, `migrate:`, `fix:`) auto-extract into memory with zero effort. Stale memories decay and get flagged instead of blindly trusted. +**Knows what is current** - Conventions are detected from your code and git history, not only from rules you wrote. The map distinguishes what is common from what is rising or declining, and points at the files that best represent the current direction. -**Checks before editing** - Before editing something, you get a decision card showing whether there's enough evidence to proceed. 
If a symbol has four callers and only two appear in your search results, the card shows that coverage gap. If coverage is low, `whatWouldHelp` lists the specific searches to run before you touch anything. +**Adds support signals when you need them** - Team memory and edit-readiness checks stay available, but as supporting context after the map and search have already narrowed the work. -One tool call returns all of it. Local-first - your code never leaves your machine by default. +Map first, search second, local-first throughout. Your code never leaves your machine by default. -See the [current discovery benchmark](./docs/benchmark.md) for the checked-in proof results and current gate truth. +See the [current discovery benchmark](https://github.com/PatrickSys/codebase-context/blob/master/docs/benchmark.md) for the checked-in discovery-only proof. The gate is still `pending_evidence`, and `claimAllowed` remains `false`. ### What it looks like @@ -38,7 +38,7 @@ This is the part most tools miss: what the team is doing now, what it is moving When the agent searches with edit intent, it gets a compact decision card: confidence, whether it's safe to proceed, which patterns apply, the best example, and which files are likely to be affected. -More CLI examples in [`docs/cli.md`](./docs/cli.md). Full walkthrough: [`docs/demo.md`](./docs/demo.md). +More CLI examples in [`docs/cli.md`](./docs/cli.md). Full walkthrough: [demo.md on GitHub](https://github.com/PatrickSys/codebase-context/blob/master/docs/demo.md). 
## Quick Start @@ -71,7 +71,7 @@ Full per-client setup, HTTP server instructions, and local build testing: [`docs ## First Use -Get a conventions map of your codebase before exploring or searching: +Get a conventions map of your codebase before exploring or editing: ```bash # See your codebase conventions — architecture layers, patterns, golden files @@ -85,20 +85,20 @@ Your AI agent uses the same map via the `codebase://context` MCP resource on fir ## Common First Commands -Three commands to get what usually takes a new developer weeks to piece together: +Three commands to understand a repo before you edit it: ```bash -# What tech stack, architecture, and file count? -npx -y codebase-context metadata +# What are the main conventions and best examples? +npx -y codebase-context map -# What does the team actually code like right now? -npx -y codebase-context patterns +# Then search for the local example you need +npx -y codebase-context search --query "auth middleware" -# What team decisions were made (and why)? -npx -y codebase-context memory list +# What patterns is the team actually using right now? +npx -y codebase-context patterns ``` -This is also what your AI agent consumes automatically via MCP tools; the CLI is the human-readable version. +This is also what your AI agent consumes automatically via MCP tools; the CLI is the human-readable version of the same map-plus-search flow. ## What it does @@ -224,14 +224,14 @@ These are the behaviors that make the most difference day-to-day. 
Copy, trim wha ## Links -- [Benchmark](./docs/benchmark.md) — current discovery suite results and gate truth -- [Demo](./docs/demo.md) — real CLI walkthrough +- [Benchmark](https://github.com/PatrickSys/codebase-context/blob/master/docs/benchmark.md) — current discovery suite results and gate truth +- [Demo](https://github.com/PatrickSys/codebase-context/blob/master/docs/demo.md) — real CLI walkthrough - [Client Setup](./docs/client-setup.md) — per-client config, HTTP setup, local build testing - [Capabilities Reference](./docs/capabilities.md) — tool API, retrieval pipeline, decision card schema - [CLI Gallery](./docs/cli.md) — formatted command output examples -- [Motivation](./MOTIVATION.md) — research and design rationale -- [Contributing](./CONTRIBUTING.md) — dev setup and eval harness -- [Changelog](./CHANGELOG.md) +- [Motivation](https://github.com/PatrickSys/codebase-context/blob/master/MOTIVATION.md) — research and design rationale +- [Contributing](https://github.com/PatrickSys/codebase-context/blob/master/CONTRIBUTING.md) — dev setup and eval harness +- [Changelog](https://github.com/PatrickSys/codebase-context/blob/master/CHANGELOG.md) ## License diff --git a/docs/benchmark.md b/docs/benchmark.md index 4b9a5e8..d48f807 100644 --- a/docs/benchmark.md +++ b/docs/benchmark.md @@ -37,7 +37,7 @@ From `results/gate-evaluation.json`: - `claimAllowed`: `false` - `totalTasks`: `24` - `averageUsefulness`: `0.75` -- `averageEstimatedTokens`: `1822.25` +- `averageEstimatedTokens`: `1827.0833` - `bestExampleUsefulnessRate`: `0.125` Repo-level outputs from the same rerun: @@ -53,8 +53,10 @@ The gate is intentionally still blocked. - The combined suite covers both public repos. - `claimAllowed` remains `false` because comparator evidence still does not support a benchmark-win claim. -- Two comparator lanes now return `status: "ok"`, but both are effectively near-empty on the frozen tasks and contribute `0` average usefulness. 
-- Three comparator lanes still fail setup entirely. +- Two comparator artifacts now return `status: "ok"`, but that does not yet close the gate: + - `raw Claude Code` still leaves the baseline `pending_evidence` because `averageFirstRelevantHit` is `null` + - `codebase-memory-mcp` now has real current metrics, but the gate still marks it `failed` on the frozen tolerance rule +- Three comparator lanes still fail setup entirely: `GrepAI`, `jCodeMunch`, and `CodeGraphContext`. ## Comparator Reality @@ -62,11 +64,11 @@ The current comparator artifact records incomplete comparator evidence, not benc | Comparator | Status | Current reason | | --- | --- | --- | -| `codebase-memory-mcp` | `ok` | Runs, but the checked-in artifact still averages `0` usefulness and `5` estimated tokens per task, so it does not yet contribute meaningful benchmark evidence | +| `codebase-memory-mcp` | comparator artifact: `ok`; gate: `failed` | Runs through the repaired graph-backed path and now records real metrics (`averageUsefulness: 0.1875`, `averageFirstRelevantHit: 1.2857`, `bestExampleUsefulnessRate: 0.5`), but the frozen gate still fails it on the required usefulness comparisons | | `jCodeMunch` | `setup_failed` | `MCP error -32000: Connection closed` | | `GrepAI` | `setup_failed` | Local Go binary and Ollama model path not present | | `CodeGraphContext` | `setup_failed` | `MCP error -32000: Connection closed` | -| `raw Claude Code` | `ok` | Runs, but the checked-in artifact still averages `0` usefulness and only `18.5` estimated tokens per task, so it does not yet contribute meaningful benchmark evidence | +| `raw Claude Code` | comparator artifact: `ok`; gate: `pending_evidence` | The explicit Haiku CLI runner now returns current metrics (`averageUsefulness: 0.0278`, `averageEstimatedTokens: 32.1667`), but the baseline still lacks `averageFirstRelevantHit`, so the gate keeps this lane as missing evidence | `CodeGraphContext` remains part of the frozen comparison frame. 
It is not omitted from the public story just because the lane still fails to start. @@ -74,9 +76,9 @@ The current comparator artifact records incomplete comparator evidence, not benc - This benchmark measures discovery usefulness and payload cost only. - It does not measure implementation correctness, patch quality, or end-to-end task completion. -- Comparator setup remains environment-sensitive, and the checked-in comparator outputs are still too weak to justify a claim. +- Comparator setup remains environment-sensitive, and the checked-in comparator outputs still do not satisfy the frozen claim gate. - The reranker cache is currently corrupted on this machine. During the proof rerun, search fell back to original ordering after `Protobuf parsing failed` while still completing the harness. -- `averageFirstRelevantHit` remains `null` in the current gate output because this compact response surface does not expose a comparable ranked-hit metric across the incomplete comparator set. +- `averageFirstRelevantHit` remains `null` in the current gate output, which is enough to keep the raw-Claude baseline in `pending_evidence`. ## What This Proof Can Support diff --git a/docs/capabilities.md b/docs/capabilities.md index 03db7ca..9085a07 100644 --- a/docs/capabilities.md +++ b/docs/capabilities.md @@ -1,6 +1,6 @@ # Capabilities Reference -Technical reference for what `codebase-context` ships today. For the user-facing overview, see [README.md](../README.md). +Technical reference for what `codebase-context` ships today. The public product posture is map first, find second: the bounded conventions map is the first-call surface, and search narrows to the right local example after that. For the user-facing overview, see [README.md](../README.md). 
## Transport Modes @@ -298,6 +298,7 @@ Reproducible evaluation is shipped as a CLI entrypoint backed by shared scoring/ - **Retrieval metrics:** Top-1 accuracy, Top-3 recall, spec contamination rate, and a gate pass/fail - **Discovery metrics:** usefulness score, payload bytes, estimated tokens, first relevant hit, and best-example usefulness - **Discovery gate:** discovery mode evaluates the frozen ship gate only when the full public suite and comparator metrics are available; missing comparator evidence is reported as pending, not silently treated as pass/fail +- **Current checked-in gate truth:** `results/gate-evaluation.json` remains `pending_evidence` with `claimAllowed: false`; the raw-Claude baseline still lacks `averageFirstRelevantHit`, `codebase-memory-mcp` still fails the frozen usefulness comparisons, and the remaining named lanes are still `setup_failed` - **Limits:** discovery mode is discovery-only, uses current shipped surfaces only, and does not claim implementation quality; named competitor runs remain a documented hybrid/manual lane rather than a built-in automated benchmark ## Limitations diff --git a/docs/cli.md b/docs/cli.md index d50814d..5025f65 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -1,8 +1,9 @@ # CLI Gallery (Human-readable) -`codebase-context` exposes its tools as a local CLI so humans can: +`codebase-context` exposes its tools as a local CLI so humans can follow the same map-first workflow the MCP server gives to agents: -- Get the conventions map before exploring or editing (`map`) +- Get the bounded conventions map before exploring or editing (`map`) +- Search for the right local example after the map narrows the repo shape - Onboard themselves onto an unfamiliar repo - Debug what the MCP server is doing - Use outputs in CI/scripts (via `--json`) @@ -50,7 +51,7 @@ CODEBASE_CONTEXT_ASCII=1 npx -y codebase-context patterns npx -y codebase-context map ``` -The conventions map — run this first on an unfamiliar repo. 
Shows architecture layers, active patterns with adoption rates and trend direction, and the golden files the team treats as the strongest examples. This is also what the MCP server delivers to AI agents via the `codebase://context` resource on first call. +The conventions map - run this first on an unfamiliar repo. It shows architecture layers, active patterns with adoption rates and trend direction, and the golden files the team treats as the strongest examples. This is also what the MCP server delivers to AI agents via the `codebase://context` resource on first call, before search narrows to a specific local example. Example output (truncated): diff --git a/docs/client-setup.md b/docs/client-setup.md index 23304e6..032b0cf 100644 --- a/docs/client-setup.md +++ b/docs/client-setup.md @@ -1,6 +1,6 @@ # Client Setup -Full setup instructions for each AI client. For the quick-start summary, see [README.md](../README.md). +Full setup instructions for each AI client. This guide is about transport and wiring, not a different product mode: each client gets the same bounded conventions map first and local-pattern discovery second. For the quick-start summary, see [README.md](../README.md). ## Transport modes diff --git a/docs/comparison-table.md b/docs/comparison-table.md index 2d30c95..aee2eaa 100644 --- a/docs/comparison-table.md +++ b/docs/comparison-table.md @@ -5,17 +5,19 @@ It is a setup-status table first, not a marketing scoreboard. | Comparator | Intended role in gate | Current status | Evidence summary | | --- | --- | --- | --- | -| `raw Claude Code` | Baseline for payload cost and at least one usefulness comparison | `setup_failed` | The local `claude` CLI baseline is unavailable in this environment, so the gate records missing baseline metrics. 
| +| `raw Claude Code` | Baseline for payload cost and at least one usefulness comparison | comparator artifact: `ok`; gate: `pending_evidence` | The Haiku-backed Claude CLI runner now returns current payloads, but the checked-in baseline still has `averageFirstRelevantHit: null`, so the gate still records missing baseline metrics. | | `GrepAI` | Named MCP comparator | `setup_failed` | Requires the GrepAI binary plus a local Ollama embedding setup that is not present in this proof environment. | | `jCodeMunch` | Named MCP comparator | `setup_failed` | The MCP server still closes on startup during the current rerun, so no comparable discovery metrics were produced. | -| `codebase-memory-mcp` | Named MCP comparator | `setup_failed` | The documented install path still depends on the external shell installer instead of a working local benchmark path. | +| `codebase-memory-mcp` | Named MCP comparator | comparator artifact: `ok`; gate: `failed` | The repaired graph-backed runner now produces real current metrics, but the frozen gate still fails this lane because `codebase-context` does not stay within tolerance on every required usefulness metric. | | `CodeGraphContext` | Graph-native comparator in the relaunch frame | `setup_failed` | The MCP server still closes on startup during the current rerun, so this lane remains missing evidence. | ## Reading This Table - `setup_failed` means the lane was attempted and did not reach a credible metric-producing state. +- `pending_evidence` in the gate means the lane is still missing one or more required metrics. +- `failed` in the gate means the lane has real metrics, but the frozen comparison rule still does not pass. - A missing metric is not treated as a win for `codebase-context`. -- The combined gate in `results/gate-evaluation.json` remains `pending_evidence` until these lanes produce real metrics. 
+- The combined gate in `results/gate-evaluation.json` remains `pending_evidence`, and `claimAllowed` stays `false`, until these lanes produce real metrics. ## Current codebase-context result @@ -25,8 +27,8 @@ For reference, the current combined discovery output across `angular-spotify` an | --- | ---: | | `totalTasks` | 24 | | `averageUsefulness` | 0.75 | -| `averagePayloadBytes` | 3613.6667 | -| `averageEstimatedTokens` | 903.7083 | +| `averagePayloadBytes` | 7306.4583 | +| `averageEstimatedTokens` | 1827.0833 | | `bestExampleUsefulnessRate` | 0.125 | | `gate.status` | `pending_evidence` | diff --git a/docs/demo.md b/docs/demo.md index 8c6285e..825b2ce 100644 --- a/docs/demo.md +++ b/docs/demo.md @@ -1,7 +1,7 @@ # Demo Script This walkthrough uses real CLI output captured against `repos/angular-spotify` during the Phase 10 proof rerun. -Run it from the repo root with `CODEBASE_ROOT` pointed at the frozen sample repo. +Run it from the repo root with `CODEBASE_ROOT` pointed at the frozen sample repo. The public flow is simple: start with the conventions map, then search for the local example you need. ## 1. Start With The Conventions Map @@ -75,7 +75,7 @@ Captured output excerpt: What this shows: -- Search remains the second step after the map. +- Search is the second step after the map, not a separate headline workflow. - `intent=edit` adds preflight evidence instead of forcing a separate call. - The response stays compact while still surfacing a best example and impact hints. @@ -117,4 +117,4 @@ What this shows: ## Caveats - These excerpts were captured from the current local proof run and will change if the frozen sample repo or index state changes. -- The benchmark gate is still `pending_evidence`, so this walkthrough demonstrates shipped behavior, not a released performance claim. +- The discovery benchmark gate is still `pending_evidence`, and `claimAllowed` remains `false`, so this walkthrough demonstrates shipped behavior, not a released performance claim. 
diff --git a/docs/registry-sync-checklist.md b/docs/registry-sync-checklist.md index 0a1d5a9..5cb5a59 100644 --- a/docs/registry-sync-checklist.md +++ b/docs/registry-sync-checklist.md @@ -1,6 +1,6 @@ # Registry Sync Checklist -Use this checklist before publishing any Phase 10-facing metadata or registry copy. +Use this checklist before publishing any relaunch-facing metadata or registry copy. The purpose is to keep the public surface aligned with the current proof bundle. ## Required Artifacts @@ -23,9 +23,12 @@ The purpose is to keep the public surface aligned with the current proof bundle. ## Required Truth Checks - If the gate is `pending_evidence`, say so explicitly. +- If the raw-Claude baseline still has `averageFirstRelevantHit: null`, say the baseline remains `pending_evidence`. +- If `codebase-memory-mcp` still reads comparator artifact `ok` but gate `failed`, say so explicitly. - If any comparator lane is `setup_failed`, say so explicitly. - Do not claim benchmark wins against `raw Claude Code`, `GrepAI`, `jCodeMunch`, `codebase-memory-mcp`, or `CodeGraphContext` without real metrics in `results/comparator-evidence.json`. - Do not claim implementation quality from this discovery benchmark. +- Do not turn this discovery-only proof into relaunch-release, risky-edit, or patch-quality proof language. - Do not omit the current reranker fallback limitation if the proof run still shows `Protobuf parsing failed`. ## Before Registry Or README Updates diff --git a/package.json b/package.json index 68780ad..20ad3c6 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "codebase-context", - "version": "1.10.0", - "description": "Pre-maps your codebase architecture, conventions, and team memory so AI agents navigate with precision instead of exploring. Local-first MCP server with AST-backed hybrid search.", + "version": "2.2.0", + "description": "Bounded conventions map and local-pattern discovery for AI coding agents. 
Local-first MCP server with AST-backed hybrid search.", "type": "module", "main": "./dist/lib.js", "types": "./dist/lib.d.ts", @@ -61,9 +61,9 @@ "mcp-server", "model-context-protocol", "codebase-context", - "code-intelligence", "code-patterns", "team-conventions", + "conventions-map", "pattern-detection", "semantic-search", "vector-search", @@ -76,8 +76,6 @@ "local-first", "privacy-first", "embeddings", - "preflight", - "evidence-scoring", "golden-files", "ai-coding", "ai-agents", @@ -93,9 +93,7 @@ "developer-tools", "static-analysis", "code-quality", - "team-memory", - "code-search", - "codebase-intelligence" + "code-search" ], "repository": { "type": "git", diff --git a/tests/proof-truth-surfaces.test.ts b/tests/proof-truth-surfaces.test.ts new file mode 100644 index 0000000..027366c --- /dev/null +++ b/tests/proof-truth-surfaces.test.ts @@ -0,0 +1,122 @@ +import { readFileSync } from 'node:fs'; +import { resolve } from 'node:path'; +import { describe, expect, it } from 'vitest'; + +const root = resolve(import.meta.dirname, '..'); + +type GateComparator = { + comparatorName: string; + status: string; +}; + +type GateArtifact = { + gate: { + status: string; + claimAllowed: boolean; + baseline: { + status: string; + missingMetrics?: string[]; + }; + comparators: GateComparator[]; + }; +}; + +type ComparatorArtifact = { + status: string; + averageFirstRelevantHit?: number | null; +}; + +type ComparatorEvidence = Record<string, ComparatorArtifact>; + +function readText(relPath: string): string { + return readFileSync(resolve(root, relPath), 'utf8'); +} + +function readJson<T>(relPath: string): T { + return JSON.parse(readText(relPath)) as T; +} + +function expectContains(text: string, snippets: string[]): void { + for (const snippet of snippets) { + expect(text).toContain(snippet); + } +} + +describe('proof truth surfaces', () => { + const gateArtifact = readJson<GateArtifact>('results/gate-evaluation.json'); + const comparatorEvidence = readJson<ComparatorEvidence>('results/comparator-evidence.json'); + const benchmarkDoc = 
readText('docs/benchmark.md'); + const comparisonDoc = readText('docs/comparison-table.md'); + const registryChecklist = readText('docs/registry-sync-checklist.md'); + const readme = readText('README.md'); + const capabilities = readText('docs/capabilities.md'); + const demo = readText('docs/demo.md'); + + it('reads the current blocked discovery artifacts', () => { + expect(gateArtifact.gate.status).toBeTruthy(); + expect(typeof gateArtifact.gate.claimAllowed).toBe('boolean'); + expect(comparatorEvidence['raw Claude Code']).toBeDefined(); + expect(comparatorEvidence['codebase-memory-mcp']).toBeDefined(); + }); + + it('keeps the proof docs aligned to the current gate artifact', () => { + expectContains(benchmarkDoc, [ + 'discovery benchmark', + `\`${gateArtifact.gate.status}\``, + '`claimAllowed`' + ]); + expectContains(comparisonDoc, [ + 'Comparator Summary', + `\`${gateArtifact.gate.status}\``, + `claimAllowed\` stays \`${String(gateArtifact.gate.claimAllowed)}\`` + ]); + expectContains(registryChecklist, [ + `claimAllowed: ${String(gateArtifact.gate.claimAllowed)}`, + gateArtifact.gate.status + ]); + }); + + it('documents the raw-Claude missing-metric caveat when the artifact still lacks ranked-hit evidence', () => { + const rawClaude = comparatorEvidence['raw Claude Code']; + const rawClaudeGate = gateArtifact.gate.baseline; + + if (rawClaude.averageFirstRelevantHit === null) { + expect(rawClaudeGate.status).toBe('pending_evidence'); + expect(rawClaudeGate.missingMetrics ?? 
[]).toContain('averageFirstRelevantHit'); + expect(benchmarkDoc).toMatch(/raw Claude Code[\s\S]*averageFirstRelevantHit[\s\S]*null/i); + expect(comparisonDoc).toMatch(/raw Claude Code[\s\S]*pending_evidence/i); + expect(registryChecklist).toContain('averageFirstRelevantHit: null'); + } + }); + + it('reflects comparator gate failures and setup failures from the checked-in evidence', () => { + const codebaseMemoryGate = gateArtifact.gate.comparators.find( + (comparator) => comparator.comparatorName === 'codebase-memory-mcp' + ); + + if (codebaseMemoryGate?.status === 'failed') { + expect(benchmarkDoc).toMatch(/codebase-memory-mcp[\s\S]*gate: `failed`/i); + expect(comparisonDoc).toMatch(/codebase-memory-mcp[\s\S]*gate: `failed`/i); + expect(registryChecklist).toContain('comparator artifact `ok` but gate `failed`'); + } + + const setupFailedComparators = Object.entries(comparatorEvidence) + .filter(([, artifact]) => artifact.status === 'setup_failed') + .map(([name]) => name); + + for (const comparatorName of setupFailedComparators) { + expect(benchmarkDoc).toContain(`\`${comparatorName}\``); + expect(comparisonDoc).toContain(`\`${comparatorName}\``); + } + }); + + it('keeps package-facing proof mentions secondary and discovery-only', () => { + expectContains(readme, ['discovery-only proof', gateArtifact.gate.status, 'claimAllowed']); + expectContains(capabilities, [ + 'discovery-only', + gateArtifact.gate.status, + `claimAllowed: ${String(gateArtifact.gate.claimAllowed)}` + ]); + expectContains(demo, [gateArtifact.gate.status, 'claimAllowed']); + }); +}); diff --git a/tests/release-truth-surfaces.test.ts b/tests/release-truth-surfaces.test.ts new file mode 100644 index 0000000..c855a6b --- /dev/null +++ b/tests/release-truth-surfaces.test.ts @@ -0,0 +1,102 @@ +import { readFileSync } from 'node:fs'; +import { resolve } from 'node:path'; +import { describe, expect, it } from 'vitest'; + +const root = resolve(import.meta.dirname, '..'); + +type PackageJson = { + 
version: string; + files?: string[]; +}; + +type ReleaseManifest = { + '.': string; +}; + +function readText(relPath: string): string { + return readFileSync(resolve(root, relPath), 'utf8'); +} + +function readJson<T>(relPath: string): T { + return JSON.parse(readText(relPath)) as T; +} + +function normalizePath(target: string): string { + return target.replace(/^\.\/+/, '').replace(/\\/g, '/'); +} + +function stripFragment(target: string): string { + return target.split('#', 1)[0] ?? target; +} + +function isStableExternalUrl(target: string): boolean { + return /^https?:\/\//.test(target); +} + +function isPackagedPath(target: string, packagedPaths: string[]): boolean { + const normalizedTarget = normalizePath(stripFragment(target)); + return packagedPaths.some((entry) => { + const normalizedEntry = normalizePath(entry); + return ( + normalizedTarget === normalizedEntry || normalizedTarget.startsWith(`${normalizedEntry}/`) + ); + }); +} + +function extractMarkdownLinks(markdown: string): string[] { + const matches = markdown.matchAll(/\[[^\]]+\]\(([^)]+)\)/g); + const links: string[] = []; + + for (const match of matches) { + if (match.index != null && match.index > 0 && markdown[match.index - 1] === '!') { + continue; + } + + const href = match[1]?.trim(); + if (href) { + links.push(href); + } + } + + return links; +} + +describe('release truth surfaces', () => { + const packageJson = readJson<PackageJson>('package.json'); + const releaseManifest = readJson<ReleaseManifest>('.release-please-manifest.json'); + const changelog = readText('CHANGELOG.md'); + const readme = readText('README.md'); + const workflow = readText('.github/workflows/publish-npm-on-release.yml'); + const benchmarkDoc = readText('docs/benchmark.md'); + const packagedPaths = ['README.md', 'LICENSE', ...(packageJson.files ?? 
[])]; + + it('keeps package metadata, release manifest, and changelog on 2.2.0', () => { + expect(packageJson.version).toBe('2.2.0'); + expect(releaseManifest['.']).toBe('2.2.0'); + expect(changelog).toContain('## [2.2.0]'); + expect(changelog).not.toContain('## Unreleased'); + }); + + it('limits packaged README links to shipped files or stable external URLs', () => { + const invalidLinks = extractMarkdownLinks(readme).filter((href) => { + if (href.startsWith('#')) { + return false; + } + + if (isStableExternalUrl(href)) { + return false; + } + + return !isPackagedPath(href, packagedPaths); + }); + + expect(invalidLinks).toEqual([]); + }); + + it('keeps the manual publish fallback and public proof references aligned to v2.2.0 truth', () => { + expect(workflow).toContain("description: 'Tag to publish (e.g. v2.2.0)'"); + expect(workflow).toContain("default: 'v2.2.0'"); + expect(benchmarkDoc).toContain('claimAllowed'); + expect(readme).toContain('pending_evidence'); + }); +});