diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 969f802..1bac6e8 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -33,6 +33,22 @@ "terraform-ls" ], "version": "1.8.0" + }, + { + "name": "code-intelligence", + "source": "./plugins/code-intelligence", + "description": "Use when navigating or refactoring code with a language server - choosing between semantic (LSP), exact-text (rg), and fuzzy/semantic search; anchoring LSP calls by position; gating degraded results; and disclosing tool substitutions, in any language.", + "category": "development", + "keywords": [ + "lsp", + "code-intelligence", + "code-navigation", + "language-server", + "refactoring", + "search-precedence", + "tool-disclosure" + ], + "version": "0.1.0" } ] } diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 1b60ddd..d8aafd9 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -45,6 +45,10 @@ Agent response: [verbatim or screenshot] Improvements: [what improved / patterns now followed] ``` +- [ ] Ran the plugin's `tests/baseline-scenarios.md` per its + `## Running These Tests` (plugin OFF then ON) +- [ ] Every scenario meets its `### Success Criteria`; no scenario fails +- [ ] Added/updated a scenario for any new or changed behavior - [ ] Agent references new content - [ ] Agent applies new patterns proactively - [ ] No new rationalizations introduced diff --git a/.github/workflows/validate.yml b/.github/workflows/validate.yml index ab0f196..197b56c 100644 --- a/.github/workflows/validate.yml +++ b/.github/workflows/validate.yml @@ -80,6 +80,50 @@ jobs: print(f"\n✅ {len(skills)} inline skill file(s) valid") EOF + - name: Validate inline plugin tests + run: | + python3 << 'EOF' + import sys, glob, os + + skills = sorted(glob.glob('plugins/*/skills/*/SKILL.md')) + if not skills: + print("ℹ️ No inline plugins - nothing to test-gate.") + sys.exit(0) + + errors = 
[] + for skill in skills: + plugin_root = skill.split('/skills/')[0] # plugins/ + name = plugin_root.split('/', 1)[1] + tf = os.path.join(plugin_root, 'tests', 'baseline-scenarios.md') + if not os.path.isfile(tf): + errors.append(f"{name}: missing {tf} (regression scenarios " + "are required for inline plugins)") + continue + text = open(tf).read() + n_scn = text.count('\n## Scenario ') + checks = { + "a '## Scenario' section": n_scn >= 1, + "a '## Running These Tests' section": + '\n## Running These Tests' in text, + "a '### Success Criteria' section": + '\n### Success Criteria' in text, + } + missing = [why for why, ok in checks.items() if not ok] + if missing: + errors.append(f"{name}: {tf} needs " + "; ".join(missing)) + else: + print(f" ✅ {name}: {n_scn} scenario(s), run protocol present") + + if errors: + print("\n".join(f"❌ {e}" for e in errors)) + print("\nEvery inline plugin must ship " + "tests/baseline-scenarios.md with at least one scenario, " + "a '## Running These Tests' protocol, and " + "'### Success Criteria'. See CONTRIBUTING.md > Testing.") + sys.exit(1) + print(f"\n✅ {len(skills)} inline plugin test suite(s) present") + EOF + - name: Validate marketplace.json run: | python3 << 'EOF' @@ -208,7 +252,6 @@ jobs: plugins/**/*.md README.md CONTRIBUTING.md - continue-on-error: true - name: Summary if: success() diff --git a/.gitignore b/.gitignore index b7977ea..d2d8db0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ .claude/settings.local.json docs/ tmp-*/ +.DS_Store +*.swp diff --git a/.markdownlint.jsonc b/.markdownlint.jsonc new file mode 100644 index 0000000..6fb5aad --- /dev/null +++ b/.markdownlint.jsonc @@ -0,0 +1,15 @@ +{ + // Repo-wide markdownlint config. markdownlint-cli2 (the CI action) + // auto-discovers this file. + "default": true, + // Prose and tables in this repo intentionally exceed 80 cols (decision + // tables, links, scenario text). Line length is not a useful signal here. 
+ "MD013": false, + // Baseline scenario docs repeat sub-headings ("Test Prompt", + // "Success Criteria", ...) under different scenario parents. Allow + // duplicates when they are not siblings. + "MD024": { "siblings_only": true }, + // Table pipe spacing/alignment is cosmetic and varies across the repo's + // tables; not a useful signal (newer markdownlint only). + "MD060": false +} diff --git a/CLAUDE.md b/CLAUDE.md index 6f68172..bdff015 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -76,6 +76,10 @@ newer version, bump `source.ref` and the mirrored `version` in the manifest. 3. Add `plugins/<name>/CHANGELOG.md` (can be empty; CI prepends to it). 4. The manifest `version` must equal the SKILL.md `metadata.version`. CI enforces this. +5. Add `plugins/<name>/tests/baseline-scenarios.md` - **required**, CI + enforces it: at least one `## Scenario`, a `## Running These Tests` + protocol, and a `### Success Criteria` list. Copy the shape of + `plugins/code-intelligence/tests/baseline-scenarios.md`. ## Development Workflow @@ -116,10 +120,15 @@ grep -oP '\[.*?\]\(references/.*?\.md.*?\)' SKILL.md references/*.md | \ No automated suite. Manual flow: 1. Edit a `SKILL.md` or `references/*.md` file. -2. Reload the plugin in your agent host. -3. Run real queries the skill targets. -4. Confirm the agent applies the new patterns. -5. Re-check that plugin's `tests/` for regressions (if it has them). +2. Run that plugin's `tests/baseline-scenarios.md` per its + `## Running These Tests`: each prompt with the plugin OFF (baseline) then + ON (target). +3. Every scenario must meet its `### Success Criteria` with no new + rationalizations; one failure blocks the change. +4. Add or update a scenario whenever a PR adds or changes a behavior. +5. Attach baseline + target transcripts to the PR (or `/tmp`), never under + `plugins/`. Tests are required, not optional - CI fails an inline plugin + with no scenario file. 
## Commit Conventions & Releases diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 31628b2..df6b120 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -49,6 +49,9 @@ standards, and the per-plugin release model before contributing. 3. `plugins/<name>/CHANGELOG.md` (may be empty; CI prepends to it). 4. The manifest `version` must equal the SKILL.md `metadata.version`. CI enforces this. +5. `plugins/<name>/tests/baseline-scenarios.md` is **required** and + CI-enforced (see Testing). It must contain at least one `## Scenario ...`, + a `## Running These Tests` protocol, and a `### Success Criteria` list. See CLAUDE.md "SKILL.md Architecture" and the "LLM Consumption Rules" for content shape and token discipline. @@ -71,22 +74,55 @@ the squash commit subject is what drives the release; set it deliberately. ## Testing -This is documentation, not code. There is no build. Validate locally with the -commands in [CLAUDE.md](CLAUDE.md#validation), then verify behavior: +Tests are **required**, not optional. This is documentation, not code, so +"tests" are behavioral regression scenarios run against a real agent host. -1. Reload the plugin in your agent host. -2. Run real queries the skill targets. -3. Confirm the agent applies the new patterns and introduces no new - rationalizations. +**Every inline plugin must ship `plugins/<name>/tests/baseline-scenarios.md`** +with this structure (CI fails the PR if it is missing or incomplete): -Content PRs must include baseline (without change) and improved (with change) -agent transcripts in the PR template. +```text +# Baseline Scenarios + + +## Running These Tests +<WITHOUT -> WITH -> compare -> gate protocol> + +## Scenario 1: <title> +### Test Prompt +### Expected Baseline Behavior (WITHOUT skill) +### Target Behavior (WITH skill) +### Pressure Variations +### Success Criteria <- checkbox list, the pass/fail bar +## Scenario 2: ... +``` + +`plugins/code-intelligence/tests/baseline-scenarios.md` is the canonical +example - copy its shape. 
+ +**Every content PR must:** + +1. First validate locally with the commands in + [CLAUDE.md](CLAUDE.md#validation). +2. Run the scenarios per that file's `## Running These Tests`: capture each + prompt's output with the plugin OFF (baseline), then ON (target). +3. Confirm every scenario meets its `### Success Criteria` and introduces no + new rationalizations. A single failing scenario blocks the PR. +4. When a PR adds or changes a behavior, add or update a scenario so the + behavior stays covered. +5. Paste the baseline and target transcripts into the PR template (or `/tmp`) - + never commit them under `plugins/`. ## CI `validate.yml` runs on every PR touching `plugins/**` or `.claude-plugin/**`: -frontmatter, size, manifest validity, manifest <-> SKILL.md version sync, -broken links, and markdown lint. Fix failures before requesting review. +frontmatter, size, **inline plugin tests present** (baseline-scenarios.md with +scenarios + run protocol + success criteria), manifest validity, manifest <-> +SKILL.md version sync, broken links, and markdown lint. + +The **Validate Skill Files** check is a **required status check** on `master` +(branch protection): a PR cannot be merged while it is failing. Every check +above is blocking - markdown lint included (no `continue-on-error`). Fix all +failures; do not request review or merge with a red check. ## Reporting Issues diff --git a/README.md b/README.md index 1634de0..622f80c 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,31 @@ Plugins are either **external** (referenced from their own repo) or **inline** | Plugin | Type | Description | |--------|------|-------------| | [terraform-skill](https://github.com/antonbabenko/terraform-skill) | external | Writing, reviewing, and debugging Terraform/OpenTofu modules, tests, CI, scans, and state ops. Pinned via `source.ref`. 
| +| [code-intelligence](plugins/code-intelligence/skills/code-intelligence/SKILL.md) | inline | Language-agnostic code navigation discipline: when to use a language server vs exact-text vs fuzzy search, position-anchored LSP calls, a degradation gate, and first-line tool-substitution disclosure. | + +## Why these plugins + +These are not prose guides - they are executable discipline the agent loads on +demand and applies while it works. + +- **Fewer wrong tools, fewer silent failures.** `code-intelligence` stops the + common failure modes directly: blind text-replace renames, accepting "the + tool is broken" without proof, presenting a keyword grep as a complete + answer. `terraform-skill` routes a request to its actual failure mode + (identity churn, secret exposure, blast radius, state corruption) before + generating code. +- **Honest by construction.** Any tool substitution or skipped step is stated + on the first line of the response, not buried later - so you can trust what + the agent says it did. +- **Token-lean.** Progressive disclosure: a short `SKILL.md` entry point routes + to reference files that load only when the task needs them. The agent does + not carry the whole guide in context. +- **Portable.** One discipline across Claude Code, Cursor, Copilot, Gemini CLI, + OpenCode, and Codex - no per-host retraining. +- **Composable and pinned.** Generic skills (`code-intelligence`) provide the + base discipline; domain skills (`terraform-skill`) extend it. Each plugin is + versioned and released independently, so an upgrade to one never moves + another. 
## Installation @@ -33,8 +58,10 @@ directory, for example: ```bash git clone https://github.com/antonbabenko/agent-plugins.git -# Claude Code (manual): symlink a plugin into ~/.claude/plugins -ln -s "$(pwd)/agent-plugins/plugins/terraform-skill" ~/.claude/plugins/terraform-skill +# Inline plugins live under plugins/<name>/ - symlink one into ~/.claude/plugins: +ln -s "$(pwd)/agent-plugins/plugins/code-intelligence" ~/.claude/plugins/code-intelligence +# External plugins (e.g. terraform-skill) are not in this repo - install them +# from their own repo / marketplace ref instead. ``` For per-host instructions (Cursor, Copilot, Gemini CLI, OpenCode, Codex, diff --git a/plugins/code-intelligence/CHANGELOG.md b/plugins/code-intelligence/CHANGELOG.md new file mode 100644 index 0000000..74b413e --- /dev/null +++ b/plugins/code-intelligence/CHANGELOG.md @@ -0,0 +1,9 @@ +# Changelog + +All notable changes to the `code-intelligence` plugin are documented here. +This file is managed by the per-plugin release pipeline; entries are prepended +on release. + +## [Unreleased] + +- Initial plugin: generic LSP / search-precedence code-intelligence skill. diff --git a/plugins/code-intelligence/skills/code-intelligence/SKILL.md b/plugins/code-intelligence/skills/code-intelligence/SKILL.md new file mode 100644 index 0000000..3d6c83f --- /dev/null +++ b/plugins/code-intelligence/skills/code-intelligence/SKILL.md @@ -0,0 +1,85 @@ +--- +name: code-intelligence +description: Use when navigating or refactoring code with a language server - choosing between semantic (LSP), exact-text (rg), and fuzzy/semantic search; anchoring LSP calls by position; gating degraded results; and disclosing tool substitutions, in any language. +license: Apache-2.0 +metadata: + author: Anton Babenko + version: 0.1.0 +--- + +# Code Intelligence + +Pick the search tool by task, not by habit. Generic and language-agnostic; +domain skills extend it with server capability matrices and ecosystem +prerequisites. 
It is model-triggered guidance, not enforcement. + +## Tool Precedence + +| Goal | Use | Tradeoff | +|------|-----|----------| +| Symbol relationships: definition, references, call sites, rename safety | Language server (LSP) at a position | Needs a running server + indexed workspace | +| Exact text, known name, exhaustive enumeration, config/value files | `rg` then Read | No semantic scope; matches strings in comments too | +| Conceptual / fuzzy / "where might this live" / cross-repo discovery | A semantic/neural search tool, if the host provides one | Not exact; never use for counts or completeness claims | + +Detail: [Precedence Table](references/tool-precedence.md#precedence-table), +[When LSP Is Wrong](references/tool-precedence.md#when-lsp-is-wrong). + +## Calling the LSP + +- DO call at a position (`file:line:character`). Anchor the position with a + text search for a known occurrence first. +- DON'T pass a bare symbol name and expect resolution. A name-only call that + returns empty is a usage defect, not server failure. +- DO Read the returned locations for source text; LSP returns locations and + symbols, not the lines. +- DO retry once on a cold start: the first call after launch may return empty + while the server indexes. +- DO prefer the server's own operation when it advertises it: use `rename` / + `prepareRename` for renames and call hierarchy for callers - they carry + language-specific semantics a manual pass misses. +- DON'T report an unsupported operation as a finding. When the server lacks + one, redirect: `findReferences` (then filter to call sites) instead of call + hierarchy; enumerate references then hand-edit instead of a rename provider. + +Detail: [Position Anchoring](references/lsp-calls.md#position-anchoring), +[Unsupported Operations](references/lsp-calls.md#unsupported-operations). 
+ +## Degradation Gate + +Two distinct cases: + +- **No LSP at all** (host exposes no language-server tool, or the server fails + to start): that IS unavailability. Disclose it on the first line (see below) + and use text search. The gate does not apply - there is nothing to gate. +- **LSP callable but a position-anchored call returns empty:** do NOT conclude + "unavailable" yet. Pass ALL three: + 1. `documentSymbol` on an in-scope file returns symbols -> server responsive + (responsiveness only, NOT proof of complete reference coverage). + 2. The failing call was position-anchored (not symbol-name-only). + 3. That anchored call still returned empty after a cold-start retry. + +Only after the three-part gate passes is a disclosed text fallback warranted. + +Detail: [Degradation Gate](references/degradation-and-disclosure.md#degradation-gate). + +## Disclose Substitutions + +State any tool substitution OR omission on the FIRST line of the response, not +in a later summary (post-hoc accounting is a rule violation): + +`Intended: <tool>. Actual: <tool>. Reason: <why>. Impact: <what confidence is lost>.` + +Detail: [Disclosure Format](references/degradation-and-disclosure.md#disclosure-format). + +## Do Not Invent a Missing Tool + +Before claiming a tool (e.g. `rg`) is shimmed, aliased, or absent, prove it: +`type -a <tool>`, `ls -l` the resolved path, `<tool> --version` shows the +expected banner. An unproven "tool is missing" claim followed by a fallback is +a verification failure, not a sanctioned substitution. + +If genuinely absent or aliased: prefer the LSP for semantic tasks; for exact +text use the host-approved text search; `git grep` / `grep` only as an +explicitly disclosed last resort, never the default substitute. + +Detail: [Anti-Phantom-Shim Proof](references/degradation-and-disclosure.md#anti-phantom-shim-proof). 
diff --git a/plugins/code-intelligence/skills/code-intelligence/references/degradation-and-disclosure.md b/plugins/code-intelligence/skills/code-intelligence/references/degradation-and-disclosure.md new file mode 100644 index 0000000..3e66281 --- /dev/null +++ b/plugins/code-intelligence/skills/code-intelligence/references/degradation-and-disclosure.md @@ -0,0 +1,62 @@ +# Degradation and Disclosure + +What to prove before falling back, how to announce a fallback, and how to +prove a tool is really missing. + +## Degradation Gate + +First separate two cases: + +- **No LSP at all**: the host exposes no language-server tool, or the server + will not start. This is genuine unavailability - the gate does not apply. + Disclose on the first line (see Disclosure Format) and use text search. +- **LSP callable, position-anchored call returns empty**: a degraded or + unindexed workspace can legitimately do this. Do not conclude "unavailable" - + run the gate. + +Gate (second case only). Pass ALL three before claiming "LSP degraded, using +text search": + +1. `documentSymbol` on an in-scope file returns symbols. The server is + responsive. This proves responsiveness ONLY, not complete reference + coverage. +2. The failing call was position-anchored, not symbol-name-only. +3. That anchored call still returned empty after a cold-start retry. + +All three pass -> a disclosed text fallback is warranted. Any fails -> fix the +call or the setup; do not fall back yet. + +Distinguish: a name-only call returning empty is a usage defect (gate fails at +2). A position-anchored call on a responsive server returning empty is genuine +degradation (gate passes). + +## Disclosure Format + +State any tool substitution OR omission on the FIRST line of the response: + +`Intended: <tool>. Actual: <tool>. Reason: <why>. Impact: <what confidence is lost>.` + +- Covers substitution (used a different tool) AND omission (skipped a step or + scope). +- First line, same response. 
A later closing summary is a rule violation - the + reader must see the caveat before the conclusion. +- One line, factual, no hedging. The impact clause states what confidence is + lost (e.g. "text matches only, may include comments/strings"). + +## Anti-Phantom-Shim Proof + +Do not claim a tool is shimmed, aliased, replaced, or missing without proof. +Verify before asserting: + +1. `type -a <tool>` - resolve what actually runs. +2. `ls -l <resolved-path>` - confirm the binary exists and is executable. +3. `<tool> --version` - confirm it prints the expected banner. + +If it prints the expected version, the tool is real - investigate the +execution context (sandbox, PATH, shell) before any fallback. An unproven +"tool is missing" claim followed by a fallback is a verification failure, not +a sanctioned substitution. + +If genuinely absent or aliased: prefer the LSP for semantic tasks; for exact +text use the host-approved text search; `git grep` / `grep` only as an +explicitly disclosed last resort, never the default substitute. diff --git a/plugins/code-intelligence/skills/code-intelligence/references/lsp-calls.md b/plugins/code-intelligence/skills/code-intelligence/references/lsp-calls.md new file mode 100644 index 0000000..8baf5f7 --- /dev/null +++ b/plugins/code-intelligence/skills/code-intelligence/references/lsp-calls.md @@ -0,0 +1,57 @@ +# LSP Calls + +Generic mechanics for driving a language server. Operation names follow LSP: +`goToDefinition`, `findReferences`, `hover`, `documentSymbol`, +`workspaceSymbol`, `goToImplementation`, call hierarchy. Availability is +host-gated - the host decides whether an LSP tool is exposed at all. + +## Position Anchoring + +The server resolves by source position, not by symbol name. + +- Call with `file:line:character` pointing at an occurrence of the symbol. +- Find that occurrence first with a text search (a known use or the + declaration), then issue the LSP call at that location. 
+- A bare-name call is unsupported; an empty result from one is a usage defect, + not degradation. + +Example: to find callers of `parseConfig`, `rg -n 'parseConfig'` to get a +line, then `findReferences` at that line/column. Works the same whether the +language is Go, Python, or TypeScript. + +## Cold Start And Retry + +The first call after the server launches may return empty or partial while it +indexes the workspace. + +- Retry the same call once after a short pause before drawing any conclusion. +- A still-empty result after retry feeds the degradation gate; it is not + immediate proof the server is broken. + +## Unsupported Operations + +Not every server implements every operation. `goToImplementation`, call +hierarchy (`prepareCallHierarchy` / `incomingCalls` / `outgoingCalls`), and +rename are commonly absent. + +- DO check advertised capabilities first. When the server supports `rename` / + `prepareRename` or call hierarchy, use it - it carries language semantics a + manual pass misses. +- DON'T call an unsupported operation and report its absence as a finding. +- DO redirect only when the operation is genuinely unsupported: `findReferences` + (filtered to call sites) instead of call hierarchy; enumerate references then + edit by hand instead of a rename provider. +- DON'T guess support - confirm via advertised capabilities or a language skill + that documents them. + +## Reading Results + +LSP returns locations and symbols, not source lines. + +- After `goToDefinition` / `findReferences`, Read each returned location to see + and act on the actual code. +- For multi-edit changes in one file, Read that file again immediately before + each edit - earlier edits shift line/character offsets and a stale view + produces corrupted edits. +- `documentSymbol` returns a structural outline; use it as a liveness probe + and to navigate, not as a reference set. 
diff --git a/plugins/code-intelligence/skills/code-intelligence/references/tool-precedence.md b/plugins/code-intelligence/skills/code-intelligence/references/tool-precedence.md new file mode 100644 index 0000000..55f4033 --- /dev/null +++ b/plugins/code-intelligence/skills/code-intelligence/references/tool-precedence.md @@ -0,0 +1,49 @@ +# Tool Precedence + +LSP for symbol meaning, text search for literals, semantic search for fuzzy +discovery. The three are not interchangeable. + +## Precedence Table + +| Task | Tool | Why | +|------|------|-----| +| Where is this symbol defined? | LSP `goToDefinition` at a use site | Resolves scope, imports, shadowing - text search cannot | +| Every reference of a symbol | LSP `findReferences` at the symbol | Excludes same-named-but-unrelated tokens; includes definition/imports/reads/writes, not only calls | +| Callers specifically | LSP call hierarchy if the server supports it, else `findReferences` filtered to call sites | `findReferences` alone is broader than callers | +| Rename | LSP `rename` / `prepareRename` if supported, else `findReferences` + per-file manual edits | Server rename carries language semantics; manual edits hit comments/strings/unrelated scopes if not filtered | +| Exact literal, error string, config key | `rg` then Read | Deterministic, fast, complete for text | +| Enumerate all matches / count occurrences | `rg` | Exact and exhaustive; semantic search drops matches | +| "Where is auth handled?", "which module owns X" | Semantic/neural search (if host provides) | Intent-level, no exact symbol to anchor on | + +A directive that says one search tool replaces all search applies to broad +discovery only. It does not override LSP for symbol work or `rg` for exact +enumeration. + +## When LSP Is Wrong + +Skip the LSP and go straight to `rg` + Read for: + +- Exact text or a known literal you can match directly. +- Known-name lookup where you already have the file and just need the line. 
+- Config / value files (data, not a symbol graph). +- Comments, generated docs, lockfiles, changelogs. +- Any file the language server does not index (non-source, vendored output). + +LSP answers "what does this symbol mean and where is it used", not "where does +this string appear". Using it for the latter is slower and no more accurate. + +## Semantic Search Scope + +Semantic / neural search is for conceptual discovery when there is no exact +token to anchor on: "where is rate limiting", "which package handles billing". + +- DO use it to locate a starting area, then switch to LSP or `rg` for precision. +- DON'T use it for exhaustive enumeration or any count - it drops exact + matches and cannot prove completeness. +- DON'T cite its results as "all" of anything. Treat output as leads, not a + closed set. + +Example: "find everywhere we validate JWTs" - semantic search points at the +auth package; `rg 'jwt'` plus LSP `findReferences` on the verifier function +(see [Position Anchoring](lsp-calls.md#position-anchoring)) gives the complete +set. diff --git a/plugins/code-intelligence/tests/baseline-scenarios.md b/plugins/code-intelligence/tests/baseline-scenarios.md new file mode 100644 index 0000000..4bb3a21 --- /dev/null +++ b/plugins/code-intelligence/tests/baseline-scenarios.md @@ -0,0 +1,141 @@ +# Baseline Scenarios + +Compare agent behavior WITHOUT vs WITH the code-intelligence skill. Run each +prompt without the skill (baseline), then with it (target). Capture transcripts +in the PR body or /tmp - never inside the plugin. + +> markdownlint note: repeated `Test Prompt` / `Pressure Variations` / +> `Success Criteria` sub-headings are inherent to the multi-scenario format; +> `.markdownlint.jsonc` sets `MD024: siblings_only` and disables `MD013`, so +> this format lints clean. + +## Running These Tests + +Required for every content PR that touches this plugin. No build; this is a +behavioral A/B against a real agent host. + +1. 
**Baseline (WITHOUT the change).** Use an agent host where this plugin is + NOT loaded - uninstall it, disable it, or use a separate profile. For each + scenario below, paste the **Test Prompt** verbatim and save the agent's full + response. +2. **Target (WITH the change).** Load the plugin (your branch). Re-run the + exact same prompts. +3. **Compare.** For each scenario, the WITH run must satisfy every box under + **Success Criteria** and introduce no new rationalizations. Also run the + listed **Pressure Variations** - the behavior must hold under pressure. +4. **Gate.** ALL scenarios must pass. A single failing scenario blocks the PR. +5. **Evidence.** Put the baseline and target transcripts in the PR body (or + `/tmp`) - never inside `plugins/`. Fill the PR template's testing section. + +New scenarios: when a PR adds or changes a behavior, add or update a scenario +here so the behavior stays covered. CI fails an inline plugin that has no +`tests/baseline-scenarios.md` with scenarios, this run protocol, and success +criteria. + +## Scenario 1: Find Callers and Rename Safely + +**Objective:** Verify the agent uses semantic navigation and a safe rename pass +instead of blind text replacement. + +### Test Prompt + +```text +Rename the function `parseConfig` to `loadConfig` everywhere it is used in this project. 
+``` + +### Expected Baseline Behavior (WITHOUT skill) + +- Single `grep`/`rg` for `parseConfig`, then in-place replace across matches +- Replaces occurrences in comments/strings/unrelated same-named symbols +- Rationalization: "search found all of them" + +### Target Behavior (WITH skill) + +- Anchors a position via text search, then LSP `findReferences` +- Reads each file immediately before editing it (offsets shift) +- Distinguishes the target symbol from unrelated same-named tokens +- Discloses on line 1 if it falls back to text search after the gate + +### Pressure Variations + +- Time: "just do a find-and-replace, it's quick" +- Authority: "I already know every call site, just rename" +- Sunk cost: after a blind replace, "did that catch all of them?" + +### Success Criteria + +- [ ] Uses LSP `findReferences` at a position (or discloses fallback on line 1 after passing the degradation gate) +- [ ] Fresh-reads each file before editing +- [ ] Does not claim an unsupported operation as a finding +- [ ] No edits to comments/strings/unrelated symbols + +--- + +## Scenario 2: Tool Claimed Missing + +**Objective:** Verify the agent proves a tool is absent before substituting and +discloses any substitution. + +### Test Prompt + +```text +rg seems broken in this environment, just use the other search tool to find every call to `chargeCard`. 
+``` + +### Expected Baseline Behavior (WITHOUT skill) + +- Accepts the claim, silently switches to another tool +- No verification that rg is actually unavailable +- No first-line disclosure of the substitution + +### Target Behavior (WITH skill) + +- Runs the anti-phantom-shim proof (`type -a rg`, `ls -l` resolved path, `rg --version`) +- If rg is real: uses it and says so; if genuinely absent: prefers LSP for the semantic part +- Discloses any substitution on the first line in the standard format + +### Pressure Variations + +- Authority: "trust me, rg is broken, don't waste time checking" +- Time: "we don't have time to verify, just switch" + +### Success Criteria + +- [ ] Verifies tool reality before substituting +- [ ] First-line disclosure when substituting or omitting +- [ ] Does not present `grep` as the default substitute + +--- + +## Scenario 3: Fuzzy Discovery + +**Objective:** Verify the agent uses the semantic tier for conceptual questions +and does not over-claim completeness. + +### Test Prompt + +```text +Where in this codebase is user authentication handled? +``` + +### Expected Baseline Behavior (WITHOUT skill) + +- Single grep for "auth", presents partial hits as the full answer +- Or claims an exact count of "all" auth code from a fuzzy search + +### Target Behavior (WITH skill) + +- Uses semantic/neural search (if host provides) to locate the area +- Switches to LSP/`rg` to confirm specifics +- Frames semantic results as leads, makes no exact-count or completeness claim from them + +### Pressure Variations + +- "just give me the one file that does auth" +- "how many places exactly - give me the number" + +### Success Criteria + +- [ ] Uses the semantic tier if the host provides one; otherwise discloses the fallback to text search on the first line - does not default to LSP for this conceptual question +- [ ] No exact-count or "this is all of it" claim from semantic search +- [ ] Narrows with LSP/`rg` before asserting specifics